
    .`iC                         d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZmZ d dlmZmZmZmZ d dlmZ d d	lmZmZmZmZ d d
lmZ  ee          Z G d de          Z G d de
          ZdS )    )AnyOptionalN)version)init_logger)
LinearBaseLinearMethodBase)QuantizationConfigQuantizationMethods)BITBLAS_OPTIMIZE_FEATURESBITBLAS_SUPPORTED_NUM_BITSBITBLAS_SUPPORTED_SYMMINIMUM_BITBLAS_VERSION)ParallelLMHead)BasevLLMParameterChannelQuantScaleParameterGroupQuantScaleParameterPackedvLLMParameter)set_weight_attrsc                       e Zd ZdZej        ZdZ eee          Z	dZ
dededz  dedz  dedz  d	edz  d
eddf fdZdefdZedefd            Zedeej                 fd            Zedefd            Zedee         fd            Ze	 ddeeef         dee         dedefd            Zedeeef         dd fd            Zededz  fd            Zdej        j        dede d         fdZ! xZ"S )BitBLASConfigzSConfig class for BitBLAS.

    Reference: https://github.com/Microsoft/BitBLAS
    int8	quantizedweight_bits
group_sizeNdesc_actis_symquant_methodlm_head_quantizedreturnc                 0   	 dd l }t          j        |j                  t          j        t                    k     rt          dt                     n0# t
          $ r#}|}	t          d|	 dt           d          |	d }~ww xY w|r|dk    rd}t                                                       || _	        || _
        || _        || _        || _        || _        | j	        t          vr t          d| j	         d	t           d
          | j        t           vr t          d| j         dt            d
          | j        }
t%          d                    d |
D                                 }|
| _        | j        | _        ||z  | _        || _        | j        | _        d S )Nr   z2bitblas version is wrong. Please install bitblas>=zQTrying to use the bitblas backend, but could not importwith the following error: zN. Please install bitblas through the following command: `pip install bitblas>=`Fz'BitBLAS does not support weight_bits = z. Only weight_bits = z are supported.z"BitBLAS does not support is_sym = z. Only sym =  c              3   B   K   | ]}|                                 |V  d S N)isdigit).0cs     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/bitblas.py	<genexpr>z)BitBLASConfig.__init__.<locals>.<genexpr>e   s/      "K"Kqyy{{"K1"K"K"K"K"K"K    )bitblasr   parse__version__r   ImportError
ValueErrorsuper__init__r   r   r   r   r   r   r   r   STORAGE_DTYPEintjoinstorage_dtypeTORCH_STORAGE_DTYPEstorage_torch_dtypepack_factornbits
ZEROS_MODE
zeros_mode)selfr   r   r   r   r   r   r,   ebitblas_import_exceptionr6   storage_nbit	__class__s               r)   r2   zBitBLASConfig.__init__-   s1   	,NNN}W011GM'5 5   "B(?B B    	, 	, 	,'($D-ED D *AD D D 
 ,,	,  	
b(( H&$ (!2 #===!$:J ! !&@! ! !   ;333ET[ E E3E E E  
 *277"K"Km"K"K"KKKLL*#'#; ';6 
 /s   AA 
B A;;B c                 X    d| j          d| j         d| j         d| j         d| j         dS )NzBitBLASConfig(weight_bits=z, group_size=z, desc_act=z	, is_sym=z, quant_method=))r   r   r   r   r   )r=   s    r)   __repr__zBitBLASConfig.__repr__p   sa    1)9 1 1/1 11 1 k1 1 !-	1 1 1	
r+   c                     dS )Nr,    clss    r)   get_namezBitBLASConfig.get_namey   s    yr+   c                 2    t           j        t           j        gS r%   )torchhalfbfloat16rG   s    r)   get_supported_act_dtypesz&BitBLASConfig.get_supported_act_dtypes}   s    
EN++r+   c                     dS )NF   rF   rG   s    r)   get_min_capabilityz BitBLASConfig.get_min_capability   s	     rr+   c                     dgS )Nzquantize_config.jsonrF   rG   s    r)   get_config_filenamesz"BitBLASConfig.get_config_filenames   s    &''r+   configkeysdefaultc                 ,    |D ]}|| v r
| |         c S |S )z1Get a value from the model's quantization config.rF   )rT   rU   rV   keys       r)   get_from_keyszBitBLASConfig.get_from_keys   s6    
  	# 	#Cf}}c{""" r+   c                 @   |                      |dg          }|                      |dgd          }|                      |dgd          }|                      |dgd          }|                      |dg          }|                     |dgd	          } | ||||||          S )
Nbitsr   r"   r   Fsymr   lm_head)rV   )rY   get_from_keys_or)rH   rT   r   r   r   r   r   r   s           r)   from_configzBitBLASConfig.from_config   s    ''99&&v~rBB
$$Vj\5AA""6E7E::((.1ABB00)e0TTsXv|EV
 
 	
r+   c                 \   |                     d          dk    p|                     dd          }|d u p|dk    p|dk    }|rj|rhd                    |                                 |                                           }t                              |           |                                 S d S )Ncheckpoint_formatr,   is_bitblas_formatFgptqz6The model is serialized in {} format. Using {} kernel.)getformatrI   loggerinfo)rH   hf_quant_cfg
user_quantrb   is_valid_user_quantmsgs         r)   override_quantization_methodz*BitBLASConfig.override_quantization_method   s     ),,
 
 G&**+>FF 	
 $Q*"6Q*	:Q 	  	"!4 	"JQQ C KK<<>>!tr+   layerprefixBitBLASLinearMethodc                     t          |t                    st          |t                    r| j        rt	          |           S d S r%   )
isinstancer   r   r   ro   )r=   rm   rn   s      r)   get_quant_methodzBitBLASConfig.get_quant_method   sH     eZ(( 	-un--	-262H	- 't,,,tr+   r%   )#__name__
__module____qualname____doc__rK   float16TORCH_DTYPEr3   getattrr7   r;   r4   boolstrr2   rD   classmethodr
   rI   listdtyperN   rQ   rS   staticmethoddictr   rY   r_   rl   nnModuler   rr   __classcell__)rA   s   @r)   r   r       s        
 -KM!'%77 JA*A* $JA* +	A*
 tA* DjA*  A* 
A* A* A* A* A* A*F
# 
 
 
 
 ,    [ ,ek): , , , [, 3    [ (T#Y ( ( ( [( @D S#X&*3i:=	   \ 	
c3h 	
O 	
 	
 	
 [	
 	t	#   [,X_.1	'	(       r+   r   c                      e Zd ZdZeZdZej        dej	        dej
        dej        dej        diZdefdZd	ej        j        d
edee         dededej        ddfdZd	ej        j        d
edee         dededej        fdZ	 ddZd Z	 dd	ej        j        dej        dej        dz  dej        fdZdededej        fdZdS )ro   zaLinear method for BitBLAS.

    Args:
        quant_config: The BitBLAS quantization config.
    Tfloat32rw   rM   r   quant_configc                     || _         d S r%   )r   )r=   r   s     r)   r2   zBitBLASLinearMethod.__init__   s    (r+   rm   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtyper   Nc           	         ~~|d         }|| j                                         vrt          d|           | j         j        }	|	d}	t	          |          }
|	dk    r||	z  dk    rt          d| d|	 d          |                     ||
|| j        d	d
| j         j                   t          t          j
        | j                                        d| j         j        d	          ddd| j         j        | j        j        r| j                                        d         nd|          }|	dk    rdn||	z  }t          j
        |
|d|          |d}|dk    rt!          dddi|}nt#          dddd|}| j         j        dk    rNt          t          j
        ||
| j         j        z  d| j         j        d	          ddd| j         j        |          }nBt'          t          j
        |
|d|          |          }t)          ||dk    rdnddd           |                    d|           |                    d|           |                    d|           dS )a  Creates quantized weights for use in linear operations.

        The function initializes and returns a dictionary containing quantized
        weights, scales, and zeros
        for performing quantized matrix multiplication operations.

        Args:
            input_size_per_partition: The size of the input partition.
            output_partition_sizes: List of output partition sizes.
            input_size: The total size of the input (unused).
            output_size: The total size of the output (unused).
            params_dtype:
                The data type of the parameters (expected to be torch.float16).

        Returns:
            A dictionary containing the quantized weights ('qweight'),
            scales ('scales'), and zeros ('zeros').

        Raises:
            ValueError: If `params_dtype` is not `torch.float16` or if the input
                size per partition is not divisible by the group size
                in `quant_config`.
        weight_loaderz3Parameter data type must be torch.float16, but got Nr"   r   zInput size per partition (z#) must be divisible by group size (z).Fnt)r   enable_tuningbiaslayoutr[   cuda)devicer~   requires_grad   )data	input_dim
output_dim
packed_dimpacked_factorbitblas_tile_sizer   )r   r~   )r   r   r   )r   r   r   )r   r   r   r   r   r   )r   )r   r   qweightscaleszerosrF   )r   rN   r0   r   sum_configure_bitblas_matmulENABLE_TUNINGr   r   rK   emptybitblas_matmulretrieve_weight_shaper8   r9   propagate_br   r   r<   r   r   register_parameter)r=   rm   r   r   r   r   r   extra_weight_attrsr   r   output_size_per_partitionr   input_groupsweight_scale_argsr   r   s                   r)   create_weights_gptqz'BitBLASLinearMethod.create_weights_gptq   sY   B *?;t0IIKKKKTlTT   &1
J$'(>$?$?! 8: E J J>-E > >/9> > >   	&&$%%,". 	' 	
 	
 	
 &#99;;';#	   +7 &2#99;;B??'!
 
 
( '",,qq2Jj2X K)"	   +
 
 1/RR1R@QRRFF-  -> F ';66'[ -1B1NN!+?"'   "/;+  EE  &- !&	   ,  E )5):):"#    	  G444  6222  %00000r+   c                     | j         j        dk    r | j        ||||||fi |S t          d| j         j                   Nrc   Unsupported quant_method )r   r   r   r0   )r=   rm   r   r   r   r   r   r   s           r)   create_weightsz"BitBLASLinearMethod.create_weightsg  sr     )V33+4+(&  %   LD,=,JLL  r+   c	                    ddl m}	 | j        |         }
d}d}| j        j        }| j        j        }| j        j        dk    rd}d}d| }| j        j        rd}d| }nt          d| j        j                    |	|||
|||
d	k    rd
n|
| j        j	        ||||||          }| 
                    ||          | _        d S )Nr   )MatmulConfigFrc   Tuintr4   r   r   int32)NKA_dtypeW_dtype	out_dtypeaccum_dtyper6   with_scaling
with_zerosr   	with_biasr   r<   )r,   r   BITBLAS_DTYPESr   r   r<   r   r   r0   r3   _get_or_create_bitblas_operatorr   )r=   
infeaturesoutfeaturesr   r   r   r   r[   r   r   bitblas_dtyper   r   r   r<   r   matmul_configs                    r)   r   z-BitBLASLinearMethod._configure_bitblas_matmul  s#    	)(((((+L9
&1
&1
)V33LJ#TmmG ' '"
&,,LD,=,JLL   %!#0F#:#:+9%!!!
 
 
 #BB=
 
r+   c                    ddl m}m} ddlm}m}  |            } |            }|                                dk    r|                    ||           |                    |          }	|	 |||d          }	|rd| d}
t          
                    |
           |	                    d	           |                    ||	           |                    ||           d| d
}t          
                    |           nAd| d}t          
                    |           n d| d}t          
                    |           |	S )Nr   )Matmulauto_detect_nvidia_target)get_database_pathglobal_operator_cacheF)targetr   zBitBLAS Operator z is tuning ...   )topkz tuned and saved to database.z	 created.z  found in global_operator_cache.)r,   r   r   bitblas.cacher   r   sizeload_from_databaserd   rf   rg   hardware_aware_finetuneaddsave_into_database)r=   rT   r   r   r   r   r   BITBLAS_DATABASE_PATHBITBLAS_TARGETr   TUNING_MESSAGETUNED_MESSAGE_messages                r)   r   z3BitBLASLinearMethod._get_or_create_bitblas_operator  s   ========JJJJJJJJ 1 1 3 32244 %%''1,,!44%~   /226::!#VF>QVWWWN &!KV!K!K!KN+++66B6???%))&.AAA%88)>   NMMM  M****@v@@@H%%%%S6SSSHKK!!!r+   xr   c                 z   |j         }|j        }|j        }|                    d|j        d                   }| j        j        r|                     |||          }n|                     ||||          }|                    |j        d d         |j        d         fz             }	||	                    |           |	S )Nr"   r   )	r   r   r   viewshaper   r   r   add_)
r=   rm   r   r   r   r   qzerosx_2d	output_2doutputs
             r)   
apply_gptqzBitBLASLinearMethod.apply_gptq  s     -vvb!'"+&&# 	K++D'6BBII++D'66JJI	0B/D DEEKKr+   argskwargsc                 t    | j         j        dk    r | j        |i |S t          d| j         j                   r   )r   r   r   r0   )r=   r   r   s      r)   applyzBitBLASLinearMethod.apply  sO    
 )V33"4?D3F333LD,=,JLL  r+   )rw   r%   )rs   rt   ru   rv   r   OPT_FEATURESr   rK   r   rw   rM   rL   r   r   r   r2   r   r   r4   r}   r~   r   r   r   r   Tensorr   r   r   rF   r+   r)   ro   ro      s         -LMyy

I
FN)] ) ) ) )L1xL1 #&L1 !%S		L1
 L1 L1 kL1 
L1 L1 L1 L1\x #& !%S		
   k   D 0
 0
 0
 0
d     L %)	 x < lT!	
 
   0

 
 
	
 
 
 
 
 
r+   ro   ) typingr   r   rK   	packagingr   vllm.loggerr   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr	   r
   ;vllm.model_executor.layers.quantization.utils.bitblas_utilsr   r   r   r   3vllm.model_executor.layers.vocab_parallel_embeddingr   vllm.model_executor.parameterr   r   r   r   vllm.model_executor.utilsr   rs   rf   r   ro   rF   r+   r)   <module>r      s   !                      # # # # # # J J J J J J J J                   O N N N N N            7 6 6 6 6 6	X		^ ^ ^ ^ ^& ^ ^ ^Bu u u u u* u u u u ur+   