
    .`i9                     f   d dl Z d dl mZ d dlmZ d dlmZmZmZ d dlZd dl	m
Z d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZmZmZm Z m!Z! d dl"m#Z# d dl$m%Z% erd dl&m'Z' d dl(m)Z) ne*Z' ee+          Z, G d de          Z- G d de          Z. G d de          Z/dS )    N)Enum)Fraction)TYPE_CHECKINGAnyUnion)_TYPES)	Parameter)_custom_ops)init_logger)FusedMoE)LinearMethodBase)QuantizationConfigQuantizeMethodBase)get_linear_quant_method)ChannelQuantScaleParameterGroupQuantScaleParameterPackedColumnParameterPackedvLLMParameterRowvLLMParameter)get_safetensors_params_metadata)
is_list_of)QuantizationMethods)WeightsMapperc                       e Zd ZdZ	 	 	 ddededededeeeeeez  f         f         d	ed
ee         dz  deddf fdZ	defdZ
edefd            Zedeej                 fd            Zedefd            Zedee         fd            Zedeeef         dd fd            Zdej        j        deded         dz  fdZd dZd!dededz  fdZ xZS )"
GPTQConfigzLConfig class for GPTQ.

    Reference: https://arxiv.org/abs/2210.17323
     Nweight_bits
group_sizedesc_actlm_head_quantizeddynamicautoround_versionmodules_in_block_to_quantizecheckpoint_formatreturnc	                 |   t                                                       || _        || _        || _        || _        || _        t          d| j                  | _        | j        dvrt          d| j         d          | j        dk    rt                              d           |pg | _        || _        || _        d S )N    )            zOCurrently, only 2/3/4/8-bit weight quantization is supported for GPTQ, but got z bits.r*   zfCurrently, the 4-bit gptq_gemm kernel for GPTQ is buggy. Please switch to gptq_marlin or gptq_bitblas.)super__init__r!   r   r   r   r    r   pack_factor
ValueErrorloggerwarning_oncer#   r"   r$   )
selfr   r   r   r    r!   r"   r#   r$   	__class__s
            /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/gptq.pyr-   zGPTQConfig.__init__1   s    D 	&$ !2#B(899<//H/3/?H H H   q  @  
 -I,NB) "3
 "3    c                 x    d| j          d| j         d| j         d| j         d| j         d| j         d| j         dS )	NzGPTQConfig(weight_bits=z, group_size=z, desc_act=z), lm_head_quantized=z
, dynamic=z, modules_in_block_to_quantize=z), checkpoint_format=))r   r   r   r    r!   r#   r$   )r2   s    r4   __repr__zGPTQConfig.__repr__r   s    ;d&6 ; ;/; ;; ; "&!7; ; |	; ;
 -1,M; ; "&!7; ; ;	
r5   c                     dS )Ngptq clss    r4   get_namezGPTQConfig.get_name}   s    vr5   c                     t           j        gS N)torchhalfr<   s    r4   get_supported_act_dtypesz#GPTQConfig.get_supported_act_dtypes   s    
|r5   c                     dS )N<   r;   r<   s    r4   get_min_capabilityzGPTQConfig.get_min_capability   s	     rr5   c                     dgS )Nzquantize_config.jsonr;   r<   s    r4   get_config_filenameszGPTQConfig.get_config_filenames   s    &''r5   configc           
         |                      |dgi           }|i n|}|                     |dg          }|                     |dg          }|                     |dg          }|                      |dgd          }|                      |dgd	          }|                      |d
gd           }|                      |dgd	          }	 | ||||||||	          S )Nr!   )defaultbitsr   r   lm_headFr"   r   r#   r$   )get_from_keys_orget_from_keys)
r=   rI   r!   r   r   r   r    r"   r#   r$   s
             r4   from_configzGPTQConfig.from_config   s1   &&v	{B&GG""W''99&&v~>>
$$Vj\::00)e0TT00()2 1 
 
 (+';';34d (< (
 (
$  00()2 1 
 
 s(	
 	
 		
r5   layerprefix)GPTQLinearMethodr   c                     t          |t                    rAddlm} d| j        | j        ddd}|                    |                              ||          S t          | ||t                    S )N   )MoeWNA16Configr:   TF)quant_methodrL   r   symrM   )

isinstancer   	moe_wna16rV   r   r   rP   get_quant_methodr   rS   )r2   rQ   rR   rV   rI   s        r4   r[   zGPTQConfig.get_quant_method   s     eX&& 	V111111 !'("o  F "--f55FFufUUU&tUF<LMMMr5   hf_to_vllm_mapperr   c                 V    | j         !|                    | j                   | _         d S d S r@   )r#   
apply_list)r2   r\   s     r4   apply_vllm_mapperzGPTQConfig.apply_vllm_mapper   s7    ,80A0L0L11 1D--- 98r5   
model_namerevisionc                 N   | j         r2t          | j         t                    rd | j         D             | _         d S t          j        t          j        t          j        gt          ||          }fd|                                D             }t          |          | _         d S )Nc                     g | ]	}|D ]}|
S r;   r;   ).0sublistitems      r4   
<listcomp>z2GPTQConfig.maybe_update_config.<locals>.<listcomp>   s?     5 5 5 '5 5  5 5 5 5r5   )ra   c                     h | ]H\  }}|                     d d          xr+t                   v,|                    dd          d         IS )dtypeN.rU   r   )get_SAFETENSORS_TO_TORCH_DTYPErsplit)rd   
param_nameinfori   unquant_dtypess      r4   	<setcomp>z1GPTQConfig.maybe_update_config.<locals>.<setcomp>   sl     "
 "
 "
 
D'4000"
 ,E2.HH c1%%a( IHHr5   )	r#   r   listrA   float16bfloat16float32r   items)r2   r`   ra   metadataquant_layersri   rp   s        @@r4   maybe_update_configzGPTQConfig.maybe_update_config   s    , 		$;TBB 5 5#'#D5 5 51
 F-G2:QQQ"
 "
 "
 "
 "
$,NN$4$4"
 "
 "
 -1,>,>)))r5   )r   Nr   )r\   r   r@   )__name__
__module____qualname____doc__intbooldictstrrr   r-   r8   classmethodr   r>   rA   ri   rC   rF   rH   r   rP   nnModuler   r[   r_   ry   __classcell__)r3   s   @r4   r   r   +   sJ         "$9=!#?3 ?3?3 ?3 	?3
  ?3 c4S4Z001?3 ?3 '+3i$&6?3 ?3 
?3 ?3 ?3 ?3 ?3 ?3B	
# 	
 	
 	
 	
 ,    [ ek):    [ 3    [ (T#Y ( ( ( [( 
c3h 
L 
 
 
 [
8NX_N.1N	7	84	?N N N N&   ? ?c ?S4Z ? ? ? ? ? ? ? ?r5   r   c                   h    e Zd Z ej                    Z ej                    Z ej                    ZdS )ExllamaStateN)rz   r{   r|   enumautoUNUSEDUNINITIALIZEDREADYr;   r5   r4   r   r      s5        TY[[FDIKKMDIKKEEEr5   r   c                       e Zd ZdZdefdZdej        j        de	de
e	         de	de	d	ej        fd
Zdej        j        ddfdZ	 ddej        j        dej        dej        dz  dej        fdZdS )rS   z[Linear method for GPTQ.

    Args:
        quant_config: The GPTQ quantization config.
    quant_configc                 4    || _         |j        dk    | _        d S )Ngptq_v2)r   r$   use_v2_format)r2   r   s     r4   r-   zGPTQLinearMethod.__init__   s"    ( *;yHr5   rQ   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                     ~|                     d          }| j        j        z  dk    rt          d          t	          |          }	|	 j        j        j        z  dk    rt          d           j        j        dk    r j        j        }
n|}
t          j        }||
z  }d }||k    r0 j        j        dk    r  j        j	        rt          j
        }n||
z  }d}t          t          j        | j        j        z  |	t          j                  ddd j        j        |          }t          t          j         fd	t#          |          D             t          j                  d|
          }t          j        ||	 j        j        z  t          j                  |d}t          j        ||	|          |d}|)t%          dddi|}t'          ddd j        j        d|}n*t)          dddd|}t          dddd j        j        d|}|                    d|           |                    d|           |                    d|           |                    d|           ||_        d S )Nweight_loaderr   ztThe input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.zuThe output size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.)ri   rU   )data	input_dim
output_dim
packed_dimpacked_factorr   c                 .    g | ]}|j         j        z  S r;   )r   r   )rd   ir2   s     r4   rg   z3GPTQLinearMethod.create_weights.<locals>.<listcomp>*  s4        *55  r5   )r   r   r   )r   r   r   )r   r   r   )r   r   )r   r   r   r   qweightg_idxqzerosscalesr;   )rk   r   r   r/   sumr.   	numeratorr   r   r   r   r   rA   emptyint32r   tensorranger   r   r   register_parameterexllama_state)r2   rQ   r   r   r   r   r   extra_weight_attrsr   output_size_per_partitionr   r   scale_and_zero_sizescale_and_zero_input_dimr   r   qzeros_argsweight_scale_argsr   r   s   `                   r4   create_weightszGPTQLinearMethod.create_weights   sY    *..??#d&7&BBaGG(  
 %((>$?$?!$t'8'D'NNRSSS(   '2--*5JJ#J$2(J6#' 222!,22  ) - , 3 '?*&L#+,(%(D,=,II)k  
 +7'
 
 
 !   "#;<<   k   '

 

 

 K#)T->-JJk  
 +
 
 K#)"  
 +
 
 $+/RR1R@QRRF* "/;  	 FF .  -> F ) "/;	 
  F 	  G444  %000  6222  6222+r5   r%   Nc                    t          |j        j        d          |_        t          |j        j        d          |_        t          |j        j        d          |_        t          |j        j        d          |_        |j        t          j        k    r| j	        j
        rAt          j        |j                                      t          j                  |j        _        n5t          j        dt          j        |j        j                  |j        _        t          j        |_        t%          j        |j        |j        | j	        j                   d S d S )NF)requires_grad)r   )ri   device)r	   r   r   r   r   r   r   r   r   r   r   rA   argsorttor~   r   r   r   opsgptq_shuffler   )r2   rQ   s     r4   process_weights_after_loadingz.GPTQLinearMethod.process_weights_after_loading_  s    !2%HHH!%-"4EJJJ 0FFF !2%HHH ,"<<< ) #(=#=#=#@#@#K#K  #(;	%+2D$ $ $  #/"4EU]EK9J9VWWWWW =<r5   xbiasc           
         |j         d d         |j        j         d         fz   }|                    d|j         d                   }t          j        ||j        |j        |j        |j        |j        t          j
        k    | j        | j        j                  }||                    |           |                    |          S )Nr   )shaper   reshaper   	gptq_gemmr   r   r   r   r   r   r   r   r   add_)r2   rQ   r   r   	out_shape
reshaped_xoutputs          r4   applyzGPTQLinearMethod.applyr  s     GCRCLEM$7$;#==	YYr172;//
 MLLK<#55)	
 	
 KK~~i(((r5   r@   )rz   r{   r|   r}   r   r-   rA   r   r   r~   rr   ri   r   r   Tensorr   r;   r5   r4   rS   rS      s         IZ I I I Io,xo, #&o, !%S		o,
 o, o, ko, o, o, o,bX58? Xt X X X X. %)	) )x) <) lT!	)
 
) ) ) ) ) )r5   rS   )0r   r   	fractionsr   typingr   r   r   rA   safetensors.torchr   rl   torch.nn.parameterr	   vllmr
   r   vllm.loggerr   *vllm.model_executor.layers.fused_moe.layerr   !vllm.model_executor.layers.linearr   3vllm.model_executor.layers.quantization.base_configr   r   8vllm.model_executor.layers.quantization.utils.gptq_utilsr   vllm.model_executor.parameterr   r   r   r   r   vllm.transformers_utils.configr   vllm.utils.collection_utilsr   'vllm.model_executor.layers.quantizationr    vllm.model_executor.models.utilsr   r   rz   r0   r   r   rS   r;   r5   r4   <module>r      s[                , , , , , , , , , ,  C C C C C C ( ( ( ( ( ( # # # # # # # # # # # # ? ? ? ? ? ? > > > > > >                          K J J J J J 2 2 2 2 2 2 KKKKKK>>>>>>>	X		m? m? m? m? m?# m? m? m?`    4   h) h) h) h) h)' h) h) h) h) h)r5   