
    .`iW'                        d dl mZmZmZ d dlZd dlmZ d dlm	Z
 d dlmZ d dlmZ d dlmZmZmZ d dlmZmZ d d	lmZ d d
lmZmZ d dlmZ erd dlmZ d dlm Z   ee!          Z" G d de          Z# G d de          Z$dS )    )TYPE_CHECKINGAnyUnionN)_TYPES)_custom_ops)init_logger)FusedMoE)
LinearBaseLinearMethodBaseUnquantizedLinearMethod)QuantizationConfigQuantizeMethodBase)is_layer_skipped)GroupQuantScaleParameterPackedvLLMParameter)get_safetensors_params_metadata)QuantizationMethods)WeightsMapperc                   R    e Zd ZdZ	 ddedededee         dz  ddf
 fdZdefd	Z	ddZ
deej                 fdZedefd            Zedee         fd            Zedeeef         dd fd            Zdej        j        deded         dz  fdZddZddededz  fdZ xZS )	AWQConfigzKConfig class for AWQ.

    Reference: https://arxiv.org/abs/2306.00978
    Nweight_bits
group_size
zero_pointmodules_to_not_convertreturnc                     t                                                       || _        || _        || _        |pg | _        | j        dk    rt          d| j         d          d| j        z  | _        d S )N   zHCurrently, only 4-bit weight quantization is supported for AWQ, but got z bits.    )super__init__r   r   r   r   
ValueErrorpack_factor)selfr   r   r   r   	__class__s        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/awq.pyr    zAWQConfig.__init__&   s     	&$$&<&B#q  9 $ 09 9 9   !11    c           	      H    d| j          d| j         d| j         d| j         d	S )NzAWQConfig(weight_bits=z, group_size=z, zero_point=z, modules_to_not_convert=))r   r   r   r   r#   s    r%   __repr__zAWQConfig.__repr__:   sZ    ET%5 E E/E E/E E '+&AE E E	
r&   r   c                     dS )Nawq r)   s    r%   get_namezAWQConfig.get_nameB   s    ur&   c                     t           j        gS N)torchhalfr)   s    r%   get_supported_act_dtypesz"AWQConfig.get_supported_act_dtypesE   s    
|r&   c                     dS )NK   r-   )clss    r%   get_min_capabilityzAWQConfig.get_min_capabilityH   s	     rr&   c                  
    ddgS )Nzquant_config.jsonzquantize_config.jsonr-   r-   r&   r%   get_config_filenameszAWQConfig.get_config_filenamesM   s      "
 	
r&   configc                     |                      |ddg          }|                      |ddg          }|                      |dg          }|                     |dgd           } | ||||          S )Nw_bitbitsq_group_sizer   r   r   )get_from_keysget_from_keys_or)r6   r:   r   r   r   r   s         r%   from_configzAWQConfig.from_configU   s    ''&0ABB&&v/MNN
&&v~>>
!$!5!5-."
 "
 s;
J8NOOOr&   layerprefix)r   r   c                 l   t          |t                    r:t          || j        | j        d          rt                      S t          |           S t          |t                    rddlm	} ddl
m} ddlm}  ||| j                  sdt                              d| d           d	| j        | j        | j        d
| j        d}|                    |                              ||          S d	| j        | j        | j        d
| j        d}|                    |          }|                    ||          S d S )NT)skip_with_substr   )AWQMarlinConfig)MoeWNA16Config)check_moe_marlin_supports_layerzLayer 'zF' is not supported by AWQMoeMarlin. Falling back to Moe WNA16 kernels.r,   F)quant_methodr=   r   r   lm_headr   )
isinstancer
   r   r   packed_modules_mappingr   AWQLinearMethodr	   
awq_marlinrG   	moe_wna16rH   utils.marlin_utilsrI   r   loggerwarning_oncer   r   rA   get_quant_method)	r#   rB   rC   rG   rH   rI   r:   marlin_compatible_config_dictawq_marlin_configs	            r%   rT   zAWQConfig.get_quant_method_   s    eZ(( *	E++!%	   1 /000"4(((x(( !	E333333111111KKKKKK225$/JJ ##9f 9 9 9  
 %* ,"&/"&/$.2.I  &11&99JJ6   !&("o"o *.*E- -) !0 ; ;-! ! %55eVDDDtr&   hf_to_vllm_mapperr   c                 V    | j         r!|                    | j                   | _         d S d S r0   )r   
apply_list)r#   rW   s     r%   apply_vllm_mapperzAWQConfig.apply_vllm_mapper   s;    & 	*;*F*F++ +D'''	 	r&   
model_namerevisionc                    | j         rd S t          j        t          j        t          j        gt          ||          }d |D             }fd|                                D             }t          ||z
            | _         d S )N)r\   c                 F    h | ]}|                     d d          d         S ).rF   r   )rsplit).0
param_names     r%   	<setcomp>z0AWQConfig.maybe_update_config.<locals>.<setcomp>   s-    JJJ:*##C++A.JJJr&   c                     h | ]H\  }}|                     d d          xr+t                   v,|                    dd          d         IS )dtypeNr_   rF   r   )get_SAFETENSORS_TO_TORCH_DTYPEr`   )ra   rb   infore   unquant_dtypess      r%   rc   z0AWQConfig.maybe_update_config.<locals>.<setcomp>   sl     "
 "
 "
 
D'4000"
 ,E2.HH c1%%a( IHHr&   )r   r1   float16bfloat16float32r   itemslist)r#   r[   r\   metadatalayersquant_layersre   ri   s         @@r%   maybe_update_configzAWQConfig.maybe_update_config   s    & 	F-G2:QQQJJJJJ"
 "
 "
 "
 "
$,NN$4$4"
 "
 "
 '+6L+@&A&A###r&   r0   )r   r   )rW   r   )__name__
__module____qualname____doc__intboolrn   strr    r*   r.   r1   re   r3   classmethodr7   staticmethodr9   dictr   rA   nnModuler   rT   rZ   rr   __classcell__)r$   s   @r%   r   r       s         482 22 2 	2
 !%S	D 02 
2 2 2 2 2 2(
# 
 
 
 
   $u{*;     3    [ 
$s) 
 
 
 \
 Pc3h PK P P P [P.X_..1.	7	84	?. . . .`   B Bc BS4Z B B B B B B B Br&   r   c                       e Zd ZdZdefdZdej        j        de	de
e	         de	de	d	ej        fd
Zdej        j        ddfdZ	 ddej        j        dej        dej        dz  dej        fdZdS )rN   zYLinear method for AWQ.

    Args:
        quant_config: The AWQ quantization config.
    quant_configc                     || _         d S r0   )r   )r#   r   s     r%   r    zAWQLinearMethod.__init__   s    (r&   rB   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                 
   | j         j        dk    r| j         j        }n|}||z  dk    rt          d          t          |          }	|	| j         j        z  dk    rt          d          |                    d          }
t          t          j        ||	| j         j        z  t          j	                  ddd| j         j        |
          }||z  }t          t          j        ||	| j         j        z  t          j	                  ddd| j         j        |
          }t          t          j        ||	|          dd|
	          }|                    d
|           |                    d|           |                    d|           d S )Nr   ztThe input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.zuThe output size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.weight_loader)re   rF   )data	input_dim
output_dim
packed_dimpacked_factorr   )r   r   r   r   qweightqzerosscales)r   r   r!   sumr"   rf   r   r1   emptyint32r   register_parameter)r#   rB   r   r   r   r   r   extra_weight_attrsr   output_size_per_partitionr   r   
num_groupsr   r   s                  r%   create_weightszAWQLinearMethod.create_weights   s    '2--*5JJ#J#j0A55(   %((>$?$?!$t'8'DDII(   +..??%()T->-JJk  
 +7'
 
 
 .;
$)T->-JJk  
 +7'
 
 
 *)"  
 '	
 	
 	
 	  G444  6222  622222r&   r   Nc                 &   t           j                            |j        j        d          |_        t           j                            |j        j        d          |_        t           j                            |j        j        d          |_        d S )NF)requires_grad)r1   r}   	Parameterr   r   r   r   )r#   rB   s     r%   process_weights_after_loadingz-AWQLinearMethod.process_weights_after_loading   si    **5=+=U*SSx))%,*;5)QQx))%,*;5)QQr&   xbiasc                    |j         }|j        }|j        }| j        j        }|j        d d         |j        d         |z  fz   }|                    d|j        d                   }	|j        d d                                         dk    }
|
r/t          j	        |||ddd          }t          j        |	|          }nt          j        |	||||          }||                    |           |                    |          S )Nr      r   )r   r   r   r   r"   shapereshapenumelopsawq_dequantizer1   matmulawq_gemmadd_)r#   rB   r   r   r   r   r   r"   	out_shape
reshaped_xFP16_MATMUL_HEURISTIC_CONDITIONouts               r%   applyzAWQLinearMethod.apply   s     -'3GCRCLGM"$5$C#EE	YYr172;//
 +,'#2#,*<*<*>*>#*E'* 	Q$WffaAFFC,z3//CC,z7FFKPPCHHTNNN{{9%%%r&   r0   )rs   rt   ru   rv   r   r    r1   r}   r~   rw   rn   re   r   r   Tensorr   r-   r&   r%   rN   rN      s        )Y ) ) ) )I3xI3 #&I3 !%S		I3
 I3 I3 kI3 I3 I3 I3VR58? Rt R R R R %)	& &x& <& lT!	&
 
& & & & & &r&   rN   )%typingr   r   r   r1   safetensors.torchr   rg   vllmr   r   vllm.loggerr   *vllm.model_executor.layers.fused_moe.layerr	   !vllm.model_executor.layers.linearr
   r   r   3vllm.model_executor.layers.quantization.base_configr   r   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.model_executor.parameterr   r   vllm.transformers_utils.configr   'vllm.model_executor.layers.quantizationr    vllm.model_executor.models.utilsr   rs   rR   r   rN   r-   r&   r%   <module>r      s   - , , , , , , , , ,  C C C C C C # # # # # # # # # # # # ? ? ? ? ? ?         
        W V V V V V W W W W W W W W J J J J J J ?KKKKKK>>>>>>	X		BB BB BB BB BB" BB BB BBJq& q& q& q& q&& q& q& q& q& q&r&   