
    .`iZ                     D   d dl mZ d dlmZ d dlmZmZ d dlZd dlZd dlm	Z
 d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZmZmZ d dlmZ d dlm Z m!Z! d dl"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,  ee-          Z. G d de           Z/ ei           fde0de1e0         dee0e1e0         f         fdZ2e
j3        e
j4        e
j5        hZ6e
j7        e
j8        e
j9        e
j:        e
j;        e
j<        hZ=e
j>        e
j?        e
j@        e
jA        e
jB        hZCe
jD        e
jE        e
jF        e
jG        e
jH        e
jI        e
jJ        e
jK        e
jL        h	ZMe=eCz  eMz  ZNe=eCz  eMz  ZOe=eCz  ZPdejQ        dejQ        deRdejQ        fdZSdejQ        dejQ        deRdejQ        fdZT	  e,deSeT            ej        j        jS        ZUn# eV$ rZWeWdZW[Www xY wdejQ        d!ejQ        d"ejQ        d#ejQ        d$ejQ        deRd%eRd&e0dejQ        fd'ZXdejQ        d!ejQ        d"ejQ        d#ejQ        d$ejQ        deRd%eRd&e0dejQ        fd(ZY	  e,d)eXeY            ej        j        jX        ZZn# eV$ rZWeWdZW[Www xY w	 d7dejQ        dejQ        deRd*eRd+ej[        dz  dejQ        fd,Z\	 d7dejQ        dejQ        deRd*eRd+ej[        dz  dejQ        fd-Z]	  e,d.e\e]            ej        j        j\        Z^n# eV$ rZWeWdZW[Www xY w G d/ d0e          Z_ G d1 d2e          Z` G d3 d4e_          Za G d5 d6e          ZbdS )8    )Mapping)MappingProxyType)AnyOptionalN)GGMLQuantizationType)	ParameterUninitializedParameter)_custom_ops)init_logger)FusedMoEConfigFusedMoEQuantConfig)FusedMoEFusedMoEMethodBase)
LinearBaseLinearMethodBaseUnquantizedLinearMethod)QuantizationMethods)QuantizationConfigQuantizeMethodBase)UnquantizedEmbeddingMethodVocabParallelEmbedding)WeightsMapper)set_weight_attrs)current_platform)direct_register_custom_opc                   *    e Zd ZdZddee         dz  ddf fdZdefdZdefdZ	dee
j                 fdZedefd	            Zedee         fd
            Zedeeef         dd fd            Zde
j        j        deded         fdZddZ xZS )
GGUFConfigzConfig class for GGUF.Nunquantized_modulesreturnc                 Z    t                                                       |pg | _        d S N)super__init__r   )selfr   	__class__s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/gguf.pyr#   zGGUFConfig.__init__0   s,    #6#<"       c                     dS )NzGGUFConfig() r$   s    r&   __repr__zGGUFConfig.__repr__4   s    ~r'   c                     dS )Nggufr)   r*   s    r&   get_namezGGUFConfig.get_name7   s    vr'   c                     t          j        d          r2t                              d           t          j        t          j        gS t          j        t          j        t          j        gS )Nd   z5GGUF has precision issues with bfloat16 on Blackwell.)r   has_device_capabilityloggerwarning_oncetorchhalffloat32bfloat16r*   s    r&   get_supported_act_dtypesz#GGUFConfig.get_supported_act_dtypes:   sN     1#66 	/ WXXXJ..
ENEM::r'   c                     dS )N<   r)   clss    r&   get_min_capabilityzGGUFConfig.get_min_capabilityB   s    rr'   c                     g S r!   r)   r;   s    r&   get_config_filenameszGGUFConfig.get_config_filenamesF   s    	r'   configc                      |             S r!   r)   )r<   r@   s     r&   from_configzGGUFConfig.from_configJ   s    suur'   layerprefixr   c                    t          |t                    r8t          || j        | j                  rt                      S t          |           S t          |t                    r8t          || j        | j                  rt                      S t          |           S t          |t                    rt          | |j                  S d S r!   )
isinstancer   is_layer_skipped_ggufr   packed_modules_mappingr   GGUFLinearMethodr   r   GGUFEmbeddingMethodr   GGUFMoEMethod
moe_config)r$   rC   rD   s      r&   get_quant_methodzGGUFConfig.get_quant_methodN   s     eZ(( 	9$0$2M  1 /000#D)))566 	9$0$2M  4 2333&t,,,x(( 	9 u'7888tr'   hf_to_vllm_mapperr   c                 V    | j         !|                    | j                   | _         dS dS )a   
        Interface for models to update module names referenced in
        quantization configs in order to reflect the vllm model structure

        :param hf_to_vllm_mapper: maps from hf model structure (the assumed
            structure of the qconfig) to vllm model structure
        N)r   
apply_list)r$   rN   s     r&   apply_vllm_mapperzGGUFConfig.apply_vllm_mapperb   s9     #/'8'C'C(( (D$$$ 0/r'   r!   )rN   r   )__name__
__module____qualname____doc__liststrr#   r+   r   r.   r4   dtyper8   classmethodintr=   r?   dictr   rB   nnModuler   rM   rQ   __classcell__r%   s   @r&   r   r   -   s         = =DI,< = = = = = = =#    -    ;$u{*; ; ; ; ; 3    [ T#Y    [ c3h L    [X_.1	&	'   (       r'   r   rD   r   fused_mappingc                 4                          d          d         |v rV fd|         D             }d }|D ];t          fd|D                       }||}"||k    rt          d  d          <nt           fd|D                       }|J |S )N.c                 <    g | ]}                     |          S r)   )replace).0shard_proj_namerD   	proj_names     r&   
<listcomp>z)is_layer_skipped_gguf.<locals>.<listcomp>{   s7     
 
 
 NN9o66
 
 
r'   c              3       K   | ]}|v V  	d S r!   r)   )rf   module_nameshard_prefixs     r&   	<genexpr>z(is_layer_skipped_gguf.<locals>.<genexpr>   s9       # #0;+# # # # # #r'   z$Detected some but not all shards of zF are quantized. All shards of fused layers to have the same precision.c              3       K   | ]}|v V  	d S r!   r)   )rf   rk   rD   s     r&   rm   z(is_layer_skipped_gguf.<locals>.<genexpr>   s(      VV;.VVVVVVr'   )splitany
ValueError)rD   r   r`   shard_prefixes
is_skippedis_shard_skippedrh   rl   s   `     @@r&   rG   rG   p   s)    S!!"%IM!!
 
 
 
 
#0#;
 
 

 
* 	 	L" # # # #?R# # #     !-

!Z// 26 2 2 2   0	 VVVVBUVVVVV
!!!r'   xqweightqweight_typer   c                     |t           v r|j        d         dk    rdnd}n|j        d         dk    rdnd}| j        d         dk    r8t          j        | j        d         |j        d         | j        | j                  S |t          v r
| |j        z  S | j        d         |k    r,|t          v r#t          j
        || ||j        d                   }n|t          v r#t          j        || ||j        d                   }n|t          v rZt          j        |         \  }}|j        d         |j        d         |z  |z  f}t          j        ||g|| j        R  }| |j        z  }n!t#          |          }t%          d	|           |S )
Nr   i               rX   device   $Unsupported GGUF quantization type: )IMATRIX_QUANT_TYPESshaper4   emptyrX   r~   UNQUANTIZED_TYPESTMMVQ_QUANT_TYPESopsggml_mul_mat_vec_a8MMQ_QUANT_TYPESggml_mul_mat_a8DEQUANT_TYPESr-   GGML_QUANT_SIZESggml_dequantize
WeightTypeNotImplementedError)	ru   rv   rw   	mmvq_safey
block_size	type_sizer   weights	            r&   _fused_mul_mat_ggufr      s    *** q)D00AAb		 q)D00AAa	 	wqzQ{171:w}Q'7qwqxXXXX(((79}wqzY<3C#C#C#GQgmA>NOO		(	(L'-:JKK		&	& $ 5l C
Iq!7=#3y#@:#MN$WlLULAGLLLL
 ",//!"W"W"WXXXHr'   c                 r    t          j        | j        d         |j        d         | j        | j                  S Nr   r}   )r4   r   r   rX   r~   )ru   rv   rw   s      r&   _fused_mul_mat_gguf_faker      s.    
 ;qwqz7=#31718TTTTr'   r   )op_nameop_func	fake_implw1w2topk_weightstopk_idsqweight_type2
activationc                 B   dt           j        ffd}ddlm}	 t          j        |           }
|t
          v r|t
          v r
| j        d         dk    r| j        \  }}|j        \  }}}|j        d         }t          j        |          } |	|||          \  }}}t          j	        | ||||||||	  	        } ||          }t          j	        |||||||j        d         d||z  	  	        }|
                    |||j        d                                       |                    ||d                    }t          j        ||
           n|t          v r|t          v r| j        \  }}|j        \  }}}|j        d         }t          j        | ||||||          } ||          }t          j        |||d||j        d         ||z            }|
                    |||j        d                                       |                    ||d                    }t          j        ||
           nt                               d           t%          t'          ||                    D ]\  }\  }}| |         
                    d| j        dd          z             }d }t'          ||          D ]o\  }}||         }t)          |||          } ||          }||         }t)          |||                              |          }||}Z|                    |           p||
|<   |
S )	Nru   c                 d   | j         d         dz  }| j         d d         |fz   }t          j        || j        | j                  }dk    r&t          j        j                            ||            n>dk    r&t          j        j                            ||            nt          d           |S )Nrc   r{   r}   silugeluzUnsupported activation: )
r   r4   r   rX   r~   r   _Csilu_and_mulgelu_and_mulrq   )ru   doutput_shapeoutr   s       r&   actz_fused_moe_gguf.<locals>.act   s    GBK1wss|qd*k,agahGGGIL%%c1----6!!IL%%c1----D
DDEEE
r'   r   )moe_align_block_size@   r   znThere is no support for fast MoE kernel for current quantization method. Falling back to slow implementation. )r   )r4   Tensor.vllm.model_executor.layers.fused_moe.fused_moer   
empty_liker   r   r   ggml_moe_get_block_sizeggml_moe_a8reshapemul_viewmoe_sumr   ggml_moe_a8_vecr2   r3   	enumeratezipfused_mul_mat_ggufadd_)ru   r   r   r   r   rw   r   r   r   r   out_hidden_states
num_tokens_ENtop_k
BLOCK_SIZEsorted_token_ids
expert_idsnum_tokens_post_paddedr   tokwidxinpcurrent_hidden_statewwii	expert_upexpert_downcurrent_states          `                       r&   _fused_moe_ggufr      s   
u| 
 
 
 
 
 
 TSSSSS(++ 	((O++GAJOO
A(1aq!0>>
?S?Sj!@
 @
<*&< o"

 

 c#hho"HQK

 

 kk*eRXa[99>>j%33
 
 	C*++++	*	*	*|?O/O/O
A(1aq!!!R5,:VVc#hh!Xq-!j5>P
 
 kk*eRXa[99>>j%33
 
 	C*++++4	
 	
 	

 's<'B'BCC 	: 	:MC!SC&..!344C#' a++ = =BrF	(iFFc#hh f 2m! !$r((  (/+8(((--m<<<<%9c""r'   c                 *    t          j        |           S r!   )r4   r   )ru   r   r   r   r   rw   r   r   s           r&   _fused_moe_gguf_faker   \  s     Ar'   r   hidden_sizerX   c                    |t           v rt          j        ||           S |t          v rt          j        |         \  }}|                                 }||j        d         |z  |z  k    sJ t          j        |d|          }t          j
        ||||j        d         |          }	 |	j        g | j        |R  S t          |          }t          d|           )Nr   r   )dimindexr   )r   r4   	embeddingr   r-   r   flattenr   index_selectr   r   r   r   r   )
ru   rv   rw   r   rX   r   r   x_flatquantdequants
             r&   _apply_gguf_embeddingr   u  s     (((w***		&	& $ 5l C
IgmA.);jHHHHH"7@@@%<fl1ou
 
 w|2QW2k2222!,//!"W"W"WXXXr'   c                 R    t          j        | j        d         ||| j                  S r   )r4   r   r   r~   )ru   rv   rw   r   rX   s        r&   _apply_gguf_embedding_faker     s%     ;qwqz;eAHMMMMr'   r   c                       e Zd ZdZdefdZdej        j        de	de
e	         de	de	d	ej        fd
Zdej        j        fdZdej        j        fdZ	 ddej        j        dej        dej        dz  dej        fdZdS )rI   z[Linear method for GGUF.

    Args:
        quant_config: The GGUF quantization config.
    quant_configc                     || _         d S r!   )r   )r$   r   s     r&   r#   zGGUFLinearMethod.__init__  s    (r'   rC   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                    || _         t          |          }||f}	t          d          }
t          |
dd|	dg g i d           t          |
|           |                    d|
           t          t          j        t          |          t          j	                  d          }t          |ddi dd	           t          ||           |                    d
|           d S )NFrequires_gradr   r   T)	input_dim
output_dimtensor_shapeis_gguf_weightdata_containershard_idshard_id_maprv   rX   )is_gguf_weight_typeweight_typeshard_weight_typeignore_warningrw   )
r   sumGGUFUninitializedParameterr   register_parameterr   r4   r   lenuint8)r$   rC   r   r   r   r   r   extra_weight_attrsoutput_size_per_partitionr   rv   rw   s               r&   create_weightszGGUFLinearMethod.create_weights  s#    )$'(>$?$?!13KL,5AAA ,"&"$ " 	
 	
 	
 	"4555  G444 K2335;GGG
 
 
 	'+ %'"&	 	
 	
 	
 	'9:::  >>>>>r'   c                     |j         j        }|t          v s.|t          v s%t	          |          }t          d| d| d          |                     |           d S )Nz#Unsupported GGUF quantization type z
 in layer rb   )rw   r   r   r   r   rq   _create_padded_weight_param)r$   rC   rw   s      r&   process_weights_after_loadingz.GGUFLinearMethod.process_weights_after_loading  st    )5 111\]5R5R%l33LVlVVeVVV  
 	((/////r'   c                    |j         }|j        }|j        }t          |j        x}          dk    rd |D             }t          |          dk    sJ t          d|                       t          t          |                    }t          d |D                       }t          d |D                       }t          j        ||f||j                  }	t          t          t          t           t           t           f         f                     }
|D ]}}||         }t          d |d|         D                       }|||                             d	          z   }||                             d          }||         |	||d|f<   |||f|
|<   ~|j                                         t'          |	d
          }t)          |t+          |                     t)          |d|
i           |                    d|           dS dS )z;Create padded weight parameter for GGUF MergedLinear layer.r   c                     h | ]	}|j         
S r)   r   )rf   datas     r&   	<setcomp>z?GGUFLinearMethod._create_padded_weight_param.<locals>.<setcomp>  s    ;;;DTZ;;;r'   z!Data container has mixed dtypes: c              3   @   K   | ]}|                     d           V  dS )r   Nsizerf   ru   s     r&   rm   z?GGUFLinearMethod._create_padded_weight_param.<locals>.<genexpr>  ,      @@AaffQii@@@@@@r'   c              3   @   K   | ]}|                     d           V  dS r   Nr  r
  s     r&   rm   z?GGUFLinearMethod._create_padded_weight_param.<locals>.<genexpr>  r  r'   r}   c              3   @   K   | ]}|                     d           V  dS r  r  r
  s     r&   rm   z?GGUFLinearMethod._create_padded_weight_param.<locals>.<genexpr>  s,      PP!AFF1IIPPPPPPr'   Nr   Fr   shard_offset_maprv   )rv   r   r   r   r   rq   nextitermaxr   r4   zerosr~   r[   rW   tuplerZ   r	  clearr   r   varsr   )r$   rC   rv   r   r   r   rX   padded_sideconcat_sidepadded_datar  r   id_in_containerstartendr	  padded_params                    r&   r  z,GGUFLinearMethod._create_padded_weight_param  s)   -+#!77~881<<;;N;;;Eu::???J;E;;% %??? e%%E@@@@@@@K@@@@@@@K  +k*%  K  $CsC})=$=>@@ ; ;".s"3PP~>N>N/OPPPPPn_=BB1EEE%o6;;A>>0>0OE#Iuu,-).T(: %%"((***$[FFFL\4==999\,>@P+QRRR$$Y=====7 =<r'   Nru   biasr   c           
         |j         j        }|rd|v rg dn|}|j         }g }|D ]l}|j         j        |         \  }}	}
|j        j        |         }|                    t          ||||	d |
f                                         |                     mt          j	        |d          }n$|j         }|j        j
        }t          |||          }||                    |           |S )Nq)r   kvr   )axis)rv   r   r  rw   r   appendr   
contiguousr4   catr   r   )r$   rC   ru   r  r   rv   resultr   r  r  offsetrw   r   s                r&   applyzGGUFLinearMethod.apply  s    =) 	?*-//xHmGF  %*]%CC%H"sF$1CCH&759gvg#56AACC\    
 )F+++CCmG -9L$Q>>CHHTNNN
r'   r!   )rR   rS   rT   rU   r   r#   r4   r\   r]   rZ   rV   rX   r   r  r  r   r)  r)   r'   r&   rI   rI     s$        )Z ) ) ) ),?x,? #&,? !%S		,?
 ,? ,? k,? ,? ,? ,?\	058? 	0 	0 	0 	0 >  >  >  >  >L %)	 x < lT!	
 
     r'   rI   c                        e Zd ZdZdedef fdZdej        j	        de
de
de
d	ej        f
d
Zdej        j	        dedz  fdZdedej        dej        dej        dej        eej        ej        f         z  f
dZ xZS )rK   zXMoE method for GGUF.

    Args:
        quant_config: The GGUF quantization config.
    r   moec                 X    t                                          |           || _        d S r!   )r"   r#   r   )r$   r   r+  r%   s      r&   r#   zGGUFMoEMethod.__init__-  s+    
 	(r'   rC   num_expertsr   intermediate_size_per_partitionr   c           	         |d|z  |f}t          d          }t          |dd|dg d           t          ||           |                    d|           t          t	          j        dt          j        	          d          }	t          |	dddd
           t          |	|           |                    d|	           |||f}t          d          }
t          |
dd|dg d           t          |
|           |                    d|
           t          t	          j        dt          j        	          d          }t          |dddd
           t          ||           |                    d|           d S )Nr{   Fr   r   r   T)r   r   r   r   r   w13_qweightr   )r   r   r   w13_qweight_type
w2_qweightw2_qweight_type)r   r   r   r   r4   r   r   )r$   rC   r-  r   r.  r   r   r   r0  r1  r2  r3  s               r&   r   zGGUFMoEMethod.create_weights5  s    $Q)H%H+V0uEEE ,"&"$ 		
 		
 		
 	&8999  <<<$K---U
 
 
 	$(dSS	
 	
 	
 	)+=>>>  !35EFFF#%DkR/eDDD
 ,"&"$ 		
 		
 		
 	%7888  z:::#K---U
 
 
 	$(dSS	
 	
 	

 	*<===  !2ODDDDDr'   r   Nc                     d S r!   r)   )r$   rC   s     r&   get_fused_moe_quant_configz(GGUFMoEMethod.get_fused_moe_quant_configs  s	     tr'   ru   r   r   c           
          |j         dk    s
J d            |j        rt          d          t          ||j        |j        |||j        j        |j        j        |j                   S )Nr   z"Only SiLU activation is supported.zGApply router weight on input is not supported forfused GGUF MoE method.)	r   apply_router_weight_on_inputr   fused_moe_ggufr0  r2  r1  r   r3  )r$   rC   ru   r   r   s        r&   r)  zGGUFMoEMethod.applyx  s     6)))+O)))- 	%)  
 ".!-	
 	
 		
r'   )rR   rS   rT   rU   r   r   r#   r4   r\   r]   rZ   rX   r   r   r5  r   r   r  r)  r^   r_   s   @r&   rK   rK   &  s9        ) ) ) ) ) ) ) )<Ex<E <E 	<E
 *-<E k<E <E <E <E|X_	t	#   


 <
 l	

 ,
 
elEL89	9
 
 
 
 
 
 
 
r'   rK   c                   N    e Zd ZdZdej        j        dej        dej        fdZdS )rJ   z^Embedding method for GGUF.

    Args:
        quant_config: The GGUF quantization config.
    rC   ru   r   c                 t    |j         }|j        j        }|j        d         }t	          ||||| j                  S )Nr   r   )rv   rw   r   r   apply_gguf_embeddingr   )r$   rC   ru   rv   rw   r   s         r&   r   zGGUFEmbeddingMethod.embedding  sF    -)5*1-#wk9J
 
 
 	
r'   N)	rR   rS   rT   rU   r4   r\   r]   r   r   r)   r'   r&   rJ   rJ     sO         
ux 
5< 
EL 
 
 
 
 
 
r'   rJ   c                   4    e Zd ZU eZeej                 ed<   dS )r   r   N)	rR   rS   rT   r   cls_to_becomerV   r4   r   __annotations__r)   r'   r&   r   r     s*         M&&&&&&r'   r   r!   )ccollections.abcr   typesr   typingr   r   r-   r4   r   r   torch.nn.parameterr   r	   vllmr
   r   vllm.loggerr   +vllm.model_executor.layers.fused_moe.configr   r   *vllm.model_executor.layers.fused_moe.layerr   r   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.quantization.base_configr   r   3vllm.model_executor.layers.vocab_parallel_embeddingr   r    vllm.model_executor.models.utilsr   vllm.model_executor.utilsr   vllm.platformsr   vllm.utils.torch_utilsr   rR   r2   r   rW   rV   rG   F32F16BF16r   Q4_0Q4_1Q5_0Q5_1Q8_0Q8_1STANDARD_QUANT_TYPESQ2_KQ3_KQ4_KQ5_KQ6_KKQUANT_TYPESIQ1_MIQ1_SIQ2_XXSIQ2_XSIQ2_SIQ3_XXSIQ3_SIQ4_XSIQ4_NLr   r   r   r   r   rZ   r   r   r   AttributeErrorerrorr   r   r8  rX   r   r   r;  rI   rK   rJ   r   r)   r'   r&   <module>rj     sm   $ # # # # # " " " " " "                   3 3 3 3 3 3 @ @ @ @ @ @ @ @ # # # # # # # # # # # #                       
 H G G G G G               ; : : : : : 6 6 6 6 6 6 + + + + + + < < < < < <	X		@ @ @ @ @# @ @ @L .>-=b-A-A" ""c" 3S	>*" " " "J  ^Z^Z_E OOOOOO  OOOOO 
  %|36II',69LL &5 | #l := 
\       FU|U\U U \	U U U U	%#*   
 ;   
Kj|jj 	j ,	j
 lj j j j \j j j jZ
|

 	
 ,	

 l
 
 
 
 \
 
 
 
	!&   
 Y^3NN   
K !%Y Y|Y\Y Y 	Y
 ;Y \Y Y Y Y8 !%N N|N\N N 	N
 ;N \N N N N	'%,   
 !9>?   
KA A A A A' A A AHi
 i
 i
 i
 i
& i
 i
 i
X
 
 
 
 
* 
 
 
"' ' ' ' '!7 ' ' ' ' 'sH   F? ?GGG7I I#II#9K K%K  K%