
    .`iz                        d dl m Z  d dlmZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmc mc mc mZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZmZ d dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z? d dl@mAZAmBZB d dlCmDZDmEZEmFZF d dlGmHZH d dlImJZJmKZKmLZL d dlMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZV d dlWmXZXmYZY d dlZm[Z[m\Z\m]Z]m^Z^ d dl_m`Z` d dlambZbmcZc erd dldmeZe  eef          Zgg dZhdgZi G d d e;          Zj G d! d"e6          Zk G d# d$ek          Zl G d% d&e1          Zm G d' d(e1          Zn G d) d*e1          Zo G d+ d,e          Zpemel_q        epel_r        ejel_s         G d- d.ek          Zt G d/ d0e1          Zu G d1 d2e          Zveuet_q        evet_r        ejet_s        dS )3    )fnmatch)TYPE_CHECKINGAnyOptionalN)Module)	Parameter)cutlass_scaled_fp4_mmscaled_fp4_quant)	Attention)init_logger)FusedMoEConfigFusedMoEQuantConfig)FusedMoEFusedMoEMethodBaseFusedMoeWeightScaleSupported)Fp8MoeBackend convert_to_fp8_moe_kernel_formatmake_fp8_moe_kernelmake_fp8_moe_kernel_for_mkmmake_fp8_moe_quant_configselect_fp8_moe_backend)NvFp4MoeBackend"convert_to_nvfp4_moe_kernel_format(is_global_sf_supported_for_nvfp4_backendmake_nvfp4_moe_kernelmake_nvfp4_moe_kernel_for_mkmmake_nvfp4_moe_quant_configselect_nvfp4_moe_backend)
LinearBaseLinearMethodBaseUnquantizedLinearMethod)QuantizationMethods)QuantizationConfigQuantizeMethodBase)init_fp8_linear_kernel)BaseKVCacheMethod)1build_flashinfer_fp4_cutlass_moe_prepare_finalizeflashinfer_trtllm_fp4_moe flashinfer_trtllm_fp4_routed_moe)"apply_fi_trtllm_fp8_per_tensor_moe1build_flashinfer_fp8_cutlass_moe_prepare_finalize)W8A8BlockFp8LinearOp%process_fp8_input_tensor_strategy_moe&process_fp8_weight_tensor_strategy_moe)get_marlin_input_dtype)apply_fp4_marlin_linearis_fp4_marlin_supportedprepare_fp4_layer_for_marlin)	
GroupShapecutlass_fp4_supportedis_layer_skippedkFp8DynamicTokenSymkFp8StaticTensorSymkFp8StaticTokenSymkNvfp4DynamickNvfp4Staticswizzle_blockscale)cutlass_block_fp8_supportedrequantize_with_max_scale)BlockQuantScaleParameterChannelQuantScaleParameterModelWeightParameterPerTensorScaleParameter)replace_parameter)flashinfer_scaled_fp4_mmhas_flashinfer)WeightsMapper)FP8FP8_PER_CHANNEL_PER_TOKEN	FP8_PB_WONVFP4rF   c                   $     e Zd ZdZd fdZ xZS )ModelOptFp8KVCacheMethodzI
    Supports loading kv-cache scaling factors from FP8 checkpoints.
    quant_configModelOptQuantConfigBasec                 J    t                                          |           d S N)super__init__)selfrL   	__class__s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/modelopt.pyrQ   z!ModelOptFp8KVCacheMethod.__init__   s!    &&&&&    )rL   rM   )__name__
__module____qualname____doc__rQ   __classcell__rS   s   @rT   rK   rK   {   sG         ' ' ' ' ' ' ' ' ' 'rU   rK   c                   X    e Zd ZU eZeed<   eZeed<   e	Z
eed<   dee         f fdZdedefdZd	ej        j        deded
         fdZddZedee         fd            Zedededz  dee         deeef         dedz  dd fd            Zedeeef         dd fd            Z xZS )rM   LinearMethodClsFusedMoEMethodClsKVCacheMethodClsexclude_modulesc                 V    t                                                       || _        d S rO   )rP   rQ   r`   )rR   r`   rS   s     rT   rQ   z ModelOptQuantConfigBase.__init__   s)     	*9rU   prefixreturnc                 4   t          | j                  dk    rdS t          || j        | j                  rdS | j        D ];}||k    r3||v s,|                    d          r||                    d          v r dS <| j        D ]}t          ||          r dS dS )z
        Check if a layer should be excluded from quantization.

        Handles both exact matching (for fused layers) and ModelOpt wildcard matching.

        The ModelOpt exclude_modules list is a list of wildcards.
        r   FTzlanguage_model.)lenr`   r5   packed_modules_mapping
startswithremoveprefixr   )rR   rb   exclude_modulewildcard_patterns       rT   is_layer_excludedz)ModelOptQuantConfigBase.is_layer_excluded   s     t#$$))5 FD$8$:UVV 	4 #2 		 		N''&((%%&788 ) '&*=*=>O*P*PPP tt !% 4 	 	v/00 tt urU   layerr$   c                 \   t          |t                    r|                     |           S |                     |          r%t          |t                    rt                      S d S d|v sd|v rt                      S t          |t                    r@|                     |           }t          |dd          dk    rt          |          |_	        |S t          |t                    rG|                     | |j                  }t          |dd          dk    rt          |          |_	        |S d S )Nvision_towervision_modelbackend marlin)rL   
moe_config)
isinstancer   r_   rk   r   r!   r]   getattrr/   marlin_input_dtyper   r^   rs   )rR   rl   rb   quant_methods       rT   get_quant_methodz(ModelOptQuantConfigBase.get_quant_method   sD    eY'' 	/((... !!&)) 	%,, 1.0004 V##~'?'?*,,, eZ(( 	 //55L|Y33x??2H2P2P/x(( 	 11!e.> 2  L |Y33x??2H2P2P/trU   hf_to_vllm_mapperrE   c                    t          | j                  dk    rg }| j        D ]}t          |          dk    rV|d         dk    rJ|d         dk    r>|                    |d d                    |                    |d d         dz              k|                    |           |                    |          | _        d S d S )Nr      *.z.*)re   r`   append
apply_list)rR   ry   new_exclude_modulesexcludes       rT   apply_vllm_mapperz)ModelOptQuantConfigBase.apply_vllm_mapper   s    t#$$q(( #%/ 8 8w<<1$$););s@R@R'..wss|<<<'..wss|d/BCCCC'..w7777#4#?#?@S#T#TD   % )(rU   c                      dgS )Nzhf_quant_config.json r   rU   rT   get_config_filenamesz,ModelOptQuantConfigBase.get_config_filenames   s    &''rU   rw   kv_cache_quant_methodNoriginal_config
group_sizec                     t          d          )Nz-Please implement this function in sub classes)NotImplementedError)clsrw   r   r`   r   r   s         rT   _from_configz$ModelOptQuantConfigBase._from_config   s     ""QRRRrU   configc                 t   d|v r|                      |dg          }t          |t                    st          d          |                    d          }|                    d          }|                    d          }|                    dg           }nU|                    d          }|                    d          }|                    dg           }|                    d          }|st          d          t          |                                          }|nHt          |t
                    st          d	t          |                     |                                }t          |t                    st          d
t          |                     |d }n^t          |t                    r|}nF	 t          |          }n5# t          t          f$ r! t          dt          |                     d w xY w|t          vrt          dt           d          |                     |||||          S )Nquantizationz4Expected 'quantization' to be a dictionary in config
quant_algokv_cache_quant_algor   r`   ignorez+Missing 'quant_algo' in quantization configz*kv_cache_quant_algo must be a string, got z$exclude_modules must be a list, got z#group_size must be an integer, got z"ModelOpt currently only supports: zj quantizations in vLLM. Please check the `hf_quant_config.json` file for your model's quant configuration.)rw   r   r`   r   r   )get_from_keysrt   dict
ValueErrorgetstruppertypelistint	TypeErrorQUANT_ALGOSr   )r   r   rL   rw   r   group_size_rawr`   r   s           rT   from_configz#ModelOptQuantConfigBase.from_config   s    V## ,,Vn5EFFLlD11 Y !WXXX'++L99L %1$4$45J$K$K! *--l;;N +../@"EEOO "::l33L$*JJ/D$E$E!$jj266O#ZZ55N 	LJKKK <((..00 (1377 	B1-..1 1  
 %:$?$?$A$A!/400 	NtO7L7LNN   !JJ,, 	'JJ 00

	*    P$~:N:NPP 
 {**'[ ' ' '   %"7+!"   
 
 	
s   :G
 
2G<)ry   rE   )rV   rW   rX   r    r]   r   __annotations__r   r^   r&   r_   r   r   rQ   boolrk   torchnnr   r   rx   r   staticmethodr   classmethodr   r   r   r   r   rZ   r[   s   @rT   rM   rM      s        ,OT,,,0t000.d...:c: : : : : :$ $ $ $ $ $L#X_#.1#	&	'# # # #JU U U U* ($s) ( ( ( \( 	S 	S  #Tz		S
 c	S c3h	S $J	S 
#	S 	S 	S [	S K
c3h K
4M K
 K
 K
 [K
 K
 K
 K
 K
rU   rM   c                       e Zd ZdZdedededz  dee         ddf
 fdZdefd	Z	dee
j                 fd
Zedefd            Zededz  fd            Zedededz  dee         deeef         dedd fd            Z xZS )ModelOptFp8ConfigzConfig class for ModelOpt FP8.rw   is_checkpoint_fp8_serializedr   Nr`   rc   c                 p   t                                          |           || _        || _        || _        |rt
                              d|           | j        dk    rt          | _        d S | j        dk    rt          | _        d S | j        dk    rt          | _        d S t          d| j         d          )NzoDetected ModelOpt fp8 checkpoint (quant_algo=%s). Please note that the format is experimental and could change.rF   rG   rH   z.Unsupported ModelOpt FP8 quant_algo for vLLM: z9. Supported: FP8 / FP8_PER_CHANNEL_PER_TOKEN / FP8_PB_WO.)rP   rQ   rw   r   r   loggerwarningModelOptFp8LinearMethodr]   ModelOptFp8PcPtLinearMethodModelOptFp8PbWoLinearMethodr   )rR   rw   r   r   r`   rS   s        rT   rQ   zModelOptFp8Config.__init__R  s     	)))(,H)%:"' 	NND   %%#:D   "===#>D   +--#>D   9$9 9 9  rU   c                     dS )Nmodeloptr   rR   s    rT   get_namezModelOptFp8Config.get_namer  s    zrU   c                 2    t           j        t           j        gS rO   )r   bfloat16halfr   s    rT   get_supported_act_dtypesz*ModelOptFp8Config.get_supported_act_dtypesu  s    
++rU   c                     dS )NY   r   r   s    rT   get_min_capabilityz$ModelOptFp8Config.get_min_capabilityx      rrU   c                    |dS |                     dd                                          }|dk    rdS d|v rY|d         }t          |t                    r;t	          |                     dd                    }d|                                v rdS n;t	          |                     dd                    }d|                                v rdS dS )zSDetect if this ModelOpt config should be used based on
        quantization config.Nrw   rq   r   r   r   rF   r   lowerrt   r   r   r   r   hf_quant_cfg
user_quantrw   rL   r   s         rT   override_quantization_methodz.ModelOptFp8Config.override_quantization_method|  s     4 $'';;AACC :%%4 \))'7L,-- & !1!1,!C!CDD
J,,....%: \--lB??@@J
((****!ztrU   r   kwargsc                &    d|v } | ||||          S )NrF   r   )r   rw   r   r`   r   r   r   s          rT   r   zModelOptFp8Config._from_config  s/     (-'<$s(!	
 
 	
rU   )rV   rW   rX   rY   r   r   r   rQ   r"   r   r   dtyper   r   r   r   r   r   r   r   rZ   r[   s   @rT   r   r   O  sz       (( '+  #Tz	
 c 
     @-    ,$u{*; , , , , 3    [ 	t	#   [> 
 
  #Tz	

 c
 c3h
 
 

 
 
 [
 
 
 
 
rU   r   c                       e Zd ZdZdeddfdZdej        j        de	de
e	         d	e	d
e	dej        fdZdeddfdZ	 ddej        j        dej        dej        dz  dej        fdZdS )r   a  Linear method for Model Optimizer static quantization.
    Supports loading FP8 checkpoints with static weight scale and
    activation scale. Future support might be added for dynamic
    scales.

    Limitations:
    1. Only support per-tensor quantization due to torch._scaled_mm support.
    2. Only support float8_e4m3fn datatype
        Args: quant_config: The ModelOpt quantization config.
    rL   rc   Nc                     || _         t          t          t          t          j                    | j        j                  | _        d S N)activation_quant_keyweight_quant_key	out_dtypemodule_name)rL   r%   r7   r   get_default_dtyperS   rV   
fp8_linearrR   rL   s     rT   rQ   z ModelOptFp8LinearMethod.__init__  s>    (0!40-///	
 
 
rU   rl   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                 0   ~~t          |          }|                    d          }	||_        ||_        ||_        | j        j        rt          j        n|}
t          t          j
        |||
          dd|	          }|                    d|           | j        j        rt          t          j
        t          |          t          j                  |	          }t          j        t          j                  j        |d d <   |                    d|           t          t          j
        t          |          t          j                  |	          }t          j        t          j                  j        |d d <   |                    d	|           d S d S )
Nweight_loaderr      r   data	input_dim
output_dimr   weightr   r   weight_scaleinput_scale)sumr   logical_widthsr   output_size_per_partitionrL   r   r   float8_e4m3fnr@   emptyregister_parameterrA   re   float32finfomin)rR   rl   r   r   r   r   r   extra_weight_attrsr   r   weight_dtyper   r   scales                 rT   create_weightsz&ModelOptFp8LinearMethod.create_weights  s    $'(>$?$?!*..??5)A&*C'  =E 	
 &)+C<   '
 
 
 	  62229 	;2[%;!<!<EMRRR+  L $k%-88<LO$$^\BBB+[%;!<!<EMRRR+  E
 {5=115E!!!H$$]E:::::	; 	;rU   c                    |j         }|j                                        }|j        |j        d         k                                    s#t	          |j         |j        |j                  \  }}t          |                                d          |_         t          |d          |_        t          |j                                        d          |_        d S )Nr   Frequires_grad)	r   r   maxallr=   r   r   tr   )rR   rl   r   max_w_scales       rT   process_weights_after_loadingz5ModelOptFp8LinearMethod.process_weights_after_loading  s    (,,.."e&8&;;@@BB 	";e0%2F# #K !5AAA&{%HHH%e&7&;&;&=&=USSSrU   xbiasc                 :    | j                             |||          S rO   r   apply_weightsrR   rl   r   r   s       rT   applyzModelOptFp8LinearMethod.apply        ,,UAt<<<rU   rO   rV   rW   rX   rY   r   rQ   r   r   r   r   r   r   r   r   Tensorr   r   rU   rT   r   r     s       	 	
%6 
4 
 
 
 
.;x.; #&.; !%S		.;
 .; .; k.; .; .; .;`	T6 	Td 	T 	T 	T 	T %)	= =x= <= lT!	=
 
= = = = = =rU   r   c                       e Zd ZdZdeddfdZdej        j        de	de
e	         d	e	d
e	dej        fdZdeddfdZ	 ddej        j        dej        dej        dz  dej        fdZdS )r   a#  Linear method for ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoints.

    Expected checkpoint structure (per Linear):
    - weight: fp8-e4m3fn, shape [out, in]
    - weight_scale: fp32, shape [out] (per-output-channel)
    - no input_scale (activations are dynamically quantized per-token)
    rL   rc   Nc                     || _         t          t          t          t	          j                    | j        j                  | _        d S r   )	rL   r%   r6   r8   r   r   rS   rV   r   r   s     rT   rQ   z$ModelOptFp8PcPtLinearMethod.__init__  s>    (0!4/-///	
 
 
rU   rl   r   r   r   r   r   c                    ~~| j         j        st          d          t          |          }|                    d          }	||_        ||_        ||_        t          t          j
        ||t          j                  dd|	          }
|                    d|
           t          t          j
        |t          j                  d|	          }t          j        t          j                  j        |d d <   |                    d	|           d S )
NzMFP8_PER_CHANNEL_PER_TOKEN currently only supports FP8-serialized checkpoints.r   r   r   r   r   r   )r   r   r   r   )rL   r   r   r   r   r   r   r   r@   r   r   r   r   r?   r   r   r   )rR   rl   r   r   r   r   r   r   r   r   r   r   s               rT   r   z*ModelOptFp8PcPtLinearMethod.create_weights  s*     = 	.  
 %((>$?$?!*..??5)A&*C'%)()  
 '	
 	
 	
 	  622216emLLL'
 
 

  +em448QQQ  >>>>>rU   c                     t          |j                                        d          |_        t          |j        j        d          |_        d S )NFr   )r   r   r   r   r   rR   rl   s     rT   r   z9ModelOptFp8PcPtLinearMethod.process_weights_after_loadingG  sC     !1!1GGG&u'9'>eTTTrU   r   r   c                 :    | j                             |||          S rO   r   r   s       rT   r   z!ModelOptFp8PcPtLinearMethod.applyK  r   rU   rO   r   r   rU   rT   r   r   	  s        
%6 
4 
 
 
 
*?x*? #&*? !%S		*?
 *? *? k*? *? *? *?XU6 Ud U U U U %)	= =x= <= lT!	=
 
= = = = = =rU   r   c                       e Zd ZU dZdZeeef         ed<   deddfdZ	de
j        j        d	ed
ee         dedede
j        fdZdeddfdZ	 dde
j        j        de
j        de
j        dz  de
j        fdZdS )r   a  Linear method for ModelOpt FP8_PB_WO checkpoints.

    ModelOpt exports `weight_scale` as a 4D tensor:
      [out_blk, 1, in_blk, 1]
    where block size is typically 128 for both dims.

    vLLM executes it as FP8 GEMM with *dynamic per-token* activation quant.
    )   r	  _WEIGHT_BLOCK_SIZErL   rc   Nc                     || _         | j        \  }}t          | j                  | _        t	          t          ||          t          d|          t                      d          | _        d S )Nr   F)weight_group_shapeact_quant_group_shaper<   use_aiter_and_is_supported)rL   r
  r   weight_block_sizer,   r3   r<   w8a8_block_fp8_linear)rR   rL   block_nblock_ks       rT   rQ   z$ModelOptFp8PbWoLinearMethod.__init__`  sm    (2!%d&=!>!>%9)'7;;",Q"8"8(C(E(E',	&
 &
 &
"""rU   rl   r   r   r   r   r   c                    ~~| j         j        st          d          t          |          }|                    d          }	||_        ||_        ||_        | j        |_        t          t          j        ||t          j                  dd|	          }
|                    d|
           | j        \  }}||z  dk    rt          d| d	| d
          ||z  dk    rt          d| d	| d
          ||z  }||z  }t          t          j        |d|dft          j                  dd|	          }t          j        t          j                  j        |d d <   |                    d|           d S )Nz=FP8_PB_WO currently only supports FP8-serialized checkpoints.r   r   r   r   r   r   z6ModelOpt FP8_PB_WO requires out_features divisible by z, got r   z5ModelOpt FP8_PB_WO requires in_features divisible by r{   r   )rL   r   r   r   r   r   r   r   r  r@   r   r   r   r   r
  r>   r   r   r   )rR   rl   r   r   r   r   r   r   r   r   r   r  r  out_blksin_blksr   s                   rT   r   z*ModelOptFp8PbWoLinearMethod.create_weightsk  s     = 	O   %((>$?$?!*..??5)A&*C' #'"8%)()  
 '	
 	
 	
 	  62222$w.!33?? ?";? ? ?   $g-22>> >":> > >  
 -7*g5 0h7A6emLLL'	
 
 
  +em448QQQ  >>>>>rU   c                    t          |j        j        d          |_        |j        }|                                dk    r)|                    d                              d          }n=|                                dk    r%t          dt          |j                   d          t          |	                                d          |_        d S )	NFr      r   r|   r{   z2Unexpected ModelOpt FP8_PB_WO weight_scale shape: r   )
r   r   r   r   dimsqueezer   tupleshape
contiguous)rR   rl   r   s      rT   r   z9ModelOptFp8PbWoLinearMethod.process_weights_after_loading  s     !2%HHH"99;;!MM!$$,,R00EEYY[[A)%%) ) )  
 'u'7'7'9'9OOOrU   r   r   c                 T    | j                             ||j        |j        d |          S )N)inputr   r   r   r   )r  r   r   r   r   s       rT   r   z!ModelOptFp8PbWoLinearMethod.apply  s8     )//<+ 0 
 
 	
rU   rO   )rV   rW   rX   rY   r
  r  r   r   r   rQ   r   r   r   r   r   r   r   r  r   r   rU   rT   r   r   T  s2          +5c3h444	
%6 	
4 	
 	
 	
 	
??x?? #&?? !%S		??
 ?? ?? k?? ?? ?? ??BP6 Pd P P P P( %)	
 
x
 <
 lT!	

 

 
 
 
 
 
rU   r   c                       e Zd ZdZdededdf fdZedej	        dz  fd            Z
	 d"deej        ej        ej        f         dz  dej        dz  f fd	Zd
ej        dej        j        dej        fdZdej        j        dedededej	        f
dZdej        j        dej        dej        dej        dej        dej        dej        fdZdej        j        ddfdZdej        j        dedz  fdZedefd            Zdedej        dej        dej        eej        ej        f         z  fdZdedej        dej        d ej        dej        eej        ej        f         z  f
d!Z xZS )#ModelOptFp8MoEMethodzMoE method for ModelOpt FP8.
    Supports loading FP8 checkpoints with static weight scale and
    activation scale.
    Args:
        quant_config: The ModelOpt quantization config.
    rL   rs   rc   Nc                     t                                          |           || _        | j        j        sJ t	          | j        t          t                    \  | _        | _        d | _	        d S N)r   
weight_keyactivation_key)
rP   rQ   rL   r   r   moer7   fp8_backendexperts_clskernelrR   rL   rs   rS   s      rT   rQ   zModelOptFp8MoEMethod.__init__  sq    
 	$$$( ==== .D8*..
 .
 .
*$* 8<rU   c                 P    | j         | j         j                                        S d S rO   r(  prepare_finalizetopk_indices_dtyper   s    rT   r-  z'ModelOptFp8MoEMethod.topk_indices_dtype  &    ;";/BBDDDtrU   routing_tablesc                 >   | j         t          j        k    rd S | j         t          j        k    rP| j        j        j        sd S t          | j        d          }t          	                    d|j
        j                   |S t                                          |          S )NF)use_deepseek_fp8_block_scale%s)r&  r   FLASHINFER_TRTLLMFLASHINFER_CUTLASSr%  moe_parallel_configuse_all2all_kernelsr+   r   
debug_oncerS   rV   rP   maybe_make_prepare_finalizerR   r/  r,  rS   s      rT   r8  z0ModelOptFp8MoEMethod.maybe_make_prepare_finalize  s    
 }>>>4!AAA8/C tP-2      d$4$>$GHHH##ww22>BBBrU   r,  rl   c                 j    | j         J | j        J t          | j        | j         | j        |          S N)rs   rL   r'  r,  )moe_quant_configr'  r   r%  rR   r,  rl   s      rT   select_gemm_implz%ModelOptFp8MoEMethod.select_gemm_impl   sM    
 $000+++*x.(-	
 
 
 	
rU   num_expertshidden_sizeintermediate_size_per_partitionr   c                    ||_         ||_        | j        j        rt          j        n|}|                    d          }| j        j        rdnd}	t          t	          j
        ||	|z  ||          dd|          }
|                    d|
           t          t	          j
        ||||          dd|          }|                    d|           t          t	          j        ||	fdt          j                  |	          }t          t	          j        |fdt          j                  |	          }|                    d
|           |                    d|           t          t	          j        |fdt          j                  |	          }t          t	          j        |fdt          j                  |	          }|                    d|           |                    d|           d S )Nr   r{   r   r   r   
w13_weight	w2_weightg      ?r   w13_weight_scalew2_weight_scalew13_input_scalew2_input_scale)
orig_dtyper?  rL   r   r   r   r   r%  is_act_and_mulr@   r   r   rA   fullr   )rR   rl   r?  r@  rA  r   r   r   r   w13_num_shardsrC  rD  rE  rF  rG  rH  s                   rT   r   z#ModelOptFp8MoEMethod.create_weights  sJ    ('
  =E 	
 +..??"h5<1)!@@"	   '

 

 


 	  z:::(/"	   '

 

 

	 	  i888 3n-m  
 (
 
 
 2[NCu}EEE'
 
 
 	  !35EFFF  !2ODDD 2[NCu}EEE'
 
 
 1[NCu}EEE'
 
 
 	  !2ODDD  !1>BBBBBrU   w13w2	w13_scalew2_scalerG  rH  c           
         t          | j        |||||||          \  }}}}t          |d|           t          |d|           t          |d|           t          |d|           |                     |          | _        | j        r?| j        J t          | j        | j        | j        | j                  \  | _        | _	        d S d S )N)r&  rl   rM  rN  rO  rP  rG  rH  rC  rD  rE  rF  )r<  rs   r&  r'  )
r   r&  rB   get_fused_moe_quant_configr<  r'  r   r%  r(  use_inplace)rR   rl   rM  rN  rO  rP  rG  rH  s           rT   _setup_kernelz"ModelOptFp8MoEMethod._setup_kernel]  s     (H(+)	(
 	(
 	(
$RH 	%s333%b111%!3Y???%!2H=== !% ? ? F F  	#///,?!%!68 , ,	- - -)DK)))	 	rU   c           	      j   |j         }|j        }|j        }|j        }|j        }|j        }t          ||          \  }}t          |d|           t          |d|           |j        }t          ||||j         j
        d         | j        j                  \  }}|                     |||||||           d S )NrG  rH  r   )r?  rJ  )rC  rD  rE  rF  rG  rH  r-   rB   rA  r.   r  r%  rJ  rT  )	rR   rl   rM  rN  rO  rP  rG  rH  
shard_sizes	            rT   r   z2ModelOptFp8MoEMethod.process_weights_after_loading  s    _*	(/- +P^+
 +
' 	%!2ODDD%!1>BBB :
?(.q182
 
 
Y 	3Ix.	
 	
 	
 	
 	
rU   c                 l    |j         }|j        }|j        }|j        }t	          | j        ||||          S )N)r&  w1_scalerP  a1_scalea2_scale)rE  rF  rG  rH  r   r&  )rR   rl   rX  rP  rY  rZ  s         rT   rR  z/ModelOptFp8MoEMethod.get_fused_moe_quant_config  sM     )(('((
 
 
 	
rU   c                 ,    | j         t          j        k    S rO   )r&  r   r3  r   s    rT   is_monolithicz"ModelOptFp8MoEMethod.is_monolithic  s    =#BBBrU   r   router_logitsc                 &   | j         sJ | j        t          j        k    sJ |j        rt          d          |j        dk    sJ d|j                     |j        rJ t          ||||j	        |j
        |j        |j        |j        |j        	  	        S )Nz9EPLB not supported for FlashInfer TRTLLM FP8 MoE Backend.siluz#Expected 'silu' activation but got )	rl   hidden_statesr]  routing_biasglobal_num_expertstop_knum_expert_group
topk_groupapply_router_weight_on_input)r\  r&  r   r3  enable_eplbr   
activationrenormalizer*   e_score_correction_biasrb  rc  rd  re  rf  rR   rl   r   r]  s       rT   apply_monolithicz%ModelOptFp8MoEMethod.apply_monolithic  s     !!!!=#BBBBB 	%K  
 6)))D%2BDD *)) $$$$1'6$7+"3').)K

 

 

 
	
rU   topk_weightstopk_idsc                 
   | j         rJ | j        t          j        k    r|j        dv sJ d|j                     | j        J |                     ||j        |j        ||| j        |j        |j	        |j
        |j        
  
        S )N)r_  relu2_no_mulz>Expected activation to be in ('silu', 'relu2_no_mul'),but got inplacerh  rb  
expert_maprf  )r\  r&  r   r4  rh  r(  rC  rD  rS  rb  rs  rf  rR   rl   r   rm  rn  s        rT   r   zModelOptFp8MoEMethod.apply  s     %%%% }???#'????. +. . @??
 {&&&{{O$'$7').)K  
 
 	
rU   rO   ) rV   rW   rX   rY   r   r   rQ   propertyr   r   r-  r  r  mkFusedMoEPrepareAndFinalizer8  r   r   FusedMoEPermuteExpertsUnpermuter>  r   r   rT  r   r   rR  r   r\  r   rl  r   rZ   r[   s   @rT   r   r     s&        <'< #< 
	< < < < < <& EK$$6    X RVC CelEL%,FG$NC 
	&	-C C C C C C(
7
 x
 
	+	
 
 
 
MCxMC MC 	MC
 *-MC kMC MC MC MC^%x% \% L	%
 <% ,% % % % % %N
58? 
t 
 
 
 
>
X_
	t	#
 
 
 
  Ct C C C XC

 <
 |	

 
elEL89	9
 
 
 
<

 <
 l	

 ,
 
elEL89	9
 
 
 
 
 
 
 
rU   r   c                       e Zd ZdZ	 ddededz  dee         deddf
 fd	Zde	fd
Z
deej                 fdZedefd            Zede	dz  fd            Zedededz  dee         deeef         dedz  dedd fd            Z xZS )ModelOptNvFp4ConfigzConfig class for ModelOpt FP4.   is_checkpoint_nvfp4_serializedr   Nr`   r   rc   c                     t                                          |           || _        |r*t                              d           || _        || _        d S d S )NzkDetected ModelOpt NVFP4 checkpoint. Please note that the format is experimental and could change in future.)rP   rQ   r|  r   r   r   r   )rR   r|  r   r`   r   rS   s        rT   rQ   zModelOptNvFp4Config.__init__  sj     	))).L+) 	;NNJ  
 )DO':D$$$	; 	;rU   c                     dS )Nmodelopt_fp4r   r   s    rT   r   zModelOptNvFp4Config.get_name  s    ~rU   c                 H    t           j        t           j        t           j        gS rO   )r   r   r   r   r   s    rT   r   z,ModelOptNvFp4Config.get_supported_act_dtypes  s    
E,?@@rU   c                     dS )NK   r   r   s    rT   r   z&ModelOptNvFp4Config.get_min_capability  r   rU   c                 p   |dS |                     dd                                          }|dk    rdS d|v r:|d         }t          |t                    r|                     dd          }d|v rdS nC|                     dd          }t          |t                    rd	|                                v rdS dS )
zWDetect if this ModelOpt FP4 config should be used based on
        quantization config.Nrw   rq   r   r   r   rI   r  FP4r   r   s         rT   r   z0ModelOptNvFp4Config.override_quantization_method  s     4 $'';;AACC :%%4 \))'7L,-- *)--lB??
j(()> &)),;;J*c** &u
8H8H8J8J/J/J%~trU   rw   r   r   r   c                   
 d|v }|d}|r2d|v r.|d         
g d}
fd|D             }	|	rt          d|	            | ||||          S )NrI   r{  r   )r   r   r`   c                     g | ]}|v|	S r   r   ).0fieldrL   s     rT   
<listcomp>z4ModelOptNvFp4Config._from_config.<locals>.<listcomp>N  s*       e<6O6O6O6O6OrU   zJNVFP4 quantization requires the following fields in hf_quant_config.json: )r   )r   rw   r   r`   r   r   r   r|  required_fieldsmissing_fieldsrL   s             @rT   r   z ModelOptNvFp4Config._from_config9  s     *1L)@&J * 	n.O.O*>:LVVVO   #2  N   >-;> >  
 s*!	
 
 	
rU   )r{  )rV   rW   rX   rY   r   r   r   r   rQ   r"   r   r   r   r   r   r   r   r   r   r   rZ   r[   s   @rT   rz  rz    s       (( ; ;(,; !4Z; c	;
 ; 
; ; ; ; ; ;$-    A$u{*; A A A A 3    [ 	t	#   [> "
 "
  #Tz	"

 c"
 c3h"
 $J"
 "
 
"
 "
 "
 ["
 "
 "
 "
 "
rU   rz  c                       e Zd ZdZdeddfdZdej        j        de	de
e	         d	e	d
e	dej        fdZdeddfdZ	 ddej        j        dej        dej        dz  dej        fdZdS )ModelOptNvFp4LinearMethoda{  Linear method for Model Optimizer NVFP4.
    Supports loading NVFP4 checkpoints with the following structure:

    input_scale: torch.float32, scalar ,
    weight: NVFP4(represented as byte) Shape: [1, X, y/2]
    weight_scale: FP8-E4M3, Shape: [X, Y], aka per block scale,
    weight_scale_2: torch.float32, scalar,
    Args: quant_config: The ModelOpt quantization config.
    rL   rc   Nc                    || _         d | _        d| _        t          j        Bt                      rd| _        nt                      rd| _        nt                      rd| _        nt          j                            d          r2t          j        | _        t                      sJ d| j                     not          j        dk    r(d| _        t                      sJ d| j                     n7t          j        dk    r'd| _        t                      sJ d| j                     | j        dk    rt          d	          t                              d
| j         d           d S )Nnonezflashinfer-cutlasscutlassrr   flashinfer-zFlashInfer is required for zCutlass is required for zMarlin is required for zINo valid NVFP4 GEMM backend found. Please check your platform capability.zUsing z for NVFP4 GEMM)rL   rv   rp   envsVLLM_NVFP4_GEMM_BACKENDrD   r4   r1   rg   r   r   	info_oncer   s     rT   rQ   z"ModelOptNvFp4LinearMethod.__init__j  sw   ("&'/ (3&(( (((** (')44]CC 	W7DL!##QQ%Q4<%Q%QQQ#Q)Y66$DL(**UU,Ut|,U,UUU*U)X55#DL*,,VV.V.V.VVV,<6!!9  
 	?$,???@@@@@rU   rl   r   r   r   r   r   c                    ~~| j         j        st          d          t          |          }|                    d          }	||_        ||_        ||_        |dz  dk    rt          d          | j         j        rt          j	        n|}
t          t          j        |j        |j        dz  t          j                  dd|		          }|                    d
|           t          t          j        t          |          t          j                  |	          }|                    d|           t          t          j        t          |          t          j                  |	          }|                    d|           t          t          j        ||| j         j        z  |
          dd|		          }|                    d|           d S )NzHNVFP4 quantization was selected,  dynamic quantization is not supported.r   r{  r   z=Unsupported model when in features size is not multiple of 16r{   r   r   r   r   r   r   weight_scale_2r   )rL   r|  r   r   r   r   r   r   r   r   r@   r   uint8r   rA   re   r   r   )rR   rl   r   r   r   r   r   r   r   r   r   r   r   r  r   s                  rT   r   z(ModelOptNvFp4LinearMethod.create_weights  s     ? 	:   %((>$?$?!*..??5)A&*C'#b(A--O    ?E 	 &/.!3k	   '

 

 

 	  6222 .S!788NNN'
 
 
 	  <<< 1S!788NNN'
 
 
 	  !1>BBB ,)(D,=,HH"  
 '	
 	
 	
 	  >>>>>rU   c                    |j                                                             t          j                  }t          |d          |_         |j                                                            t          j                  }t          |d          |_        t          |j         |j        z  d          |_        t          d|j         z                      t          j                  d          |_        |j	        j
        t          j        k    s
J d            | j        dk    rt          |           |`|` d S | j        dk    rddlm}m} |j        j        }|j	        j        }d	} ||                    t          j                  |          } ||                    t          j                  |                              |j                                      t          j                  }t          |d          |_	        t          |d          |_        d S t/          |j	                  }	t          |	d          |_	        t          |j        j        d          |_        d S )
NFr   r   z2Weight Block scale must be represented as FP8-E4M3rr   zflashinfer-trtllmr   )shuffle_matrix_ashuffle_matrix_sf_ar	  )r   r   tor   r   r   r  alphainput_scale_invr   r   r   rp   r2   
flashinferr  r  r   r   viewr  reshaper  r;   )
rR   rl   input_scale_2r  r  r  r   r   epilogue_tile_mswizzled_weight_scales
             rT   r   z7ModelOptNvFp4LinearMethod.process_weights_after_loading  sA   )--//225=AA%m5III-113366u}EE(uMMM 44E
 
 

 !*""&&u}55U!
 !
 !
 !'5+>>>>@ ?>> <8##(///!!!\000
 IHHHHHHH\&F -2L!O%%fkk%+&>&>PPF##L$5$5ek$B$BOTT+,,e)**  "+<u!M!M!ME$V5AAAELLL$6u7I$J$J!!*+@PU!V!V!VE$U\%6eLLLELLLrU   r   r   c                     | j         dk    r;t          ||j        |j        |j        |j        |j        |j        || j        	  	        S |j	        }|j
        d         |j        j
        d         g}t          ||j        d| j                   \  }}|j	        t          j        k    sJ |j        j	        t          j        k    sJ |j	        t          j        k    sJ |j        j	        t          j        k    sJ |j        j	        t          j        k    sJ ||j        ||j        |j        |f}| j                             d          r)| j         t'          d          d          }	t)          |d|	i}
n| j         dk    sJ t+          | }
||
|z   }
 |
j        | S )	Nrr   )	r  r   r   r  	workspacesize_nsize_kr   input_dtyper   T)is_sf_swizzled_layoutrp   r  rp   r  )rp   r0   r   r   r  r  r   r   rv   r   r  r
   r  r   r  r   r  r   rg   re   rC   r	   r  )rR   rl   r   r   output_dtypeoutput_shapex_fp4x_blockscalemm_argsbackend_nameouts              rT   r   zModelOptNvFp4LinearMethod.apply  s    <8##*|"/$3/65 3
 
 
 
 w
EL$6q$9: /u$D$,
 
 
| {ek))))|!U[0000!U%88888!'5+>>>>>{ EM1111 LK
 <""=11 	2<M(:(:(<(<=L*GJ\JJCC<9,,,,'1C*Csx&&rU   rO   )rV   rW   rX   rY   rz  rQ   r   r   r   r   r   r   r   r   r  r   r   rU   rT   r  r  _  s&        A%8 AT A A A A<H?xH? #&H? !%S		H?
 H? H? kH? H? H? H?T3M6 3Md 3M 3M 3M 3Mr %)	4' 4'x4' <4' lT!	4'
 
4' 4' 4' 4' 4' 4'rU   r  c                       e Zd ZdZdededdf fdZedej	        dz  fd            Z
	 d deej        ej        ej        f         dz  dej        dz  f fd	Zd
ej        dej        j        dej        fdZdefdZdej        j        dedededej	        f
dZdej        j        ddfdZed             Zdedej        dej        deej        eej                 f         fdZdej        j        dedz  fdZedefd            Zedefd            Z dedej        dej        dej        eej        ej        f         z  fdZ!dedej        dej        dej        dej        eej        ej        f         z  f
dZ" xZ#S )!ModelOptNvFp4FusedMoEz]
    MoE Method for FP4 Quantization.
    Args:
        quant_config: NVFP4 Quant Config
    rL   rs   rc   Nc                     t                                          |           || _        t          | j        t
          t                    \  | _        | _        d | _	        t          | j                  | _        d S r"  )rP   rQ   rL   r   r%  r:   r9   nvfp4_backendr'  r(  r   use_global_sfr)  s      rT   rQ   zModelOptNvFp4FusedMoE.__init__E  sv    
 	$$$(/G8#(0
 0
 0
,D, 8<E
 
rU   c                 P    | j         | j         j                                        S d S rO   r+  r   s    rT   r-  z(ModelOptNvFp4FusedMoE.topk_indices_dtypeZ  r.  rU   r/  c                 :   | j         t          j        k    rd S | j         t          j        k    rN| j        j        j        sd S t          | j                  }t          	                    d|j
        j                   |S t                                          |          S )Nr2  )r  r   r3  r4  r%  r5  r6  r'   r   r7  rS   rV   rP   r8  r9  s      rT   r8  z1ModelOptNvFp4FusedMoE.maybe_make_prepare_finalize`  s     !BBB4?#EEE8/C tP    d$4$>$GHHH##7766~FFFrU   r,  rl   c                 j    | j         J | j        J t          | j        | j         | j        |          S r;  )r<  r'  r   r%  r=  s      rT   r>  z&ModelOptNvFp4FusedMoE.select_gemm_impls  sM    
 $000+++,x.(-	
 
 
 	
rU   c                     dS )zY
        FP4 variants use 'weight_scale_2' pattern for per-tensor weight scales.
        Tr   r   s    rT   uses_weight_scale_2_patternz1ModelOptNvFp4FusedMoE.uses_weight_scale_2_pattern  s	     trU   r?  r@  rA  r   c                    | j         j        sJ ||_        ||_        | j         |_         t          j        }t          j        }|                    d          }	|                    d          }
| j        j	        rdnd}t          t	          j        |||z  |dz  |          dd|	          }|                    d|           t          t	          j        |||dz  |          dd|	          }|                    d|           t          t	          j        |||z  || j         j        z  |          dd|	          }|                    d	|           t          t	          j        |||| j         j        z  |          dd|	          }|                    d
|           |                    dt          j        j        i           t%          t	          j        ||t          j                  |	          }|                    d|           t%          t	          j        |t          j                  |	          }|                    d|           |                    dt          j        j        i           | j        r|
n|}t%          t	          j        ||t          j                  |	          }|                    d|           t%          t	          j        |t          j                  |	          }|                    d|           d S )Nr   rb  r{   r   r   r   rC  rD  rE  rF  rw   r   w13_weight_scale_2w2_weight_scale_2rG  rH  )rL   r|  r?  r   r   r  r   r   r%  rJ  r@   r   r   r   updater   BLOCKvaluerA   r   TENSORr  )rR   rl   r?  r@  rA  r   r   r   weight_scale_dtyper   rb  rL  rC  rD  rE  rF  r  r  global_sf_num_expertsrG  rH  s                        rT   r   z$ModelOptNvFp4FusedMoE.create_weights  s     ????')!.{"0*..??/334HII"h5<1)!@@q "   '
 
 

 	  z::: )/14"   '
 
 
	 	  i888/!@@t0;;(   '
 
 
 	  !35EFFF./43D3OO(   '
 
 
 	  !2ODDD!!9?EF	
 	
 	
 5[.NNN'
 
 
 	  !57IJJJ3[>>>'
 
 
 	  !46GHHH!!9@FG	
 	
 	

 #'"4E+ 	 2%m  
 (
 
 
 	  !2ODDD02%-HHH'
 
 
 	  !1>BBBBBrU   c                    | j         j        rMt          j        |j        dddf         |j        dddf                   st
                              d           |j        dddf                                         }t          | j	        ||j
        |j        ||j        |j        |j        |j        |j        | j         j                  \  }}}}}}}	}
t#          |d|           t#          |d|           t#          |d|           t#          |d	|           t#          |d
|           t#          |d|           t#          |d|	           t#          |d|
           |                     |          | _        | j        rS| j         j        j        r| j         j        j        r3| j        J t1          | j        | j         | j                  | _        dS dS dS )zT
        Convert NVFP4 MoE weights into kernel format and setup the kernel.
        Nr   r   zIw1_weight_scale_2 must match w3_weight_scale_2. Accuracy may be affected.)r  rl   rM  rO  w13_scale_2	a13_scalerN  rP  
w2_scale_2rZ  rJ  rC  rE  r  rG  rD  rF  r  rH  )r<  rs   r'  )r%  rJ  r   allcloser  r   warning_oncer  r   r  rC  rE  rG  rD  rF  r  rH  rB   rR  r<  r5  r6  use_naive_all2all_kernelsr'  r   r(  )rR   rl   r  rM  rO  r  r  rN  rP  r  rZ  s              rT   r   z3ModelOptNvFp4FusedMoE.process_weights_after_loading  s%    8" 	5>$QQQT*E,DQQQT,J,
 ,
 	 ,   #5aaad;FFHH /, ,*+*.)82
 
 
		
 	%s333%!3Y???%!5{CCC%!2I>>>%b111%!2H===%!4jAAA%!18<<< !% ? ? F F  		-A		x+E		 #////!%!68 ,  DKKK		 		 		 		rU   c                 ,    | j         t          j        k    S rO   )r  r   r3  r   s    rT   do_post_quant_allgatherz-ModelOptNvFp4FusedMoE.do_post_quant_allgather:  s    !_%FFFrU   r`  r]  c                     | j         t          j        k    rt          d          ddl}|                    ||j        d          \  }}|g}||fS )zBOptionally prepare extra tensors to carry through DP allgather/EP.zVprepare_dp_allgather_tensor is only supported for FlashInfer TRTLLM NVFP4 MoE backend.r   NF)r  )r  r   r3  RuntimeErrorr  fp4_quantize	a1_gscale)rR   rl   r`  r]  r  hidden_states_fp4hidden_states_sfextra_tensorss           rT   prepare_dp_allgather_tensorz1ModelOptNvFp4FusedMoE.prepare_dp_allgather_tensor>  s|     !BBB7  
 	.8.E.EO"' /F /
 /
++
 .>,> -//rU   c           	      t    t          | j        |j        |j        |j        |j        |j        |j                  S )N)rp   rO  rP  r  r  r  rZ  )r   r  rE  rF  r  r  rG  rH  r  s     rT   rR  z0ModelOptNvFp4FusedMoE.get_fused_moe_quant_configU  sE     +&,*0.+)
 
 
 	
rU   c                     dS )NTr   r   s    rT   supports_eplbz#ModelOptNvFp4FusedMoE.supports_eplbb  s    trU   c                 P    | j         t          j        k    o| j        j        j         S rO   )r  r   r3  r%  r5  rg  r   s    rT   r\  z#ModelOptNvFp4FusedMoE.is_monolithicf  s*     /"CC =H0<<	
rU   r   c                     | j         sJ | j        t          j        k    r|j        rJ t          ||||j        |j        |j        |j	        |j
        |j        |j        
  
        S )N)
rl   r   r]  rc  rh  rb  rd  re  custom_routing_functionrj  )r\  r  r   r3  rg  r(   rc  rh  rb  rd  re  r  rj  rk  s       rT   rl  z&ModelOptNvFp4FusedMoE.apply_monolithicm  s     !!!!/"CCC% DC & )'+'$7"3'$)$A$)$A
 
 
 	
rU   rm  rn  c                 &   | j         rJ | j        t          j        k    r.|j        sJ t          |||||j        |j        |j                  S | j	        J | 	                    ||j
        |j        ||d|j        |j        |j        |j        
  
        S )N)rl   r   rn  rm  rc  rh  rb  Frq  )r\  r  r   r3  rg  r)   rc  rh  rb  r(  rC  rD  rs  rf  rt  s        rT   r   zModelOptNvFp4FusedMoE.apply  s     %%%% !BBB$$$$3!)k +#(#;    ;***;;  +#(#; +-2-O    rU   rO   )$rV   rW   rX   rY   rz  r   rQ   ru  r   r   r-  r  r  rv  rw  r8  r   r   rx  r>  r   r  r   r   r   r  r   r   r  r   rR  r  r\  rl  r   rZ   r[   s   @rT   r  r  >  sj        
)
 #
 
	
 
 
 
 
 
* EK$$6    X RVG GelEL%,FG$NG 
	&	-G G G G G G&
7
 x
 
	+	
 
 
 
T    rCxrC rC 	rC
 *-rC krC rC rC rCh=58? =t = = = =~ G G XG00 |0 |	0
 
u|T%,//	00 0 0 0.
X_
	t	#
 
 
 
 t    X 
t 
 
 
 X


 <
 |	

 
elEL89	9
 
 
 
2"" <" l	"
 ," 
elEL89	9" " " " " " " "rU   r  )wr   typingr   r   r   r   torch.nnr   torch.nn.parameterr   	vllm.envsr  3vllm.model_executor.layers.fused_moe.modular_kernelmodel_executorlayers	fused_moemodular_kernelrv  vllm._custom_opsr	   r
   vllm.attention.layerr   vllm.loggerr   +vllm.model_executor.layers.fused_moe.configr   r   *vllm.model_executor.layers.fused_moe.layerr   r   r   /vllm.model_executor.layers.fused_moe.oracle.fp8r   r   r   r   r   r   1vllm.model_executor.layers.fused_moe.oracle.nvfp4r   r   r   r   r   r   r   !vllm.model_executor.layers.linearr   r    r!   'vllm.model_executor.layers.quantizationr"   3vllm.model_executor.layers.quantization.base_configr#   r$   9vllm.model_executor.layers.quantization.kernels.scaled_mmr%   0vllm.model_executor.layers.quantization.kv_cacher&   @vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moer'   r(   r)   >vllm.model_executor.layers.quantization.utils.flashinfer_utilsr*   r+   7vllm.model_executor.layers.quantization.utils.fp8_utilsr,   r-   r.   :vllm.model_executor.layers.quantization.utils.marlin_utilsr/   >vllm.model_executor.layers.quantization.utils.marlin_utils_fp4r0   r1   r2   9vllm.model_executor.layers.quantization.utils.quant_utilsr3   r4   r5   r6   r7   r8   r9   r:   r;   8vllm.model_executor.layers.quantization.utils.w8a8_utilsr<   r=   vllm.model_executor.parameterr>   r?   r@   rA   vllm.model_executor.utilsrB   vllm.utils.flashinferrC   rD    vllm.model_executor.models.utilsrE   rV   r   r   KV_CACHE_QUANT_ALGOSrK   rM   r   r   r   r   r   r]   r^   r_   rz  r  r  r   rU   rT   <module>r     s         / / / / / / / / / /        ( ( ( ( ( (       @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ D D D D D D D D * * * * * * # # # # # #                
                                         
 H G G G G G             O N N N N N         
                
              

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
                   8 7 7 7 7 7       
  ?>>>>>>	X			 	 	 w ' ' ' ' '0 ' ' 'H
 H
 H
 H
 H
0 H
 H
 H
V^
 ^
 ^
 ^
 ^
/ ^
 ^
 ^
BV= V= V= V= V=. V= V= V=rH= H= H= H= H="2 H= H= H=Vt
 t
 t
 t
 t
"2 t
 t
 t
ng
 g
 g
 g
 g
- g
 g
 g
T	 %<  !&:  #%=  "b
 b
 b
 b
 b
1 b
 b
 b
J\' \' \' \' \' 0 \' \' \'~j j j j j. j j jZ '@  #(=  %'?  $ $ $rU   