
    .`i                     $   d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d d	lmZmZmZ d d
lmZ d dlmZmZmZmZ d dlmZmZmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB  eeC          ZD G d de          ZEdeEfd ZFd!eGdeEfd"ZH G d# d$e)          ZI G d% d&e          ZJ G d' d(eJ          ZKdS ))    )Enum)OptionalN)	Parameter)envs)	Attention)get_current_vllm_config)init_logger)FusedMoEFusedMoEConfigFusedMoEMethodBase)modular_kernel)FusedMoEQuantConfigmxfp4_mxfp8_moe_quant_configmxfp4_w4a16_moe_quant_configocp_mx_moe_quant_config)BatchedMarlinExpertsMarlinExpertsfused_marlin_moe)OAITritonExpertsUnfusedOAITritonExperts)TrtLlmGenExperts)
LinearBaseUnquantizedLinearMethod)QuantizationMethods)QuantizationConfigQuantizeMethodBase)get_marlin_input_dtype) prepare_moe_fp4_layer_for_marlin)_can_support_mxfp4_swizzle_mxfp4get_padding_alignment)is_layer_skipped)set_weight_attrs)current_platform)scalar_types)has_flashinfer)has_triton_kernels)round_up)is_torch_equal_or_newerc                   *    e Zd ZdZdZdZdZdZdZdZ	dS )	Mxfp4Backendr                     N)
__name__
__module____qualname__NONESM100_FI_MXFP4_MXFP8_TRTLLMSM100_FI_MXFP4_MXFP8_CUTLASSSM100_FI_MXFP4_BF16SM90_FI_MXFP4_BF16MARLINTRITON     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/mxfp4.pyr+   r+   A   s;        D #$#$  F FFFr=   r+   returnc                  t   t          j                    st          j        S t	                      o/t          d          o dt          j                    cxk    odk     nc } t          j        du r(| r&t          
                    d           t          j        S t          
                    d           t          j        S )zg
    Not all MXFP4 backends support LoRA. Select backends that are known to
    have LoRA support.
    2.8.0	   r      r   Fz2[get_mxfp4_backend_with_lora] Using Triton backendz2[get_mxfp4_backend_with_lora] Using Marlin backend)r$   is_cudar+   r5   r'   r)   get_device_capabilityr   VLLM_MXFP4_USE_MARLINlogger	info_oncer;   r:   )triton_kernels_supporteds    r>   get_mxfp4_backend_with_lorarL   Q   s    
 #%% !   	 	I#G,,	I
 &<>>HHHHHHHH  !U**/G*MNNN""
IJJJr=   with_lora_supportc                    | rt                      S t          j                    rt          j        d          r@t	                      r2t
          j        r&t                              d           t          j
        S t          j        d          r@t	                      r2t
          j        r&t                              d           t          j        S t          j        d          r&t	                      rt
          j        rt          j        S t          j        d          r4t	                      r&t                              d           t          j        S t          j        d          st          j        d          r(t	                      st                              d           t%                      o/t'          d          o dt          j                    cxk    od	k     nc }t
          j        s|s&t                              d
           t          j        S t                              d           t          j        S t          j                    r&t                              d           t          j        S t          j                    r4t%                      r&t                              d           t          j        S t          j        S )NZ   z,Using FlashInfer MXFP4 BF16 backend for SM90d   z6Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100zUsing FlashInfer MXFP4 BF16 backend for SM100, For faster performance on SM100, consider setting VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, though this may impact accuracy.zMXFP4 MoE is enabled on Hopper/Blackwell but FlashInfer is not available. This may result in degraded performance. Please `pip install vllm[flashinfer]` for best results.rA   rB   rD   zUsing Marlin backendzUsing Triton backendz Using ipex marlin backend on XPU)rL   r$   rF   is_device_capabilityr&   r   "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16rI   rJ   r+   r9   is_device_capability_family+VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASSr7   #VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8r6   r8   warning_oncer'   r)   rG   rH   r:   r;   is_xpuis_rocmr5   )rM   rK   s     r>   get_mxfp4_backendrY   j   s     -*,,,!! ;#1"55$	  $	 7$	
 KLLL228==	  	 @	
 UVVV<<8==	  	 8	
  ;;9#>> 	>CSCS 	    338==	4R88	 !""	 J      M'00M
 *@BBLLLLWLLLL 	! % 	'-E 	'3444&&3444&&		 	"	" #;<<<""		!	#	# #(:(<(< #/000""r=   c                       e Zd Zddee         dz  f fdZed             Zedefd            Z	ede
fd            Zedeej                 fd            Zedee         fd	            Zd
ej        j        deded         fdZ xZS )Mxfp4ConfigNignored_layersc                 V    t                                                       || _        d S N)super__init__r\   )selfr\   	__class__s     r>   r`   zMxfp4Config.__init__   s'    ,r=   c                      |             S r^   r<   )clsconfigs     r>   from_configzMxfp4Config.from_config   s    suur=   r?   c                     dS )NP   r<   rd   s    r>   get_min_capabilityzMxfp4Config.get_min_capability   s    rr=   c                     dS )Nmxfp4r<   ri   s    r>   get_namezMxfp4Config.get_name   s    wr=   c                     t           j        gS r^   )torchbfloat16ri   s    r>   get_supported_act_dtypesz$Mxfp4Config.get_supported_act_dtypes   s    r=   c                     g S r^   r<   ri   s    r>   get_config_filenamesz Mxfp4Config.get_config_filenames   s    	r=   layerprefixr   c                    t          |t                    r[| j        r*t          || j        | j                  rt                      S t                              dd           t                      S t          |t                    rQt          j
                    rt          |j                  S t          |j                  }t          |          |_        |S t          |t                     rt                              dd           d S )N)ru   r\   fused_mappingzPMXFP4 linear layer is not implemented - falling back to UnquantizedLinearMethod.local)scopezOMXFP4 attention layer is not implemented. Skipping quantization for this layer.)
isinstancer   r\   r"   packed_modules_mappingr   rI   
debug_oncer
   r$   rW   IpexMxfp4MoEMethod
moe_configMxfp4MoEMethodr   marlin_input_dtyper   )ra   rt   ru   quant_methods       r>   get_quant_methodzMxfp4Config.get_quant_method   s"    eZ(( 	" 1'7#2"9( ( ( 1
 /000 +    
 +,,,x(( 	&(( $)%*:;;;-e.>??2H2P2P/##y)) 	8    
 tr=   r^   )r2   r3   r4   liststrr`   classmethodrf   intrj   r   rm   ro   dtyperq   rs   nnModuler   r   __classcell__rb   s   @r>   r[   r[      sK       - -tCy4'7 - - - - - -   [ 3    [ ,    [  ek):       [  T#Y    [!X_!.1!	&	'! ! ! ! ! ! ! !r=   r[   c                       e Zd Zdef fdZdej        j        dedededej	        f
dZ
d	 Zdej        j        d
edz  fdZdej        dej        j        d
ej        fdZed
efd            Zed
efd            Zdedej        dej        dej        d
ej        eej        ej        f         z  f
dZdedej        dej        d
ej        eej        ej        f         z  fdZ xZS )r   moec                 "   t                                          |           t          |j                  | _        d | _        t                      j        j        | _	        | j        t          j        k    sJ d|j         d            i | _        d S )Nz$get_mxfp4_backend(with_lora_support=zn) foundno compatible MXFP4 MoE backend (FlashInfer/Marlin/Triton).Please check your environment and try again.)r_   r`   rY   is_lora_enabledmxfp4_backendr   r   compilation_configmax_cudagraph_capture_sizemax_capture_sizer+   r5   _cache_permute_indices)ra   r   rb   s     r>   r`   zMxfp4MoEMethod.__init__   s    .s/BCC"&#%%8S 	 !\%6666;33F ; ; ; 766
 GI###r=   rt   num_expertshidden_sizeintermediate_size_per_partitionparams_dtypec                    || _         t          j        }t          j        }d}	|}
| j        t          j        k    rat          |d          }
t          j                    rt          |d          }nt          |d          }||_	        ||_         ||_
        |
|_        n| j        t          j        k    s| j        t          j        k    r!t          |d          }
t          |d          }n| j        t          j        k    s| j        t          j        k    r!t          |d          }
t          |d          }nRt          j                    r/t#                      }t          ||          }
t          ||          }nt          |d          }
|
| _        || _
        t          j                            t          j        |d|
z  |dz  |          d          }|                    d	|           t/          ||           t          j                            t          j        |d|
z  ||	z  |          d          }|                    d
|           t/          ||           t          j                            t          j        |d|
z  t          j                  d          }|                    d|           t/          ||           t          j                            t          j        |||
dz  |          d          }|                    d|           t/          ||           t          j                            t          j        |||
|	z  |          d          }|                    d|           t/          ||           t          j                            t          j        ||t          j                  d          }|                    d|           t/          ||           d S )N          @   r-   r   Frequires_grad
w13_weightw13_weight_scalew13_bias	w2_weightw2_weight_scalew2_bias)r   ro   uint8r   r+   r:   r(   r$   rW   r   r   r   r6   r8   r7   r9   rX   r!   intermediate_sizer   r   zerosregister_parameterr#   rp   )ra   rt   r   r   r   r   extra_weight_attrsweight_dtypescale_dtypemxfp4_block)intermediate_size_per_partition_after_pad	pad_alignr   r   r   r   r   r   s                     r>   create_weightszMxfp4MoEMethod.create_weights   s[    '{k 4S1!444 9A/9 95  &(( 9&{C88&{C88!-E +E +E9 11 ,"JJJ!\%EEE
 9A/9 95 #;44KK,"KKK!\%DDD8@/9 95 #;44KK%'' 		-//I8@/9 95 #;	::KK8@/9 95 "K&X''K==q "	     ( 
 

 	  z:::%7888 8--K=={*!	     . 
 
 	  !35EFFF)+=>>>8%%K==n  
   & 
 
 	  X666#5666 H&&K9Q>"	     ' 
 
	 	  i888$6777(,,K9[H!	     - 
 
 	  !2ODDD*<===($$Kn  
   % 
 
 	  G444"455555r=   c           	      t&   | j         t          j        k    rt          || j                   d S | j         t          j        k    s| j         t          j        k    	rddlm} ddl	m
} t          t          j        dg| j        z  t          j                                                  d          |_        t          t          j        d	g| j        z  t          j                                                  d          |_        t          t          j        d
g| j        z  t          j                                                  d          |_        d}|j                                        dk    rW|j        j        d         | j        k    r<|j        j        d         | j        dz  k    r|j        j        d         | j        dz  k    sJ |j                                        dk    rW|j        j        d         | j        k    r<|j        j        d         | j        dz  k    r|j        j        d         | j        |z  k    sJ |j                                        dk    rT|j        j        d         | j        k    r9|j        j        d         | j        k    r|j        j        d         | j        dz  k    sJ |j                                        dk    r9|j        j        d         | j        k    r|j        j        d         | j        |z  k    sJ |j                                        dk    r9|j        j        d         | j        k    r|j        j        d         | j        dz  k    sJ |j                                        dk    r6|j        j        d         | j        k    r|j        j        d         | j        k    sJ |j        j        }|j        j        }|j        j        }|j        j        }|j        j                            t          j                  }	|j        j                            t          j                  }
d!d} ||d          } ||d          } ||	d          }	g }g }g }g }g }g }d}tA          | j                  D ]} || j!        ||         "                    t          j#                  |          }|$                    ||         "                    t          j#                  |                    |j%                           &                                            || j!        ||         "                    t          j#                  |d          }|$                     |||         "                    t          j#                  |                    |j%                           &                                                      || j!        |	|         '                                (                    dd          |          }|$                    |	|         '                                (                    dd          |                    |	j%                           &                                            || j!        ||         "                    t          j#                  |          }|$                    ||         "                    t          j#                  |                    |j%                           &                                            || j!        ||         "                    t          j#                  |d          }|$                     |||         "                    t          j#                  |                    |j%                           &                                                      || j!        |
|         '                                (                    dd          |          }|$                    |
|         '                                (                    dd          |                    |
j%                           &                                           t          j)        |          }t          j)        |          (                    | j        d| j        z  | j        |z            "                    t          j*                  }t          j)        |          }t          j)        |          (                    | j        | j        | j        |z            "                    t          j*                  }t          |d          |_        t          |d          |_        t          |d          |_        t          |d          |_        t          t          j)        |          (                    | j        d          d          |_        t          t          j)        |          (                    | j        d          d          |_        d S | j         t          j+        k    s| j         t          j,        k    r_t          t          j        dg| j        z  t          j                                                  d          |_        t          t          j        d	g| j        z  t          j                                                  d          |_        t          t          j        d
g| j        z  t          j                                                  d          |_        d}|j                                        dk    rW|j        j        d         | j        k    r<|j        j        d         | j        dz  k    r|j        j        d         | j        dz  k    sJ |j                                        dk    rW|j        j        d         | j        k    r<|j        j        d         | j        dz  k    r|j        j        d         | j        |z  k    sJ |j                                        dk    rT|j        j        d         | j        k    r9|j        j        d         | j        k    r|j        j        d         | j        dz  k    sJ |j                                        dk    r9|j        j        d         | j        k    r|j        j        d         | j        |z  k    sJ |j                                        dk    r9|j        j        d         | j        k    r|j        j        d         | j        dz  k    sJ |j                                        dk    r6|j        j        d         | j        k    r|j        j        d         | j        k    sJ |j        j        }|d d d d dd d f         |d d dd dd d f         }}t          j-        ||gd          }t          j.        |dd          \  }}t          j-        ||gd          }|j        j                            t          j                  }|d d d d df         |d d dd df         } }t          j-        || gd          }!t          j.        |!dd          \  }"}#t          j-        |#|"gd                              t          j/                  }$|j        j        }%|%d d d d dd d f         |%d d dd dd d f         }'}&t          j-        |&|'gd          }(t          j.        |(dd          \  })}*t          j-        |*|)gd          }+| j         t          j+        k    rddl0m1}, |+j        }- |,|+"                    t          j#                            (                    |-          }.|j        j        }/|/j        }- |,|/"                    t          j#                            (                    |-          }0t          |d          |_        t          |.d          |_        t          |$d          |_        t          |0d          |_        d S | j         t          j,        k    rMd }1|+                    t          j#                  "                    t          j#                  }2 |1|2          }3|j        j        }|                    t          j#                  "                    t          j#                  }4 |1|4          }5t          j2                            t          j-        ||gd          d          |_        t          j2                            |$d          |_        t          j2                            |3d          |_        t          j2                            |5d          |_        d S d S | j         t          j3        k    rEddl4m5}6m6}7 |j                            t          j                  }	|j                            t          j                  }
t          |	d          |_        t          |
d          |_        | j7        j8        p| j7        j9        }8|8rtt          j;        dk    rdnd}9nd}9ty          |j        |j        |9          \  }}:};ty          |j        |j        |9          \  }}<}= |7|; |6|:                    | _=         |7|= |6|<                    | _>        || _        || _        |`|`||_        ||_        d S t          d| j          dt          t                     d           )"N)input_dtyper   )nvfp4_block_scale_interleave)!get_w2_permute_indices_with_cachegZd;?r   Fr   g      ?g      @r   r.   r,   r-   c                 "   | j         }|dk     rt          |          |z   }t          |          }||         dz  ||<   |                    |dz   d            | j        | } |                     |dz             } t          |          } | j        | S )Nr   r-   r,   )shapelenr   insertreshapeflip)xaxisr   	new_shapes       r>   swap_every_two_rowszIMxfp4MoEMethod.process_weights_after_loading.<locals>.swap_every_two_rows  s    !88u::,D !KK	"'+"2	$  1--- AIy)FF4!8$$ KK	 qy),,r=   r      )num_elts_per_sf)dim)block_scale_interleavec                     | j         }|                     |d         |d         |d         dz  d          }|                    dddd          }|                    |d         |d         dz  |d         dz            }|S )Nr   r,   r-   r/   r.   )r   r   permute)ww_shapew_interleaveds      r>   _interleave_mxfp4_cutlass_sm90zTMxfp4MoEMethod.process_weights_after_loading.<locals>._interleave_mxfp4_cutlass_sm90  s    gG$%II
GAJq1% %M %2$9$9!Q1$E$EM$1$9$9
GAJ!OWQZ!^% %M )(r=   )FlexCtxPrecisionConfigi   r/      )rhs_data)weight_scaleflex_ctxzUnsupported mxfp4_backend: z: should be one of: .)r   )Ar   r+   r:   r   r   r6   r8   flashinfer.fp4_quantizationr   flashinfer.fused_moe.corer   r   ro   tensorr   float32cudagemm1_alpha
gemm1_betagemm1_clamp_limitr   r   r   r   r   r   r   r   r   r   datatoranger   viewr   appenddevice
contiguouscloner   stackfloat8_e4m3fnr7   r9   catchunkrp   
flashinferr   r   r;   triton_kernels.matmul_ogsr   r   r   use_pplx_kernelsuse_deepep_ll_kernelsr   VLLM_MOE_DP_CHUNK_SIZEr    w13_precision_configw2_precision_config
ValueErrorr   )>ra   rt   r   r   sf_block_sizer   r   r   r   r   r   r   gemm1_weights_mxfp4_shuffledgemm1_scales_mxfp4_shuffledgemm2_weights_mxfp4_shuffledgemm2_scales_mxfp4_shuffledgemm1_bias_shuffledgemm2_bias_shuffledepilogue_tile_mipermute_indicespermute_sf_indicespermute_bias_indicesw13_wgate_wup_wdeinterleaved_w13_ww1_ww3_ww13_weight_swappedw13_bgate_bup_bdeinterleaved_w13_bb1b3w13_bias_swappedw13_sgate_sup_sdeinterleaved_w13_ss1s3w13_scale_swappedr   
orig_shapew13_scale_interleavedw2_sw2_scale_interleavedr   
w31_scalesw31_scales_interleaved	w2_scalesw2_scales_interleavedr   r   is_batched_moe	num_warpsw13_flex	w13_scalew2_flexw2_scales>                                                                 r>   process_weights_after_loadingz,Mxfp4MoEMethod.process_weights_after_loading  sD   !444,U@WXXXXXX,"JJJ!\%EEEPPPPPPSSSSSS )eWt'77u}MMMRRTT#! ! !E  )cUT%55U]KKKPPRR#     E '0cUT%55U]KKKPPRR#' ' 'E# M  $$&&!++$*1-1AAA$*1-1G!1KKK$*1-1AQ1FFFFG &**,,11*03t7GGG*03t7MPQ7QQQ*03t7G=7XXXXY ##%%**O)!,0@@@O)!,0@@@O)!,0F!0KKKKL %))++q00)/2d6FFF)/2)]:; ; ;; ""$$))N(+t/???N(+t/E/IIIIJ
 !!##q((M'*d.>>>M'*d.>>>>?
  %5:#38O).J,I~*--em<<Hm(++EM::G- - - -   323CRHH,,Z<<J**8R88H
 ,.(*,'+-(*,'"$"$!O4+,, N N"C"C/qM&&u{33## #
 -33qMT%+&&'9'9*:K'L'LNZ\\   &G%F/$Q',,U[99#$&	& & &" ,2200(+ek**.112B2IJJ $    (I'H/QK%%''//A66#( ($
 $**QKUWWWR^^$8$;$;HO$L$LN  Z\\	   #D"C/aL%%ek22## #
 -33aLT%+&&'9'9):J'K'KMZ\\   &G%F/#A&++EK88#$&	& & &" ,2200'*ek**.11/2HII $    #D"C/AJ$$&&..r155## #
 $**AJUWWWR^^O$6$6w~$F$FH  Z\\	    %ABBJ788$..$5 
 e)**  $@AAI788$$*m; 
 e)**   )5IIIE%./?u%U%U%UE"'	GGGEO$-oU$S$S$SE!&/00889I2NN#  EN &/00889I2NN#  EMMM
 ,"KKK!\%DDD )eWt'77u}MMMRRTT#! ! !E  )cUT%55U]KKKPPRR#     E '0cUT%55U]KKKPPRR#' ' 'E#
 M  $$&&!++$*1-1AAA$*1-1G!1KKK$*1-1AQ1FFFFG &**,,11*03t7GGG*03t7MPQ7QQQ*03t7G=7XXXXY ##%%**O)!,0@@@O)!,0@@@O)!,0F!0KKKKL %))++q00)/2d6FFF)/2)]:; ; ;; ""$$))N(+t/???N(+t/E/IIIIJ
 !!##q((M'*d.>>>M'*d.>>>>? $)E CCaC+U111addAAA:->DF"')VTN"B"B"B%8!CCCJD$!&D$<Q!?!?!?N'**5=99E CCaC=%14a4.DF"')VTN"B"B"B[!4aR@@@FB$y"br:::==enMM*/E CCaC+U111addAAA:->DF"')VTN"B"B"B[!4aQ???FB %	2r( : : :!\%NNN======.4
(>(>%**5;77) )'*%% & ,1!Z
'='=IIek**( ('*%% % $--?u#U#U#U )2)* * *& "++;5!Q!Q!Q(1() ) )%%% #|'FFF	) 	) 	) /11%+>>CCEKPP
)G)G
)S)S&"'"7"<+..u{;;@@MM	(F(Fy(Q(Q%#(8#5#5ItTl222% $6 $ $  "'!3!3$E "4 " " */););*% *< * *& ).(:(:) ); ) )%%%; GF@ <#666JJJJJJJJ~((77Hm&&u}55G&xuEEEEN%gUCCCEM "X6X$(:XN !%!<!C!CAA			.< %"8)/ /+J) ,:!6	, ,(Iw )8&(1K1K1K) ) )D% (7%0I0I0I( ( (D$ )DO&DN )E'EOOO;d.@ ; ;%),%7%7; ; ;  r=   r?   Nc                 T   | j         t          j        k    r't          |j        |j        |j        |j                  S | j         t          j        k    r+| j	        }| j
        }t          |j        |j        ||          S | j         t          j        t          j        fv r't          |j        |j        |j        |j                  S | j         t          j        fv r't          |j        |j        |j        |j                  S |j        }|j        }t          d|j        |j        ||          S )N)w1_biasr   w1_scaler  rl   )quant_dtyper   r   r!  r  )r   r+   r:   r   r   r   r   r   r;   r   r   r6   r7   r   r8   r   )ra   rt   r!  r  s       r>   get_fused_moe_quant_configz)Mxfp4MoEMethod.get_fused_moe_quant_config  s]    !444//.	    <#6660H/H/!!	    45$
 
 
 0/.	    L$D#EEE//.	    -H,H*#!!   r=   prepare_finalizec                 4   |j         t          j        j        k    r}| j        t
          j        k    rP|                                }|J | j        J t          ||
                                | j        | j                  S t          d| j         d          | j        J | j        t
          j        k    s| j        t
          j        k    r3|j        |j        |j        | j        d}t'          | j        | j        fi |S | j        t
          j        k    rt)          | j        | j                  S | j        t
          j        k    r@| j        j        rt/          | j        | j                  S t1          | j        | j                  S t          d| j         d          )N)max_num_tokensnum_dispatchersquant_configr~   zIncompatible Mxfp4 backend (z) for EP batched experts format)r   r   r   r   z) for EP)activation_formatmkFusedMoEActivationFormatBatchedExpertsr   r+   r:   max_num_tokens_per_rankmoe_quant_configr   r'  r   NotImplementedErrorr6   r8   r   r   r   r   r   r   r;   r   r   r   )ra   r$  rt   r-  kwargss        r>   select_gemm_implzMxfp4MoEMethod.select_gemm_implF  s    .*9: : !\%888*:*R*R*T*T'.:::,888+#:$4$D$D$F$F!%!6#x	    *043E 0 0 0  
 (444"l&NNN%)III $)#4"'"2).)@(,(=  ($2GRR6RRR#|':::$TXt/DEEE#|':::8+ T248T=RSSS'$2GHHH)O43EOOO  r=   c                     dS NTr<   ra   s    r>   allow_inplacezMxfp4MoEMethod.allow_inplacex      tr=   c                     | j         t          j        k    p)| j         t          j        k    p| j         t          j        k    S r^   )r   r+   r6   r8   r;   r4  s    r>   is_monolithiczMxfp4MoEMethod.is_monolithic|  s=     ,"JJ 9!\%EE9!\%88	
r=   r   topk_weightstopk_idsc                 f   | j         rJ |j        rt          d          | j        t          j        k    rft          ||j        |j        |j	        |j
        |j        |j        ||d d t          j        j        |j        |j        |j        |j        | j                  S t+          |j        |j        |j        |j        |j        |j        |j        |j        |j        |j        j        |j        j        |j        j                  s
J d            | j        t          j         k    s| j        t          j!        k    sJ ddl"m#} | j        t          j         k    r"ddl$m%}  ||dd          \  }}tM          j'        | j(        |j)        	          }	|j        *                                +                    tL          j,                  |	|j        *                                +                    tL          j,                  |	g}
|}t[          d||j        *                                +                    tL          j.                  |j        *                                +                    tL          j.                  
          }nX| j        t          j!        k    rC|j/        tL          j0        k    sJ |j        |j        g}
|}t[          d|j        |j                  }tM          j1        |tL          j0                  } |di d|d|2                    tL          j3                  *                                d|dtL          j0        d|d|
d|j	        d|j
        d|j4        d|j5        d|j6        d| j7        j8        d| j7        j9        d| j7        j:        d| j7        j;        dty          | j=        d          | |S )NEPLB is not supported for mxfp4)global_scale1global_scale2quant_type_idapply_router_weight_on_inputglobal_num_experts
activation
expert_mapr   0MXFP4 are not supported with this configuration.r   )flashinfer_cutlass_fused_moemxfp8_quantizeTr   )r   )use_mxfp8_act_scalinginput_sffc1_expert_weightsfc2_expert_weights)use_w4_group_scalingrJ  rK  r   inputtoken_selected_expertstoken_final_scalesoutput_dtypeoutputquant_scalesfc1_expert_biasesfc2_expert_biasesswiglu_alphaswiglu_betaswiglu_limittp_sizetp_rankep_sizeep_ranktune_max_num_tokensr,   r<   )>r8  enable_eplbr/  r   r+   r:   r   r   r   r   r   r   r   r%   float4_e2m1fidr@  rA  rB  rC  r   r   use_grouped_topk
topk_groupnum_expert_groupcustom_routing_functione_score_correction_biasscoring_func
eplb_stateexpert_load_viewlogical_to_physical_maplogical_replica_countr7   r9   vllm.utils.flashinferrE  r   rG  ro   onesr   r   r   r   int32dictlongr   rp   
empty_liker   r   r   r   r   r   rX  rY  rZ  r[  maxr   )ra   rt   r   r9  r:  rE  rG  x_quantx_scalefake_input_scalerR  fi_inputextra_kwargsrQ  s                 r>   applyzMxfp4MoEMethod.apply  s2    %%%% 	I%&GHHH!444# &%""*7:-2-O#(#; + + 3#   ( """)).-42
 
 	> 	> >	> 	> 
  ,"KKK!\%DDDDE 	GFFFFF !JJJ111111-~ar::GW$z$*:18LLL&113388EE %002277DD 	L H&* #(#3#>#>#@#@#E#Eej#Q#Q#(?#=#=#?#?#D#DUZ#P#P	  LL <#BBB7en,,,, &%L
 H%)#(#3#(?  L !!5>:::$$ 	
 	
 	
(	
#+;;uy#9#9#D#D#F#F#F	
  ,|	
 		

 6	
 &	
 $nn	
 $mm	
 **	
 ((	
 00	
 H$$	
 H$$	
 H$$	
 H$$	
  !$D$91 = = =#	
 	
 	
( r=   router_logitsc                     | j         sJ |j        rt          d          t          |j        |j        |j        |j        |j        |j	        |j
        |j        |j        |j        j        |j        j        |j        j                  s
J d            | j        t$          j        k    s| j        t$          j        k    rOddlm} | j        t$          j        k    r|j        t0          j        k    sJ |}d }nc| j        t$          j        k    rNddlm}  ||d          \  }} |                    t0          j                  j        g |j        d d         dR  } ||                    t0          j                  d |||j         |j!        |j"        |j#        |j$        |j%        |j&        |j'        |j(        d d d |j)        |j*        d d | j+        |j,        |j-        z  | j.        d |j/        rdndd	ta          | j1        d          
          d         }|S | j        t$          j2        k    rCddl3m4}	  |	||j         |j&        ||j*        |j/        |j)        |j        | j5        |j
        
  
        S tm          d| j                   )Nr<  rD  r   )trtllm_fp4_block_scale_moerF  Fr   r,   T)r\  )triton_kernel_moe_forward)
hidden_statesw1w2gating_outputtopkrenormalizerA  rC  r(  r@  zUnsupported backend: )7r8  r]  r/  r   r`  ra  rb  rC  rc  rd  r@  re  rB  rf  rg  rh  ri  r   r+   r6   r8   r   ry  r   ro   rp   rG  r   r   r   r   r   r   r   r   r   r   r   r   r   r   rA  top_kr   r[  local_num_expertsr   r  rp  r   r;   ?vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moerz  r.  r   )
ra   rt   r   rw  ry  rq  rr  rG  trtllm_gen_outputrz  s
             r>   apply_monolithiczMxfp4MoEMethod.apply_monolithic  s    !!!! 	I%&GHHH!"")).-42
 
 	> 	> >	> 	> 
  ,"JJJ!\%EEE======!\%EEEw%.0000#|'OOO555555#1>!U#;#; C',,u':;;CVQWSbS\VSUVVV : :  00 &! '%(& 77 &-A$'(=q$A$A7! ! !8 9!: %$<#666      -,#?+[!-#(#; +!2-2-O    IT5GIIJJJr=   )r2   r3   r4   r   r`   ro   r   r   r   r   r   r  r   r#  r*  FusedMoEPrepareAndFinalizeFusedMoEPermuteExpertsUnpermuter1  propertyboolr5  r8  r
   Tensortuplerv  r  r   r   s   @r>   r   r      s=       IN I I I I I I U6xU6 U6 	U6
 *-U6 kU6 U6 U6 U6n@ @ @D-X_-	t	#- - - -^070 x0 
	+	0 0 0 0d t    X 
t 
 
 
 X
oo <o l	o
 ,o 
elEL89	9o o o obZKZK <ZK |	ZK
 
elEL89	9ZK ZK ZK ZK ZK ZK ZK ZKr=   r   c            
            e Zd Zdef fdZdej        j        dedededej	        f
 fdZ
dej        j        d	d
fdZed	efd            Zdedej        dej        d	ej        fdZ xZS )r}   r~   c                 X    t                                          |           || _        d S r^   )r_   r`   r~   )ra   r~   rb   s     r>   r`   zIpexMxfp4MoEMethod.__init__S  s&    $$$$r=   rt   r   r   r   r   c                 R     t                      j        |||||fi | || _        d S r^   )r_   r   original_hidden_size)ra   rt   r   r   r   r   r   rb   s          r>   r   z!IpexMxfp4MoEMethod.create_weightsW  sN     	+	
 	
 !	
 	
 	
 %0!!!r=   r?   Nc           
         dd l }|j        j                            t          j                  |j        _        |j        j                            t          j                  |j        _        | j        j        | j        j	        z  }|j
        j                            |j        |j        |j        |j        |j        |j        d|          |_        d S )Nr   T)w1_scale_invw2_scale_invr   r   is_mxfp4experts_start_id)intel_extension_for_pytorchr   r   r   ro   rl  r   r~   r[  num_local_expertsllmmodulesGatedMLPMOEr   r   r   r   ipex_fusion)ra   rt   ipexep_rank_starts       r>   r  z0IpexMxfp4MoEMethod.process_weights_after_loadingj  s    2222 % 0 5 : :5; G G$388EE/$/2SS H,88O/.^M* 9 	
 	
r=   c                     dS r3  r<   r4  s    r>   r8  z IpexMxfp4MoEMethod.is_monolithic{  r6  r=   r   rw  c           
         |j         dk    s
J d            t          | j        d          }t          j        j                            |d||                    d          z
  f          }|                    ||j	        |j
        ||j        |j        |j        d          }|dd | j        f                                         }|S )	N	swigluoaiz:Only swiglu_oai activation is supported for IPEX MXFP4 MoEr   r   r   
swiglu_oai)rB  .)rB  r(   r  ro   r   
functionalpadsizer  r`  r  r  ra  rb  r   )ra   rt   r   rw  hidden_size_padx_padr{  s          r>   r  z#IpexMxfp4MoEMethod.apply_monolithic  s     ;...H /.. #4#<cBB#''A/K+LMM))"K"# * 	
 	
 &c+FT-F+F&FGRRTTr=   )r2   r3   r4   r   r`   ro   r   r   r   r   r   r  r  r  r8  r
   r  r  r   r   s   @r>   r}   r}   R  s%       %> % % % % % %0x0 0 	0
 *-0 k0 0 0 0 0 0&
58? 
t 
 
 
 
" t    X < |	
 
       r=   r}   )Lenumr   typingr   ro   torch.nn.parameterr   vllmr   vllm.attention.layerr   vllm.configr   vllm.loggerr	   $vllm.model_executor.layers.fused_moer
   r   r   r   r*  +vllm.model_executor.layers.fused_moe.configr   r   r   r   5vllm.model_executor.layers.fused_moe.fused_marlin_moer   r   r   r  r   r   /vllm.model_executor.layers.fused_moe.trtllm_moer   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.quantization.base_configr   r   :vllm.model_executor.layers.quantization.utils.marlin_utilsr   >vllm.model_executor.layers.quantization.utils.marlin_utils_fp4r   9vllm.model_executor.layers.quantization.utils.mxfp4_utilsr   r    r!   9vllm.model_executor.layers.quantization.utils.quant_utilsr"   vllm.model_executor.utilsr#   vllm.platformsr$   vllm.scalar_typer%   rj  r&   vllm.utils.import_utilsr'   vllm.utils.math_utilsr(   vllm.utils.torch_utilsr)   r2   rI   r+   rL   r  rY   r[   r   r}   r<   r=   r>   <module>r     s                ( ( ( ( ( (       * * * * * * / / / / / / # # # # # #         
 F E E E E E                    
        M L L L L L Q Q Q Q Q Q Q Q G G G G G G                          
 W V V V V V 6 6 6 6 6 6 + + + + + + ) ) ) ) ) ) 0 0 0 0 0 0 6 6 6 6 6 6 * * * * * * : : : : : :	X		    4    \    2C C, C C C CL: : : : :$ : : :zbK bK bK bK bK' bK bK bKJC C C C C C C C C Cr=   