
    .`iz                        d dl mZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmc mc mc mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z m!Z!m"Z"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z>m?Z? d dl@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJ d dlKmLZL d dlMmNZNmOZO d dlPmQZQmRZRmSZSmTZTmUZUmVZVmWZW d dlXmYZYmZZZm[Z[ d dl\m]Z]m^Z^m_Z_ d dl`maZambZb d dlcmdZd d dlemfZf erd dlgmhZh dd gZi eej          Zk G d! d"e7          Zl G d# d$e          Zm G d% d&e2          Zn G d' d(en          Zo G d) d*e           Zp G d+ d,ep          Zq G d- d.e<          ZrdS )/    )TYPE_CHECKINGAnyOptionalN)Module)TorchDispatchMode)_custom_ops)rocm_aiter_ops)	Attention)$get_tensor_model_parallel_world_size)init_logger)vllm_is_batch_invariant)FusedMoEFusedMoEMethodBaseFusedMoEPermuteExpertsUnpermuteFusedMoEPrepareAndFinalizeFusedMoeWeightScaleSupported)FusedMoEQuantConfigRoutingMethodType)UnquantizedFusedMoEMethod)Fp8MoeBackend convert_to_fp8_moe_kernel_formatmake_fp8_moe_kernelmake_fp8_moe_kernel_for_mkmmake_fp8_moe_quant_configselect_fp8_moe_backend)
LinearBaseLinearMethodBaseUnquantizedLinearMethod)QuantizationMethods)QuantizationConfigQuantizeMethodBase)init_fp8_linear_kernel)BaseKVCacheMethod)"apply_fi_trtllm_fp8_per_tensor_moe1build_flashinfer_fp8_cutlass_moe_prepare_finalize)
W8A8BlockFp8LinearOpcreate_fp8_input_scalecreate_fp8_scale_parametercreate_fp8_weight_parameter#maybe_post_process_fp8_weight_block%process_fp8_input_tensor_strategy_moe!process_fp8_weight_block_strategy"process_fp8_weight_tensor_strategy&process_fp8_weight_tensor_strategy_moevalidate_fp8_block_shape)get_marlin_input_dtype)apply_fp8_marlin_linearprepare_fp8_layer_for_marlin)
GroupShapeis_layer_skippedkFp8Dynamic128SymkFp8DynamicTensorSymkFp8DynamicTokenSymkFp8Static128BlockSymkFp8StaticTensorSym)cutlass_block_fp8_supportedcutlass_fp8_supportednormalize_e4m3fn_to_e4m3fnuz)BlockQuantScaleParameterModelWeightParameterPerTensorScaleParameter)replace_parameterset_weight_attrs)current_platform)is_deep_gemm_supported)WeightsMapperstaticdynamicc                       e Zd ZdZ	 	 	 	 ddededee         dz  dee         dz  d	df
 fd
Ze	d	e
fd            Ze	d	eej                 fd            Ze	d	efd            Ze	d	ee         fd            ZddZe	deeef         d	d fd            Zdej        j        ded	ed         fdZdej        j        ded	ed         fdZded	edz  fdZ xZS )	Fp8ConfigzConfig class for FP8.FrF   Nis_checkpoint_fp8_serializedactivation_schemeignored_layersweight_block_sizereturnc                 x   t                                                       || _        |t          vrt	          d|           || _        |pg | _        |]|st	          d          t          |          dk    r t	          dt          |           d          |dk    rt	          d| d          || _        d S )	NzUnsupported activation scheme zLThe block-wise quantization only supports fp8-serialized checkpoint for now.   zFThe quantization block size of weight must have 2 dimensions, but got z dimensionsrF   zUThe block-wise quantization only supports dynamic activation scheme for now, but got z activation scheme.)	super__init__rI   ACTIVATION_SCHEMES
ValueErrorrJ   rK   lenrL   )selfrI   rJ   rK   rL   	__class__s        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/fp8.pyrQ   zFp8Config.__init__q   s    	,H)$666Q>OQQRRR!2,2(/  *   $%%** O+./@+A+AO O O   !I-- >(> > >  
 "3    c                     dS )Nfp8 clss    rW   get_namezFp8Config.get_name   s    urX   c                 2    t           j        t           j        gS N)torchbfloat16halfr\   s    rW   get_supported_act_dtypesz"Fp8Config.get_supported_act_dtypes   s    
++rX   c                     dS )NK   r[   r\   s    rW   get_min_capabilityzFp8Config.get_min_capability   s    rrX   c                     g S r`   r[   r\   s    rW   get_config_filenameszFp8Config.get_config_filenames   s    	rX   hf_to_vllm_mapperrD   c                 V    | j         !|                    | j                   | _         d S d S r`   )rK   
apply_list)rU   rj   s     rW   apply_vllm_mapperzFp8Config.apply_vllm_mapper   s2    *"3">">t?R"S"SD +*rX   configc                    |                      |dg          }d|v }|                      |dg          }|                     |dgd           }|                     |dgd           }|s|                     |dgd           } | ||||          S )Nquant_methodrZ   rJ   rK   rL   modules_to_not_convertrI   rJ   rK   rL   )get_from_keysget_from_keys_or)r]   rn   rp   rI   rJ   rK   rL   s          rW   from_configzFp8Config.from_config   s    ((.1ABB','<$--f7J6KLL--f7G6H$OO00:M9NPTUU 	 1112D N s)E/)/	
 
 
 	
rX   layerprefixr!   c                    ddl m}m} t          | j        | j        | j        | j                  }t          |t                    r5t          || j        | j                  rt                      S  ||          S t          |t                    r<t          || j        | j                  rt          |j                  S  |||          S t          |t                     rt#          |           S d S )Nr   )XPUFp8LinearMethodXPUFp8MoEMethodrr   rw   rK   fused_mapping)2vllm.model_executor.layers.quantization.ipex_quantry   rz   rH   rI   rJ   rK   rL   
isinstancer   r4   packed_modules_mappingr   r   r   
moe_configr
   Fp8KVCacheMethod)rU   rv   rw   ry   rz   
fp8_configs         rW   get_xpu_quant_methodzFp8Config.get_xpu_quant_method   s4   	
 	
 	
 	
 	
 	
 	
 	

 )-)J"4."4	
 
 

 eZ(( 	*#2"9   1
 /000%%j111x(( 
	*#2"9   C
 11ABBB"?:u555y)) 	*#D)))trX   c                    t          j                    r|                     ||          S t          |t                    r{t          || j        | j                  rt                      S | j	        s%t          |           }t          |          |_        |S t          |           }t          |          |_        |S t          |t                    rZt          || j        | j                  rt          |j                  S | j	        rt#          | |          }nt%          | |          }|S t          |t&                    rt)          |           S d S )Nr{   )rB   is_xpur   r~   r   r4   rK   r   r   rI   Fp8OnlineLinearMethodr0   marlin_input_dtypeFp8LinearMethodr   r   r   Fp8MoEMethodFp8OnlineMoEMethodr
   r   )rU   rv   rw   online_methodoffline_methodmoe_quant_methods         rW   get_quant_methodzFp8Config.get_quant_method   sh    "$$ 	<,,UF;;;eZ(( 	*#2"9   1
 /0004 & 5d ; ;3I&3Q3Q0$$!0!6!64J64R4R1%%x(( 	*#2"9   C
 11ABBB0 C#/e#<#<  #5dE#B#B ##y)) 	*#D)))trX   namec                 v   |                     d          rd|v r|                    dd          S |                     d          rd|v r|                    dd          S |                     d          rd|v r|                    d	d
          S |                     d          r|                    dd          S dS )a%  
        Check whether the param name matches the format for k/v cache scales
        in compressed-tensors. If this is the case, return its equivalent
        param name expected by vLLM

        :param name: param name
        :return: matching param name for KV cache scale in vLLM
        z.output_scalez.k_projz.k_proj.output_scalez.attn.k_scalez.v_projz.v_proj.output_scalez.attn.v_scalez.q_projz.q_proj.output_scalez.attn.q_scalezself_attn.prob_output_scalez.prob_output_scalez.attn.prob_scaleN)endswithreplace)rU   r   s     rW   get_cache_scalezFp8Config.get_cache_scale  s     ==)) 	Ii4.?.?<< 6HHH==)) 	Ii4.?.?<< 6HHH==)) 	Ii4.?.?<< 6HHH==677 	J<< 46HIIItrX   )FrF   NN)rj   rD   )__name__
__module____qualname____doc__boolstrlistintrQ   classmethodr   r^   ra   dtyperd   rg   ri   rm   dictr   ru   nnr   r   r   r   r   __classcell__rV   s   @rW   rH   rH   n   s#        .3!*+/.2 3  3&* 3  3 S	D(	 3
  9t+ 3 
 3  3  3  3  3  3D ,    [ ,ek): , , , [, 3    [ T#Y    [T T T T 
c3h 
K 
 
 
 [
""X_".1"	&	'" " " "H"X_".1"	&	'" " " "HC C$J        rX   rH   c                   *     e Zd ZdZ fdZddZ xZS )CopyNumelCounterz
    Tracks total number of elements modified with `copy_`. Useful for keeping
    track of weight loading where underlying weights can be arbitrarily
    transformed (such as with `narrow`) before calling copy.
    c                 V    t                                                       d| _        d S Nr   )rP   rQ   copied_numel)rU   rV   s    rW   rQ   zCopyNumelCounter.__init__  s'    rX   r[   Nc                     |i } ||i |}|t           j        j        j        j        k    r(| xj        |d                                         z  c_        |S r   )ra   opsatencopy_defaultr   numel)rU   functypesargskwargsouts         rW   __torch_dispatch__z#CopyNumelCounter.__torch_dispatch__!  sZ    >FdD#F##59>'///a0
rX   )r[   N)r   r   r   r   rQ   r   r   r   s   @rW   r   r     sV                    rX   r   c                       e Zd ZdZdefdZdej        j        de	de
e	         de	de	d	ej        fd
ZdeddfdZ	 ddej        j        dej        dej        dz  dej        fdZdS )r   a  Linear method for FP8.
    Supports loading FP8 checkpoints with static weight scale and
    dynamic/static activation scale.

    Limitations:
    1. Only support float8_e4m3fn data type due to the limitation of
       torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856)

    Args:
        quant_config: The quantization config.
    quant_configc                 l   || _         t                      | _        t          j                    | _        d | _        t          j        d           pt          j	        | _
        t          j                    rd| _
        t                      rd| _
        t          j                    | _        t!                      | _        | j         j        | _        | j        d u| _        | j         j        dk    | _        | j        r[| j        rJ | j        J t-          t/          | j         t/          d| j        d                   | j        | j                  | _        d S | j        rt2          }nt5                      rt6          }nt8          }t;          |t2          t          j                    | j        j                  | _         d S )NY   FrE      r   )weight_group_shapeact_quant_group_shaper:   use_aiter_and_is_supported)activation_quant_keyweight_quant_key	out_dtypemodule_name)!r   r:   ra   get_default_dtyper   r   rB   has_device_capabilityenvsVLLM_TEST_FORCE_FP8_MARLIN
use_marlinis_rocmr   r	   is_linear_fp8_enabledr   rC   use_deep_gemmrL   block_quantrJ   act_q_staticr&   r3   w8a8_block_fp8_linearr9   r;   r7   r6   r"   rV   r   
fp8_linear)rU   r   r   s      rW   rQ   zFp8LinearMethod.__init__7  s   (+F+H+H(022 #' 6r::: /. 	
 #%% 	$#DO"$$ 	$#DO*8*N*P*P'355!%!2!D1= -?8K 	(((()555)=#-t/E#F&0D4J14M&N&N,0,L+/+J	* * *D&&&   <':$$&(( <':$$';$4%9!4133 N3	  DOOOrX   rv   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                    t          |          }|                    d          }	||_        ||_        ||_        ||_        d |_        | j        r.| j        J | j        |_        t          |||||| j                   t          |||	          }
|
                    d|
           | j        sAt          t          ||d |	          }t          |ddi           |
                    d|           nW| j        rJ | j        J t          t          ||| j        |	          }t          |ddi           |
                    d|           | j        r:t!          ||	          }t          |ddi           |
                    d|           d S d S )Nweight_loaderweight
scale_typeweight_scaleweight_scale_invinput_scale)sumgetlogical_widthsr   output_size_per_partition
orig_dtyperL   r   r/   r)   register_parameterr(   r?   rA   r   r=   r'   )rU   rv   r   r   r   r   r   extra_weight_attrsr   r   r   scales               rW   create_weightszFp8LinearMethod.create_weightsi  s    %((>$?$?!*..??5)A&*C''"& 
	)555&*&<E#$(&&   -%'?
 
 	  6222  	@.'&( E U\>$BCCC$$^U;;;;(((()555.(&(& E U\>$BCCC$$%7???  	;*+A=QQEU\=$ABBB$$]E:::::	; 	;rX   rM   Nc           
         d}d }| j         rU| j        rJ d}t          |j        |j                  \  }}t          |d|j                   t          |d|j                   n|j        }|j        }| j        sIt          |||j
        t          |dd                     \  }}}| j        r|J |                                }|                                }t          |d|j                   t          |d|j                   |t          |d|           nd |_        | j        rt          ||| j                   |`d S | j         rt#          |           d S d S )NTFr   r   r   r   input_dtype)r   r   r,   r   r   r@   datar   r   r-   r   getattrmaxtr   r2   r   r*   )rU   rv   size_k_firstr   r   r   r   s          rW   process_weights_after_loadingz-Fp8LinearMethod.process_weights_after_loading  s    #	H(((( L'He4( ($F$
 eXv{;;;e%79I9NOOOO \F -L ? 	44V (E=$77	5 51k $ 4&222"-//"3"3KXXZZF eXv{;;;e^\5FGGG"e]K@@@@ $E? 	(|1H    !F 	7/66666	7 	7rX   xbiasc           
         t                      r.| j        r7| j        J | j                            ||j        |j        |j        |          S |j                            t          j
                  }|j                            t          j
                  }|                                dk    r||z  }nR|                                dk    r5|j        d         |j        d         k    r||                    d          z  }n||z  }t          j        j                            ||                                |          S | j        rF| j        r|j        }n|j        }t+          ||j        ||j        |j        |j        | j        |          S | j        r7| j        J | j                            ||j        |j        |j        |          S | j                            |||          S )N)inputr   r   r   r   r   r   )r   r   r   	workspacesize_nsize_kr   r   )r   r   rL   r   applyr   r   r   tora   rb   r   r   dimshape	unsqueezer   
functionallinearr   r   r1   r   r   r   r   r   apply_weights)rU   rv   r   r   
weight_fp8r   weight_bf16s          rW   r   zFp8LinearMethod.apply  s
    #$$  	L L-999177 <!&!7 % 1 8    #\__U^<<
$144U^DD%%''1,,",|";KK %((**a//(.q1Z5Ea5HHH '1<3I3I!3L3L&L '1<&?x*11![]]__dKKK? 	 2$5$1*|)/65 3	 	 	 	  		)555-33|"3!- 4    ,,UAt<<<rX   r`   )r   r   r   r   rH   rQ   ra   r   r   r   r   r   r   r   Tensorr   r[   rX   rW   r   r   *  s       
 
0Y 0 0 0 0d@;x@; #&@; !%S		@;
 @; @; k@; @; @; @;D776 77d 77 77 77 77z %)	F= F=xF= <F= lT!	F=
 
F= F= F= F= F= F=rX   r   c                   l    e Zd ZdZdej        j        dedee         dededej	        fdZ
ded	d
fdZd
S )r   zoOnline version of Fp8LinearMethod, loads the fp16/bf16 checkpoint
    and quantized the weights during loading.rv   r   r   r   r   r   c                 $    t          |          }|                    d          |_        |_        |_        |_        d _         fd}	t          t          j	        |||          dd|	          }

                    d|
           d S )Nr   c                 N   t          d          sd_        t                      }|5   	| |g|R i |}d d d            n# 1 swxY w Y   xj        |j        z  c_        j                                        }j        |k    r                               `d_        |S N_loaded_numelr   T)hasattrr   r   r   r   r   r   -_already_called_process_weights_after_loading
paramloaded_weightr   r   copy_numel_counterrestarget_loaded_numelrv   rU   r   s
          rW   patched_weight_loaderzCFp8OnlineLinearMethod.create_weights.<locals>.patched_weight_loaderD  s"   5/22 (&'# "2!3!3# K K#mE=J4JJJ6JJK K K K K K K K K K K K K K K#5#BB #(,"4"4"6"6"&999225999 ' GKCJ   AA
Ar   r   r   )r   	input_dim
output_dimr   r   )r   r   r   r   r   r   rL   r>   ra   emptyr   )rU   rv   r   r   r   r   r   r   r   r  r   r   s   ``         @rW   r   z$Fp8OnlineLinearMethod.create_weights1  s     %((>$?$?!*..??5)A&*C''"&	 	 	 	 	 	 	2 &)("  
 /	
 	
 	
 	  622222rX   rM   Nc                 L   t          |dd          rd S | j        rJ d |_        t          j        |j        d           \  }}|                                }t          |d|j                   t          |d|j                   | j	        rd}t          ||| j                   d S d S )Nr  F)r   r   r   Tr   )r   r   r   r   scaled_fp8_quantr   r   r@   r   r   r2   r   )rU   rv   qweightr   r   r   s         rW   r   z3Fp8OnlineLinearMethod.process_weights_after_loadingi  s    5I5QQ 	F ####  # 4U\ N N N 	%6;777%1BCCC? 	L(|1H     	 	rX   )r   r   r   r   ra   r   r   r   r   r   r   r   r[   rX   rW   r   r   -  s        1 163x63 #&63 !%S		63
 63 63 k63 63 63 63p6 d      rX   r   c                       e Zd ZdZdedej        j        f fdZe	dej
        dz  fd            Zdeded	ed
edej
        f
dZdedej        dej        dej        dej        dej        dz  dej        dz  ddfdZdeddfdZ	 d#deej        ej        ej        f         dz  dej        dz  f fdZdedej        j        defdZdej        j        dedz  fdZe	defd            Ze	defd            Ze	defd            Zdedej        dej        dej        eej        ej        f         z  fdZdedej        d ej        d!ej        dej        eej        ej        f         z  f
d"Z xZ S )$r   au  MoE method for FP8.
    Supports loading FP8 checkpoints with static weight scale and
    dynamic/static activation scale.

    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
    activation scaling. The weight scaling factor will be initialized after
    the model weights are loaded.

    Args:
        quant_config: The quantization config.
    r   rv   c                    t                                          |j                   || _        | j        j        | _        | j        d u| _        | j        rdnd| _        | j        rt          }t          }n%t          }| j        j
        dk    rt          nt          }t          | j        ||d          \  | _        | _        d | _        d S )Nr   r   rE   F)rn   
weight_keyactivation_keyallow_vllm_cutlass)rP   rQ   r   r   rL   r   weight_scale_namer8   r5   r9   rJ   r6   r   moefp8_backendexperts_clskernel)rU   r   rv   r  r  rV   s        rW   rQ   zFp8MoEMethod.__init__  s    )***(!%!2!D!%!7t!C"&"2F 	
  		.J.NN,J $6(BB $#)  .D8!)$	.
 .
 .
*$* 8<rX   rM   Nc                 P    | j         | j         j                                        S d S r`   )r  prepare_finalizetopk_indices_dtyperU   s    rW   r  zFp8MoEMethod.topk_indices_dtype  s&    ;";/BBDDDtrX   num_expertshidden_sizeintermediate_size_per_partitionr   c                    ||_         ||_        ||_        ||_        d |_        | j        j        sJ t          j        }| j	        r| j        J | j        |_        t                      }| j        d         | j        d         }	}||z  dk    rt          d| d| d          |dk    r||	z  dk    rt          d| d|	 d          t          j                            t          j        |d|z  ||	          d
          }
|                    d|
           t!          |
|           t          j                            t          j        ||||	          d
          }|                    d|           t!          ||           | j	        sBt          j        |dt          j        	          }t          j        |t          j        	          }nkt          j        |d||z   dz
  |z  z  ||	z   dz
  |	z  t          j        	          }t          j        |||z   dz
  |z  ||	z   dz
  |	z  t          j        	          }t          j                            |d
          }t          j                            |d
          }|                    d| j         |           |                    d| j         |           |                    | j	        rdt*          j        j        indt*          j        j        i           t!          ||           t!          ||           | j        j        dk    r| j	        rJ t          j                            t          j        |t          j        	          d
          }|                    d|           t!          ||           t          j                            t          j        |t          j        	          d
          }|                    d|           t!          ||           d S d |_        d |_        d S )Nr   r   z,The output_size of gate's and up's weight = z3 is not divisible by weight quantization block_n = .z"The input_size of down's weight = z3 is not divisible by weight quantization block_k = rO   r
  Frequires_grad
w13_weight	w2_weightw13_w2_rp   rE   w13_input_scalew2_input_scale)r!  r   r  r   rL   r   rI   ra   float8_e4m3fnr   r   rS   r   	Parameterr  r   rA   onesfloat32r  updater   BLOCKvalueTENSORrJ   r*  r+  )rU   rv   r  r   r!  r   r   tp_sizeblock_nblock_kr&  r'  w13_scale_dataw2_scale_dataw13_weight_scalew2_weight_scaler*  r+  s                     rW   r   zFp8MoEMethod.create_weights  s    1P-'''"& ====* 	)555&*&<E#:<<G&q)&q) G /8A== @6@ @5<@ @ @  
 {{>HAMM @6@ @5<@ @ @   X''K33"	     ( 
 

 	  z:::%7888H&&K/"	     ' 
 
	 	  i888$6777  	"ZQemLLLN!J{%-HHHMM #Z5?!COPw&*w6m	  N "Jw&*w607:Q>7Jm	  M !8--nE-RR(,,]%,PP  !@(>!@!@BRSSS  !?t'=!?!?QQQ 	!!M^9?EFF ">"E"KL	
 	
 	

 	)+=>>>*<=== .(::''''#h00
;em<<<E 1  O $$%6HHH_.@AAA"X//
;em<<<E 0  N $$%5~FFF^-?@@@@@ %)E!#'E   rX   w13w2	w13_scalew2_scaler*  r+  c           
         t          | j        |||||||          \  }}}}t          |d|           t          |d|           t          |d| j         |           t          |d| j         |           |                     |          | _        | j        ra| j        j        j        r| j        j        j	        rA| j
        J t          | j        | j        | j        | j
                  \  | _        | _        d S d S d S )N)r  rv   r;  r<  r=  r>  r*  r+  r&  r'  r(  r)  )moe_quant_configr   r  r  )r   r  r@   r  get_fused_moe_quant_configr@  r  moe_parallel_configuse_all2all_kernelsuse_naive_all2all_kernelsr  r   r  use_inplace)rU   rv   r;  r<  r=  r>  r*  r+  s           rW   _setup_kernelzFp8MoEMethod._setup_kernel.  sB    (H(+)	(
 	(
 	(
$RH 	%s333%b111%!@(>!@!@)LLL%!?t'=!?!?JJJ !% ? ? F F  
	-A
	x+E
	 #///,?!%!68 , ,	- - -)DK)))
	 
	 
	 
	rX   c           	      l   t          |dd          rd S |j        }|j        }t          |d| j                   }t          |d| j                   }|j        }|j        }t          j                    r*t          |||          \  }}}t          |||          \  }}}| j	        j
        dk    rD| j        rJ ||J t          ||          \  }}t          |d|           t          |d|           | j        s!|j        }t          ||||j                  \  }}|                     |||||||           d S )Nr  Fr(  r)  rE   r*  r+  )r   r&  r'  r  r*  r+  rB   is_fp8_fnuzr<   r   rJ   r   r+   r@   r!  r.   local_num_expertsrF  )	rU   rv   r;  r<  r=  r>  r*  r+  
shard_sizes	            rW   r   z*Fp8MoEMethod.process_weights_after_loading\  s   5I5QQ 	F _E#B$*@#B#BCC	5"@(>"@"@AA/- ')) 
	.J/ /+CO
 ,H, ,(B. .(::''''".>3M3MM.S/ /+O^ e%6HHHe%5~FFF  	>JCY
E,C NC
 	3Ix.	
 	
 	
 	
 	
rX   routing_tablesc                 H   | j         t          j        k    rd S | j         t          j        k    rU| j        j        j        sd S t          | j        | j                  }t          
                    d|j        j                   |S t                                          |          S )N)use_deepseek_fp8_block_scalez%s)r  r   FLASHINFER_TRTLLMFLASHINFER_CUTLASSr  rB  rC  r%   r   logger
debug_oncerV   r   rP   maybe_make_prepare_finalize)rU   rK  r  rV   s      rW   rR  z(Fp8MoEMethod.maybe_make_prepare_finalize  s     }>>>4!AAA8/C tP-1-=      d$4$>$GHHH##ww22>BBBrX   r  c                 j    | j         J | j        J t          | j        | j         | j        |          S )N)r   r   r  r  )r@  r  r   r  )rU   r  rv   s      rW   select_gemm_implzFp8MoEMethod.select_gemm_impl  sM    
 $000+++*x.(-	
 
 
 	
rX   c                     | j         t          j        k    rd S t          |d| j                   }t          |d| j                   }|j        }|j        }t          | j         ||||| j                  S )Nr(  r)  )r  w1_scaler>  a1_scalea2_scaleblock_shape)	r  r   rN  r   r  r*  r+  r   rL   )rU   rv   rV  r>  rW  rX  s         rW   rA  z'Fp8MoEMethod.get_fused_moe_quant_config  s     }>>>45"A)?"A"ABB5"@(>"@"@AA('((.
 
 
 	
rX   c                     dS NTr[   r  s    rW   supports_eplbzFp8MoEMethod.supports_eplb      trX   c                     dS r[  r[   r  s    rW   allow_inplacezFp8MoEMethod.allow_inplace  r]  rX   c                 ,    | j         t          j        k    S r`   )r  r   rN  r  s    rW   is_monolithiczFp8MoEMethod.is_monolithic  s    =#BBBrX   r   router_logitsc                    | j         sJ | j        t          j        k    sJ |j        rt          d          |j        dk    sJ d|j                     | j        rdd l}|j	        |j	        
                    |j                  nd }|j        }t          j        j                            |t"          j        k    r|
                    t          j                  n||||j        |j        |j        |j        |j        |j        |j        |j        |j        |j        |j        z  |j        | j        ||j                   S tC          ||||j	        |j        |j        |j        |j        |j"        	  	        S )Nz*EPLB not supported for `Fp8MoEMethod` yet.siluz#Expected 'silu' activation but got r   )routing_logitsrouting_biasr   r&  w13_weight_scale_invr'  w2_weight_scale_invglobal_num_expertstop_knum_expert_group
topk_groupintermediate_sizeexpert_offsetrI  rY  routing_method_typerouted_scaling)	rv   hidden_statesrb  rf  ri  rj  rk  rl  apply_router_weight_on_input)#ra  r  r   rN  enable_eplbNotImplementedError
activationr   :vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moee_score_correction_biasr   r   ro  ra   r   vllm#flashinfer_fused_moe_blockscale_fp8r   
DeepSeekV3r/  r&  rg  r'  rh  ri  rj  rk  rl  r!  ep_rankrI  rL   routed_scaling_factorr$   rr  )rU   rv   r   rb  rx  rw  ro  s          rW   apply_monolithiczFp8MoEMethod.apply_monolithic  s    !!!!=#BBBBB  	T%&RSSS6)))D%2BDD *))  )	MMMM 0< -00999 $
 #(";9>EE&*;*FFF  -//>>>"4 +%*%?/$)$=#(#;k!&!7 +"'"G#me.EE"'"9 2$7$:' F   , 6+":#(#;k!&!7 +-2-O
 
 
 
rX   topk_weightstopk_idsc                     | j         J | j        rJ |                      ||j        |j        ||| j        |j        |j        |j        |j        
  
        S )N)inplaceru  ri  
expert_maprr  )	r  ra  r&  r'  rE  ru  ri  r  rr  )rU   rv   r   r~  r  s        rW   r   zFp8MoEMethod.apply	  sn     {&&&%%%%{{O$'$7').)K  
 
 	
rX   r`   )!r   r   r   r   rH   ra   r   r   rQ   propertyr   r  r   r   r   rF  r   tuplemkr   rR  r   rT  r   rA  r   r\  r_  ra  r   r}  r   r   r   s   @rW   r   r     sf       
 
<Y <ux < < < < < <@ EK$$6    X
y(y( y( 	y(
 *-y( ky( y( y( y(v,, \, L	,
 <, ,, ,, t+, 
, , , ,\.
6 .
d .
 .
 .
 .
d RVC CelEL%,FG$NC 
	&	-C C C C C C&
4
 x
 
)	
 
 
 

X_
	t	#
 
 
 
* t    X t    X Ct C C C XC99 <9 |	9
 
elEL89	99 9 9 9v

 <
 l	

 ,
 
elEL89	9
 
 
 
 
 
 
 
rX   r   c            
       v     e Zd ZdZdedej        j        f fdZdede	de	de	dej
        f
d	Zded
dfdZ xZS )r   a  MoE method for online FP8 quantization.
    Supports loading quantized FP16/BF16 model checkpoints with dynamic
    activation scaling. The weight scaling factor will be initialized after
    the model weights are loaded.

    Args:
        quant_config: The quantization config.
    r   rv   c                     t                                          ||           |j        rJ |j        dk    sJ |j        J d S )NrF   )rP   rQ   rI   rJ   rL   )rU   r   rv   rV   s      rW   rQ   zFp8OnlineMoEMethod.__init__*  sU    u---<<<<-::::-55555rX   r  r   r!  r   c                     |_         |_        |_        |_        d _        |d         |} fd}||d<   |}t
          j                            t          j        |d|z  ||          d          }		                    d|	           t          |	|           t
          j                            t          j        ||||          d          }
	                    d|
           t          |
|           t
          j                            t          j        |t
          j                  d          }t
          j                            t          j        |t
          j                  d          }	                    d	|           	                    d
|           t          ||           t          ||           d _        d _        d S )Nr   c                    t          d          sd_        t                      }|5   	| |g|R i |}d d d            n# 1 swxY w Y   xj        |j        z  c_        j                                        j                                        z   }j        |k    r                               `d_        |S r   )	r   r   r   r   r&  r   r'  r   r  r  s
          rW   r  z@Fp8OnlineMoEMethod.create_weights.<locals>.patched_weight_loaderG  s6   5/22 (&'# "2!3!3# K K#mE=J4JJJ6JJK K K K K K K K K K K K K K K#5#BB #("2"8"8":":U_=R=R=T=T"T"&999225999 ' GKCJr	  rO   r
  Fr$  r&  r'  r9  r:  )r!  r   r  r   rL   ra   r   r-  r  r   rA   r.  r/  r*  r+  )rU   rv   r  r   r!  r   r   new_extra_weight_attrsr  r&  r'  r9  r:  r   s   ``           @rW   r   z!Fp8OnlineMoEMethod.create_weights0  s*    1P-'''"&
 +?; "4	 	 	 	 	 	 	2 3H/3 X''K33"	     ( 
 

 	  z:::%7888H&&K/"	     ' 
 
	 	  i888$6777
 !8--J{%-888 . 
 
  (,,J{%-888 - 
 
 	  !35EFFF  !2ODDD)+=>>>*<=== $#rX   rM   Nc           	      *   t          |dd          rd S t          j                    }t          j        |j        |          }t          j        |j        |          }|j        }|j        }t          |j
                  D ]n}t          j        |j        |d d d d f                   \  ||d d d d f<   ||<   t          j        |j        |d d d d f                   \  ||d d d d f<   ||<   o|                     ||||||j        |j                   d S )Nr  Fr
  )r   rB   	fp8_dtypera   
empty_liker&  r'  r9  r:  rangerI  r   r  rF  r*  r+  )rU   rv   r  r;  r<  r=  r>  experts           rW   r   z0Fp8OnlineMoEMethod.process_weights_after_loading  sX   5I5QQ 	F %.00	u/yAAAeoY???*	(E344 	 	F363G AAA.4 40C111y0 251E111-2 2.Bvqqq!!!|hv..
 	! 	
 	
 	
 	
 	
rX   )r   r   r   r   rH   ra   r   r   rQ   r   r   r   r   r   r   s   @rW   r   r      s         6Y 6ux 6 6 6 6 6 6[$[$ [$ 	[$
 *-[$ k[$ [$ [$ [$z
6 
d 
 
 
 
 
 
 
 
rX   r   c                   (     e Zd ZdZdef fdZ xZS )r   zI
    Supports loading kv-cache scaling factors from FP8 checkpoints.
    r   c                 J    t                                          |           d S r`   )rP   rQ   )rU   r   rV   s     rW   rQ   zFp8KVCacheMethod.__init__  s!    &&&&&rX   )r   r   r   r   rH   rQ   r   r   s   @rW   r   r     sN         'Y ' ' ' ' ' ' ' ' ' 'rX   r   )stypingr   r   r   ra   torch.nnr   torch.utils._python_dispatchr   	vllm.envsr   3vllm.model_executor.layers.fused_moe.modular_kernelmodel_executorlayers	fused_moemodular_kernelr  rx  r   r   vllm._aiter_opsr	   vllm.attention.layerr
   vllm.distributedr   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   $vllm.model_executor.layers.fused_moer   r   r   r   r   +vllm.model_executor.layers.fused_moe.configr   r   *vllm.model_executor.layers.fused_moe.layerr   /vllm.model_executor.layers.fused_moe.oracle.fp8r   r   r   r   r   r   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.quantization.base_configr    r!   9vllm.model_executor.layers.quantization.kernels.scaled_mmr"   0vllm.model_executor.layers.quantization.kv_cacher#   >vllm.model_executor.layers.quantization.utils.flashinfer_utilsr$   r%   7vllm.model_executor.layers.quantization.utils.fp8_utilsr&   r'   r(   r)   r*   r+   r,   r-   r.   r/   :vllm.model_executor.layers.quantization.utils.marlin_utilsr0   >vllm.model_executor.layers.quantization.utils.marlin_utils_fp8r1   r2   9vllm.model_executor.layers.quantization.utils.quant_utilsr3   r4   r5   r6   r7   r8   r9   8vllm.model_executor.layers.quantization.utils.w8a8_utilsr:   r;   r<   vllm.model_executor.parameterr=   r>   r?   vllm.model_executor.utilsr@   rA   vllm.platformsrB   vllm.utils.deep_gemmrC    vllm.model_executor.models.utilsrD   rR   r   rP  rH   r   r   r   r   r   r   r[   rX   rW   <module>r     s   0 / / / / / / / / /        : : : : : :       @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ # # # # # # * * * * * * * * * * * * A A A A A A # # # # # #                          Q P P P P P                        
 H G G G G G             O N N N N N                                                                    
         
 J I I I I I I I + + + + + +       ?>>>>>>	* 	X		e e e e e" e e eP    (   (@= @= @= @= @=& @= @= @=FO O O O OO O O Of]
 ]
 ]
 ]
 ]
% ]
 ]
 ]
@I
 I
 I
 I
 I
 I
 I
 I
X' ' ' ' '( ' ' ' ' 'rX   