
    .`io                     l   d Z ddlZddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6  ee7          Z8 G d de
          Z9 G d dej:                  Z; G d dej:                  Z< G d d ej:                  Z= G d! d"ej:                  Z>e G d# d$ej:                              Z? G d% d&ej:        e/e0          Z@dS )'z?Inference-only Flash model compatible with HuggingFace weights.    N)CallableIterable)islice)nn)PretrainedConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_group)init_logger)
SiluAndMul)FusedMoEZeroExpertFusedMoE)RMSNorm)MergedColumnParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)block_dequant)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)DeepseekV2MLAAttention)IntermediateTensors   )SupportsLoRA
SupportsPP)PPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                   z     e Zd ZdZdZdgZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )FlashConfigzFlash model configuration.longcat_flashpast_key_values             N`      r         @   F{Gz?h㈵>T順 顆         bfloat16float32      ?r   c'                      t                      j        d||||||| |"|!|&d
|' || _        || _        || _        ||n|| _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        ||}|| _        || _        || _        || _        || _        |'                    dd           }(|(p|pddi}|'                    dd          })d|vr|)|d<   || _        || _        || _        || _        || _        |$| _        |%| _        |#| _        d| _        t=          | d          r| j        n|| _         t=          | d	          r| j!        | _!        d S t=          | d
          r| j"        | _!        d S | j         | _!        d S )N)
pad_token_idbos_token_ideos_token_idtie_word_embeddingsdtypeparams_dtyperouter_dtypetopk_methodrouter_biasnextn_use_scmoerope_scaling	rope_typedefault
rope_thetag    .Asiluffn_hidden_sizemoe_intermediate_sizeexpert_ffn_hidden_size )#super__init__
vocab_sizemax_position_embeddingshidden_sizenum_hidden_layersnum_attention_headsep_sizekv_lora_rankq_lora_rankqk_rope_head_dim
v_head_dimqk_nope_head_dimnum_experts_per_toknorm_topk_probnum_key_value_headsinitializer_rangerms_norm_epspretraining_tp	use_cachepoprope_parametersattention_biasattention_dropoutmla_scale_q_loramla_scale_kv_lorazero_expert_numzero_expert_typerouted_scaling_factor
hidden_acthasattrrI   intermediate_sizerJ   rK   )+selfrO   rQ   rl   
num_layersrR   rS   r\   rT   rU   rV   rW   rX   rY   rZ   r[   rP   r]   r^   r`   r:   r;   r<   r_   r=   rb   rc   rd   re   rf   r>   r?   r@   rB   rA   ri   rg   rh   rC   kwargsrD   rG   	__class__s+                                             |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/longcat_flash.pyrN   zFlashConfig.__init__U   s   T 	 	
%%% 3%%##+	
 	
 	
 	
 	
 %'>$&!2!>J 	 $7 (& 0$ 0#6 ,&"5#6 !2(,"zz.$77&U/Uk9=UZZi88
..,6OL).,!2 0!2. 0%:"  t.//#D  " 	
 4011 	@)-)CD&&&T344 	@)-)DD&&&)-)?D&&&    )&r(   r)   r*   r+   Nr,   r-   r   r.   r/   r0   r-   r-   NFr*   r1   r2   TNr3   r4   r   FNFr5   FFr6   r6   r7   FNr8   r   NF)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencerN   __classcell__rp   s   @rq   r%   r%   O   s        $$ J#4"5   $!!Oh@ h@ h@ h@ h@ h@ h@ h@ h@ h@rr   r%   c                   r     e Zd ZdZ	 	 	 ddededededz  d	ed
eddf fdZde	j
        de	j
        fdZ xZS )FlashMLPzFlash MLP layer.NT rQ   rl   rj   quant_configreduce_resultsprefixreturnc           	         t                                                       t          ||gdz  d|| d          | _        t	          ||d||| d          | _        |dk    rt          d| d	          t                      | _        d S )
N   F.gate_up_proj)biasr~   r   z
.down_proj)r   r~   r   r   rH   zUnsupported activation: z!. Only silu is supported for now.)	rM   rN   r   gate_up_projr   	down_proj
ValueErrorr   act_fn)rm   rQ   rl   rj   r~   r   r   rp   s          rq   rN   zFlashMLP.__init__   s     	6!#%+++
 
 
 +%)(((
 
 
 X:XXX   !llrr   xc                     |                                 dk    r|S |                     |          \  }}|                     |          }|                     |          \  }}|S )Nr   )numelr   r   r   )rm   r   gate_up_s       rq   forwardzFlashMLP.forward   sZ    7799>>H&&q))
KK  ~~a  1rr   )NTr}   )rs   rt   ru   rv   intstrr   boolrN   torchTensorr   ry   rz   s   @rq   r|   r|      s         37## ## # 	#
 )4/# # # 
# # # # # #> %,        rr   r|   c                   <     e Zd Zdej        dfdef fdZd Z xZS )LongcatRouterr   r}   r   c           	      t   t                                                       t          |d          r|j        n|j        d         | _        | j        |z   | _        t          |j        | j        |j        |d | d          | _        t          j
        t          j        | j        |                    | _        d S )Nn_routed_expertsr   z.classifier)r   r?   r~   r   )r>   )rM   rN   rk   r   num_expertsr   rQ   rB   
classifierr   	Parameterr   zerose_score_correction_bias)rm   configrg   rounter_params_dtyper   rp   s        rq   rN   zLongcatRouter.__init__   s     	 v122'F###A& 	
 !% 5 G*!#-)))
 
 
 (*|K.7KLLL(
 (
$$$rr   c                 6    |                      |          \  }}|S N)r   )rm   hidden_stateslogitsr   s       rq   r   zLongcatRouter.forward  s    OOM22	rr   )	rs   rt   ru   r   r6   r   rN   r   ry   rz   s   @rq   r   r      sh         "^
 

 
 
 
 
 
 
4      rr   r   c                        e Zd Z	 	 	 	 ddededededed	ej        dz  d
edz  dede	f fdZ
dej        dej        fdZ xZS )
LongcatMoeNr}   Fr   r   top_krQ   rl   r?   r~   r   enable_eplbc
                 |   t                                                       || _        || _        |j        dk    rt
          j        | _        t          ||j        | j        | d          | _	        |j        J |j
        J t          |j        |j
        | j	        ||||d|d|| d|	|j                  | _        d S )Nr7   z.gate)r   rg   r   r   TFz.experts)rg   rh   routerr   r   rQ   rl   r   r?   renormalizer~   r   r   ri   )rM   rN   rQ   r   r@   r   r7   r   rg   r   rh   r   ri   experts)rm   r   r   r   rQ   rl   r?   r~   r   r   rp   s             rq   rN   zLongcatMoe.__init__  s     	&$0!)++(-D%#"2!%!:###	
 
 
 %111&222)"2#4;##/%%&&&#"(">
 
 
rr   r   r   c                    |j         \  }}|                    d|          }| j        j        }||k     r.t          j        j                            |d||z
  fdd          }n|}|                     |	                    | j
                            }|                     ||          }||k    r|dd |f         }|                    ||          S )Nr   constantr5   )modevalue)r   router_logits.)shapeviewr   rQ   r   r   
functionalpadr   tor   )rm   r   
num_tokens
hidden_dimpadded_hiddenhidden_states_paddedrouter_logits_fullfinal_hidden_statess           rq   r   zLongcatMoe.forward:  s    !.!4
J%**2z:: 0%%#(8#6#:#:MJ./	 $; $ $   $1 ![[ ##D$=>>
 
 #ll., + 
 
 J&&"5c;J;6F"G"''
J???rr   NNr}   F)rs   rt   ru   r%   r   r   r>   r   r   r   rN   r   r   ry   rz   s   @rq   r   r     s         ,026!+
 +
+
 +
 	+

 +
 +
 kD(+
 )4/+
 +
 +
 +
 +
 +
 +
 +
Z @U\  @el  @  @  @  @  @  @  @  @rr   r   c                        e Zd ZdZ	 	 	 	 ddedededz  dedz  d	ed
e	ddf fdZ
dej        dej        dej        dz  deej        ej        f         fdZ xZS )FlashDecoderLayerz:Flash decoder layer with dual attention and MLP structure.Nr}   Fvllm_configr   cache_configr~   r   r   r   c           
      d    t                                                       t                              d          d                    _        j         _        t          dd          t          j         fdt          d          D                        _
        t          j        fdt          d          D                        _        t          j        fd	t          d          D                        _        t          j         fd
t          d          D                        _        t          t          d          rj        nj         j                 t          d          rj        nj        j        j         d           _        d S )N.)sepr   rP   r*   c                     g | ]m}t          j        j        j        j        j        t          d           rj        ndj        dt          dg           v rdn d|           nS )rV   N	self_attndisable_quant_modulez.self_attn.)r   r   rQ   	num_headsrY   rW   rX   rV   rU   rP   r   r~   r   )
r   rQ   rS   rY   rW   rX   rk   rV   rU   getattr)	.0ir   r   rP   r   r~   rm   r   s	     rq   
<listcomp>z.FlashDecoderLayer.__init__.<locals>.<listcomp>p  s       ( ' ' +! $ 0$8%+%<%+%<%0.5fm.L.LV**RV!'!4,C!-"gf6Lb&Q&QQQ "&%$4444#    rr   r   c                 F    g | ]}t          j        j                   S epsr   rQ   r^   r   r   r   s     rq   r   z.FlashDecoderLayer.__init__.<locals>.<listcomp>  ,    TTTaWV'V-@AAATTTrr   c                 F    g | ]}t          j        j                   S r   r   r   s     rq   r   z.FlashDecoderLayer.__init__.<locals>.<listcomp>  r   rr   c                     g | ]?}t          j        j        j        d t	          dg           v rdn d|           @S )mlpsr   Nz.mlps.)rQ   rl   rj   r~   r   )r|   rQ   rl   rj   r   )r   r   r   r   r~   rm   s     rq   r   z.FlashDecoderLayer.__init__.<locals>.<listcomp>  s          $ 0&,&>%01G!L!LLL "&%$//A//    rr   r   moe_topkz.mlp)r   r   r   rQ   rl   r~   r   )rM   rN   r   split	layer_idxrQ   r   r   
ModuleListranger   input_layernormpost_attention_layernormr   r   rk   r   r   r   rZ   rJ   mlp)	rm   r   r   r   r~   r   r   rP   rp   s	   `````` @rq   rN   zFlashDecoderLayer.__init__`  s    	V\\c\222677!-")&2KT"R"R          ( q)  
 
0  "}TTTT5QR88TTT 
  
 )+TTTT5QR88TTT)
 )
%
 M       q  
 
	 v1224//#DN3vz**,&//+*$:%OOO
 
 
rr   	positionsr   residualc                 (   ||} | j         d         |          }n | j         d         ||          \  }} | j        d         ||d           } | j        d         ||          \  }}|                                }|                     |          } | j        d         |          } | j         d         ||          \  }} | j        d         ||d           } | j        d         ||          \  }} | j        d         |          }||z   }||fS )Nr   )r   r   llama_4_scalingr   )r   r   r   cloner   r   )rm   r   r   r   hidden_states_copymoe_hidden_statess         rq   r   zFlashDecoderLayer.forward  si    $H3D03MBBMM&=d&:1&=mX&V&V#M8)q)' 
 
 
 #C$"?"B8#
 #
x
 +0022 HH%788 %	!]33"9$"6q"9-"R"Rx *q)' 
 
 

 #C$"?"B8#
 #
x
 %	!]33%(99h&&rr   r   )rs   rt   ru   rv   r
   r%   r	   r   r   r   rN   r   r   tupler   ry   rz   s   @rq   r   r   ]  s       DD ,026!J
 J
J
 J
 "D(	J

 )4/J
 J
 J
 
J
 J
 J
 J
 J
 J
X.'<.' |.' ,%	.'
 
u|U\)	*.' .' .' .' .' .' .' .'rr   r   c                        e Zd ZdZdddedef fdZdej        dej        fd	Z		 	 ddej        dej        de
d
z  dej        d
z  dej        e
z  f
dZ xZS )
FlashModelzFlash model.r}   r   r   r   c                   t                                                       t          d	i j        j        j        j        j        | _        t          dd           | _
        j        | _        t                      j        r0t          j        j        t!          |d                    | _        nt%                      | _        t'          j        fd| d          \  | _        | _        | _        t                      j        r!t3          j        j                  | _        nt%                      | _        t9          ddgj                  | _        d S )
Nr:   embed_tokensr   c                 ,    t          |           S )N)r   r~   r   )r   )r   r   r   r~   r   s    rq   <lambda>z%FlashModel.__init__.<locals>.<lambda>  s&    ,))   rr   z.layersr   r   r   rL   )rM   rN   r%   model_config	hf_config__dict__r   r~   r   r   padding_idxrO   r   is_first_rankr   rQ   r#   r   r   r"   rR   start_layer	end_layerlayersis_last_rankr   r^   normr!   make_empty_intermediate_tensors)rm   r   r   r   r   r~   rp   s    ` @@@rq   rN   zFlashModel.__init__  sx   KK{7AJKK"/"/"6>4@@ +>>' 	1 6!"#FN;;! ! !D !/ 0 0D8C$       %%%
9
 
9
 
9
5$.$+ >>& 	) 28KLLLDII&((DI/Vj)6+=0
 0
,,,rr   	input_idsr   c                 ,    |                      |          S r   )r   rm   r   s     rq   embed_input_idszFlashModel.embed_input_ids  s      +++rr   Nr   intermediate_tensorsinputs_embedsc                 p   t                      j        r||}n|                     |          }d }n|J |d         }|d         }t          | j        | j        | j                  D ]} ||||          \  }}t                      j        st          ||d          S | 	                    ||          \  }}|S )Nr   r   )r   r   )
r   r   r   r   r   r   r   r   r   r   )	rm   r   r   r   r   r   r   layerr   s	            rq   r   zFlashModel.forward	  s     >>' 		8( - $ 4 4Y ? ?HH'3330AM+J7HDK)94>JJ 	 	E&+e' '#M88 ~~* 	&"/XFF    99]H==qrr   NN)rs   rt   ru   rv   r
   r   rN   r   r   r   r   r   ry   rz   s   @rq   r   r     s        AC #
 #
 #
z #
3 #
 #
 #
 #
 #
 #
J, ,%, , , , , <@-1 < < 2D8	
 |d* 
+	+       rr   r   c                   \    e Zd ZdZg dddgdZddded	ef fd
Zdej	        dej	        fdZ
	 	 ddej	        dej	        dedz  dej	        dz  dej	        ez  f
dZdej	        dej	        dz  fdZdeeeeeef                  fdZdeeeej	        f                  dee         fdZ xZS )LongcatFlashForCausalLMz)Flash model for causal language modeling.)q_projk_projv_proj	gate_projup_proj)qkv_projr   r}   r   r   r   c          	      8   t                                                       t          di |j        j        j        }|j        }|| _        t          |d          r|j	        n|j
        |_
        || _        t          |t          |d                    | _        t                      j        r1t!          |j        |j        |t          |d                    | _        nt)                      | _        t+          |j                  | _        | j        j        | _        d S )NrI   model)r   r   lm_head)r~   r   rL   )rM   rN   r%   r   r   r   r~   r   rk   rI   rl   r   r#   r  r   r   r   rO   rQ   r  r   r   logits_processorr   )rm   r   r   r   r~   rp   s        rq   rN   z LongcatFlashForCausalLM.__init__:  s   KK{7AJKK"/ v011*F"") 	  )#L,I,I
 
 

 >>& 	,)!")#FI66	  DLL *++DL /0A B BJ6 	,,,rr   r   r   c                 6    | j                             |          S r   )r  r   r   s     rq   r   z'LongcatFlashForCausalLM.embed_input_ids[  s    z)))444rr   Nr   r   r   c                 6    |                      ||||          }|S r   )r  )rm   r   r   r   r   r   s         rq   r   zLongcatFlashForCausalLM.forward^  s)     

y"6
 
 rr   r   c                 <    |                      | j        |          }|S r   )r  r  )rm   r   r   s      rq   compute_logitsz&LongcatFlashForCausalLM.compute_logitsj  s      &&t|]CCrr   c           
          t          j        | dddt          | j        d          r| j        j        n| j        j        d                   S )Nr  r   r  r   r   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   )r   make_expert_params_mappingrk   r   r   r   )rm   s    rq   get_expert_mappingz*LongcatFlashForCausalLM.get_expert_mappingq  sV     2 + +'t{$677,44(+
 
 
 	
rr   weightsc           
      N	   g d}|                                  }t                      }t          |                                           }|D ]\  }}d|v r|D ]\  }}	}
|	|vrd|v rd|vr|                    |	|          }|                    d          s|                    d          r||vrYd|v r^t          ||           ro||         }|j        } ||||
            nBd}|D ]}|\  }}	}}
|	|vrd	}|                    |	|          }d|v r+|                    d          s|                    d          r||vrZt          ||           rk||         }|j        }t          j	        t          d
t          f         |j                  } |||||
|d	          }|r|} n|ra|                    d          r||vr||                    d          r||vrd|v r|t          ||           r||         }t          |dt                    } |||           |                    |           t          | j        j                  D ]L}t          d          D ]8}t%          | j        j        |         t*                    r)| j        j        |         j        |         }t/          | j        d          r|j        j        j        t8          j        t8          j        fv rq| j        j        }|bt/          |j        d          sJ t9          j                     }tC          |j        j        |j        j"        |          #                    |          }n|j        j        }|$                    dd|j%        |j&        z   f          '                    |j%        |j&        gd          \  }}|(                    dd          )                                (                    dd          |_*        |)                                (                    dd          |_+        | j        j,        r4|j-        j        xj.        | j        j/        | j        j0        z  dz  z  c_.        | j        j1        r4|j2        j        xj.        | j        j/        | j        j3        z  dz  z  c_.        :N|S )N))fused_qkv_a_projq_a_projr   )r  kv_a_proj_with_mqar   )r   z
.gate_projr   )r   z.up_projr   zrotary_emb.inv_freqr   r   z.bias_biasz.mtp.FT.)shard_id	expert_idreturn_successz	.kv_scaleweight_loaderr   weight_block_sizeweight_scale_invr   r   r   )dimg      ?)4r  setdictnamed_parametersreplaceendswithr    r!  typingcastr   r   r   r   addr   r   rR   
isinstancer  r   r   r   rk   r~   	kv_b_projweightr>   r   float8_e4m3fnfloat8_e4m3fnuzr"  get_default_dtyper   r#  r   	unflattenrY   rX   r   	transpose
contiguousw_kcw_vcre   q_a_layernormdatarQ   rV   rf   kv_a_layernormrU   )rm   r  stacked_params_mappingexpert_params_mappingloaded_paramsparams_dictnameloaded_weight
param_nameweight_namer  paramr!  is_expert_weightmappingr  name_mappedsuccesslayer_idr   r   r"  r>   wr6  r7  s                             rq   load_weightsz$LongcatFlashForCausalLM.load_weights~  s   "
 "
 "
 !% 7 7 9 9"%%%4002233#* P	$ P	$D-$,,5K L8 L81
Kd**D==V4%7%7||K<< MM'**.2mmG.D.D+--d??*466 #D) % 3e]H===#( 4 68 68GCJ@JY"$.. '+$"&,,{J"G"GK+-- #,,W55!9D9M9Mg9V9V!k11 .tT:: ! '4E$)$7M$*K d+U-@% %M ,m%#!)"+'+  G  * ( ! !}}W-- !$k2I2I }}[11 !d+6M6M $ | .tT:: ! '-E$+0E% %M "M%777d####dk;<< #	 #	H1XX " "dj/9>JJ  J-h7A!D	%':  3)06');   )-(9(K%(4&y':<NOOOOO % 7 9 9)%/6%/@-  "U))	  "+2A[[I69MMN %3Y5IJPQ%RR d "&1!5!5!@!@!B!B!L!LQPQ!R!R	!%!2!2!<!<Q!B!B	;/ +277/$+2II< 77 ;0 ,388/$+2JJ= 88A"F rr   r  )rs   rt   ru   rv   packed_modules_mappingr
   r   rN   r   r   r   r   r   r  listr   r   r  r   r%  rJ  ry   rz   s   @rq   r  r  +  s       33
 
 
 

 
 BD 
 
 
z 
3 
 
 
 
 
 
B5 5%, 5 5 5 5 <@-1
 
<
 <
 2D8	

 |d*
 
+	+
 
 
 
| 
	   
DsCc/A)B$C 
 
 
 
AHU33D-E$F A3s8 A A A A A A A Arr   r  )Arv   r*  collections.abcr   r   	itertoolsr   r   r   transformersr   vllm.compilation.decoratorsr   vllm.configr	   r
   vllm.distributedr   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.fused_moer   r   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   8vllm.model_executor.layers.quantization.utils.int8_utilsr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   &vllm.model_executor.models.deepseek_v2r   vllm.sequencer   
interfacesr   r   utilsr   r    r!   r"   r#   rs   loggerr%   Moduler|   r   r   r   r   r  rL   rr   rq   <module>rc     s  D F E  . . . . . . . .              ) ) ) ) ) ) = = = = = = / / / / / / / / ) ) ) ) ) ) # # # # # # < < < < < < M M M M M M M M 8 8 8 8 8 8         
 H G G G G G F F F F F F R R R R R R        P O O O O O I I I I I I - - - - - - 0 0 0 0 0 0 0 0              
X		n@ n@ n@ n@ n@" n@ n@ n@b) ) ) ) )ry ) ) )X    BI   @N@ N@ N@ N@ N@ N@ N@ N@b}' }' }' }' }'	 }' }' }'@ J J J J J J J JZT T T T Tbiz T T T T Trr   