
    .`i0              	       ^   d Z ddlZddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZmZmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z<m=Z= ddl>m?Z?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZH ddlImJZJ ddlKmLZLmMZM d d!lNmOZOmPZPmQZQmRZR d d"lSmTZTmUZUmVZVmWZWmXZX  eeY          ZZ G d# d$ej[                  Z\ G d% d&ej[                  Z] G d' d(ej[                  Z^dHd)e_d*e_d+e_fd,Z`d-ead.e_d/ejb        d+ejb        fd0Zc G d1 d2ej[                  Zd G d3 d4ej        j[        e"          Ze G d5 d6ej[                  Zf G d7 d8ej[                  Zg G d9 d:ej[                  Zhe G d; d<ej[                              Zi G d= d>eO          Zj G d? d@ej[        eRejeQeP          Zk G dA dBek          Zl G dC dDek          ZmdEe
ez  dFend+eadz  fdGZodS )Iz+Inference-only DeepseekV2/DeepseekV3 model.    N)CallableIterable)islice)nn)DeepseekV2ConfigDeepseekV3Config)rocm_aiter_ops)	Attention)support_torch_compile)CacheConfigParallelConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)init_logger)
SiluAndMul)AttentionLayerBase)SharedFusedMoE)	LayerNormRMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)
MLAModulesMultiHeadLatentAttentionWrapper)QuantizationConfig)per_token_group_quant_fp8)get_rope)SparseAttnIndexer)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)sequence_parallel_chunk)current_platform)IntermediateTensors)AttentionBackendDeepseekV32IndexerBackend)KVCacheSpecMLAAttentionSpec   )MixtureOfExpertsSupportsEagleSupportsLoRA
SupportsPP)PPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                        e Zd ZdZ	 	 	 	 ddedeez  deded	ed
edz  de	dz  de
ddf fdZdej        dej        dej        fdZ xZS )DeepseekAttentionz.Normal MHA implementation used by Deepseek v1.    N vllm_configconfighidden_size	num_headsmax_position_embeddingscache_configquant_configprefixreturnc	           
      D   t                                                       || _        t                      }
|| _        | j        |
z  dk    sJ | j        |
z  | _        |j        | _        | j        |
k    r| j        |
z  dk    sJ n|
| j        z  dk    sJ t          d| j        |
z            | _	        || j        z  | _
        | j        | j
        z  | _        | j	        | j
        z  | _        | j
        dz  | _        || _        t          || j
        | j        | j        d|          | _        t#          | j        | j
        z  |d|          | _        t'          | j
        ||j                  | _        t-          | j        | j
        | j        | j	        ||| d          | _        d S )	Nr   r3         F)biasrG   )max_positionrope_parameters.attnnum_kv_headsrF   rG   rH   )super__init__rC   r   total_num_headsrD   num_key_value_headstotal_num_kv_headsmaxrQ   head_dimq_sizekv_sizescalingrE   r   qkv_projr   o_projr%   rN   
rotary_embr
   attn)selfrA   rB   rC   rD   rE   rF   rG   rH   kwargstp_size	__class__s              z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/deepseek_v2.pyrS   zDeepseekAttention.__init__b   s    	&688(#g-2222-8"("<"g-- *W499999 T4499994#:g#EFF#t';;nt}4(4=8}d*'>$)M #%
 
 
 ( 4=0%	
 
 
 #M0"2
 
 

 NML*%%###
 
 
			    	positionshidden_statesc                 "   |                      |          \  }}|                    | j        | j        | j        gd          \  }}}|                     |||          \  }}|                     |||          }|                     |          \  }	}|	S )Ndim)r\   splitrY   rZ   r^   r_   r]   )
r`   rf   rg   qkv_qkvattn_outputoutputs
             rd   forwardzDeepseekAttention.forward   s    
 }--Q))T[$,E2)NN1ay!Q//1ii1a((KK,,	re   )r?   NNr@   )__name__
__module____qualname____doc__r   r   r   intr   r#   strrS   torchTensorrt   __classcell__rc   s   @rd   r>   r>   _   s        88 (,+/26?
 ?
?
 !#33?
 	?

 ?
 "%?
 "D(?
 )4/?
 ?
 
?
 ?
 ?
 ?
 ?
 ?
B
<
 |
 
	
 
 
 
 
 
 
 
re   r>   c                   R     e Zd Z	 	 	 	 ddededededz  d	ed
eddf fdZd Z xZ	S )DeepseekV2MLPNTFr@   rC   intermediate_size
hidden_actrG   reduce_resultsrH   rI   c           
         t                                                       t          ||gdz  d||| d          | _        t	          ||d|||| d          | _        |dk    rt          d| d	          t                      | _        d S )
N   Fz.gate_up_proj)rL   rG   
disable_tprH   z
.down_proj)rL   rG   r   r   rH   siluUnsupported activation: !. Only silu is supported for now.)	rR   rS   r   gate_up_projr   	down_proj
ValueErrorr   act_fn)	r`   rC   r   r   rG   r   is_sequence_parallelrH   rc   s	           rd   rS   zDeepseekV2MLP.__init__   s     	 7!#%++++
 
 
 +%)+(((
 
 
 X:XXX   !llre   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r   r   r   )r`   xgate_uprn   s       rd   rt   zDeepseekV2MLP.forward   sD    &&q))
KK  ~~a  1re   )NTFr@   )
ru   rv   rw   ry   rz   r#   boolrS   rt   r}   r~   s   @rd   r   r      s         37#"%# %#%# %# 	%#
 )4/%# %# %# 
%# %# %# %# %# %#N      re   r   c            	       f     e Zd Z	 	 ddeez  dededz  def fdZde	j
        d	e	j
        fd
Z xZS )DeepseekV2MoENr@   rB   parallel_configrG   rH   c           
         t                                                       t                      | _        t	                      | _        t          |dd          | _        t                      j	        | _
        t                      j        | _        | j
                                        | _        |j        | _        |j        | _        |j        | _        |j        dk    rt)          d|j         d          t+          |j        |j        dd | d          | _        t          |d	d           d
k    rBt1          j        t5          j        |j        t4          j                            | j        _        nd | j        _        |j        }|j        | _        |j         | _!        | j        | _"        | j"        | j!        z   | _#        | j#        | j        z  | _$        | j        | j$        z  | _%        | j%        | j$        z   | _&        tO          j(                    | _)        tO          j*                    | _*        |j        | j*        rd | _+        n<|j,        |j        z  }t[          |j        ||j        || j        d| d          | _+        t]          d&i d| j+        d| j        d|j        d|j/        d|j        d|j,        ddd|j0        d|dddt          |dd          dt          |dd          d| ddt          |dd           d| j)        sdn| j        d!| j        j        d"| j        d#| j!        d$| j        d%| j*        r|j        nd | _1        d S )'Nrouted_scaling_factor      ?r   r   r   Fz.gaterL   rG   rH   topk_methodnoaux_tc)dtypez.shared_experts)rC   r   r   rG   r   r   rH   shared_expertsgatenum_expertstop_krC   r   r   renormalizerG   use_grouped_topkTnum_expert_groupn_groupr3   
topk_grouprH   z.expertsscoring_funcsoftmaxe_score_correction_biasenable_eplbnum_redundant_expertsr   n_shared_experts )2rR   rS   r   rb   r   tp_rankgetattrr   r   device_groupep_grouprank_in_groupep_ranksizeep_sizen_routed_expertsr   use_sequence_parallel_moer   r   r   r   rC   r   r   	Parameterr{   emptyfloat32r   eplb_configr   r   n_redundant_expertsn_logical_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr	   is_fused_moe_enabledis_rocm_aiter_moe_enabled$is_fusion_moe_shared_experts_enabledr   moe_intermediate_sizer   r   num_experts_per_toknorm_topk_probexperts)r`   rB   r   rG   rH   r   r   rc   s          rd   rS   zDeepseekV2MoE.__init__   s    	;==577%,V5Lc%R%R"$3#~~3}))++%+%<%+%<$3$M!&&26+< 2 2 2  
 %####
 
 
	 6=$//:==02F35=III1 1DI-- 15DI- &1*6#.#D !%!6"&"84;S"S(,(?4<(O%%)\D4Q%Q"&)FF 	  *8)L)N)N&?AA 	1 "*d.W*"&D & <v?V V"/"."3!,)%)%>$ 111# # #D & 
 
 
..

 //
 ,,	

 **
 %::
 !5
 --
 &
 "T
 %VY:::
 v|Q777
 &&&&
 !CCC
$ 1#,##+'
( %)I$E$E)
* ((+
, #'":":-
. "&!:!:/
2 8V445
re   rg   rI   c                    |j         \  }}|                    d|          }| j        rt          |          }| j        j        r|                     ||          }n/|                     |          \  }}|                     ||          }|\  }}| j        |J |j        t          j
        k    r| j        s
|| j        z  }n| j        |J |d| j        z  z  }| j        	|J ||z  }| j        rt          |d          }|d |         }n%| j        dk    r| j                            |          }|                    ||          S )Nri   )rg   router_logitsr   r   r3   )shapeviewr   r+   r   is_internal_routerr   r   r   r{   float16r   r   r   rb   &maybe_all_reduce_tensor_model_parallel)	r`   rg   
num_tokens
hidden_dimfused_moe_outr   rn   shared_outputfinal_hidden_statess	            rd   rt   zDeepseekV2MoE.forwardG  s   !.!4
J%**2z:: $ 	C3MBBM<* 
	 LL+= )  MM
  $yy77M1 LL+= )  M .;**& ((( %-//1 B#t'AA# , ,,,S4#===M* ,,,=0$ 	"B#Q# # #6kzk"B\A"&,"U"U## # #''
J???re   )Nr@   )ru   rv   rw   r   r   r   r#   rz   rS   r{   r|   rt   r}   r~   s   @rd   r   r      s        
 37e
 e
 #33e
 (e
 )4/	e

 e
 e
 e
 e
 e
 e
N2@U\ 2@el 2@ 2@ 2@ 2@ 2@ 2@ 2@ 2@re   r   scalemscalerI   c                 V    dd l }| dk    rdS d|z  |                    |           z  dz   S )Nr   r3   r   g?)mathlog)r   r   r   s      rd   yarn_get_mscaler   |  s7    KKKzzs<$((5//)C//re    original_max_position_embeddingsscaling_betarf   c           	      v    d|t          j        dt          j        || z            z             z  z   }|d         S )Nr3   ).NN)r{   r   floor)r   r   rf   r[   s       rd   _get_llama_4_scalingr     sH     ,	EK	$DDEEE" "  G ?##re   c                        e Zd Z	 	 	 	 	 ddedeez  dededed	ed
edededededz  dedz  de	j
        dz  deddf fdZde	j
        de	j
        de	j
        dz  de	j
        fdZ xZS )DeepseekV2Attentionr?   Nr@   rA   rB   rC   rD   qk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rankrE   rF   rG   topk_indices_bufferrH   rI   c           
         t                                                       || _        || _        || _        ||z   | _        || _        || _        |	| _        || _	        t                      }||z  dk    sJ ||z  | _        | j        dz  | _        |
| _        |
J d            | j        pt          | j        | j        d|| d          | _        t!          | j        |j                  | _        t'          || j	        | j        z  d|| d          | _        n.t'          | j        | j	        | j        z  d|| d	          | _        t          | j        | j        | j        z   d|| d
          | _        t!          | j        |j                  | _        t'          | j        | j	        | j        | j        z   z  d|| d          | _        t3          | j	        | j        z  | j        d|| d          | _        |j        d         dk    r'|j                            dd          rdnd|j        d<   t;          ||
|j        d          | _        |j        d         dk    rh|j        d         dk    rW|j                            dd          }|j        d         }t?          |tA          |                    }| j        |z  |z  | _        tC          | j        | j        | j        | j        ||| d          | _"        d S )Nr   rK   zDtopk_indices_buffer is not         supported for DeepseekV2AttentionFz	.q_a_projr   eps	.q_b_proj.q_proj.kv_a_proj_with_mqa
.kv_b_proj.o_proj	rope_typedefaultapply_yarn_scalingTdeepseek_yarndeepseek_llama_scalingrM   rN   is_neox_stylemscale_all_dimfactorrO   rP   )#rR   rS   rC   r   r   qk_head_dimr   r   r   rD   r   num_local_headsr[   rE   r   q_a_projr   rms_norm_epsq_a_layernormr   q_b_projq_projkv_a_proj_with_mqakv_a_layernorm	kv_b_projr   r]   rN   getr%   r^   r   floatr
   r_   )r`   rA   rB   rC   rD   r   r   r   r   r   rE   rF   rG   r   rH   rb   r   scaling_factorr   rc   s                      rd   rS   zDeepseekV2Attention.__init__  s   " 	& 0 0+.>>$&("6887"a''''(G3'-'>$"**+ +**
 ',  ) +++  DM "))9v?R!S!S!SD0!11) +++  DMM / !11) )))  DK #3 55%111#
 #
 #
 &d&7V=PQQQ-Nd3doEF%(((
 
 
 (NT_,%%%%
 
 
 !+.);; )--.BDII.- ";/ #0"2	
 
 
 ";/9<<&{3FF#3778H%PPN#3H=N$^U>5J5JKKF<&069DL L-%%###
 
 
			re   rf   rg   llama_4_scalingc                    | j         k|                     |          d         }|                     |          }|                     |          d                             d| j        | j                  }n:|                     |          d                             d| j        | j                  }|                    | j	        | j
        gd          \  }}|                     |          d         }|                    | j        | j
        gd          \  }}	|                    d          }|                     |          }|                     |          d         }
|
                    d| j        | j	        | j        z             }
|
                    | j	        | j        gd          \  }}|d d d d | j        d f         }|                     |||          \  }}||d| j	        d f<   t%          j        |          }||dd | j	        f<   ||d| j	        d f<   |||z  }t$          j        j                            |d| j        | j        z
  gd                              d| j        | j        z            }|                     |||          }|                    d| j        | j                  dd | j        f                             d| j        | j        z            }|                     |          \  }}	|S )Nr   ri   rj   r3   .)value)r   r   r   r   r   r   r   r   rl   r   r   r   r   	unsqueezer  r  r   r^   r{   
empty_liker   
functionalpadr_   reshaper]   )r`   rf   rg   r  ro   q_nopeq_pelatent_cachekv_arn   kvk_noperq   k_perp   rr   rs   s                    rd   rt   zDeepseekV2Attention.forward  s    'm,,Q/A""1%%Aa  #((T-A4CSTTAAM**1-22D($*: A ww 5t7LMSUwVV..}==a@$$d&79N%OUW$XXa#--a00""4((^^D!!!$WWR-t/Dt/VWWHHd3T_E2HNN	AAAqqq$"3"5"556__Yd;;
d*.#t$&&
&'Q*0#&&&
&'*.#t$&&
&' & A H##4#do56a $ 
 

$r4'$*::
;
; 	
 ii1a((!&&r4+?AQRR"4?""

'"d*T_<
=
= 	 KK,,	re   )r?   NNNr@   )ru   rv   rw   r   r   r   ry   r   r#   r{   r|   rz   rS   rt   r}   r~   s   @rd   r   r     s\        (,+/2637s
 s
s
 !#33s
 	s

 s
 s
 s
 s
 s
 s
 "%s
 "D(s
 )4/s
 #\D0s
 s
  
!s
 s
 s
 s
 s
 s
j,<, |, ,	,
 
, , , , , , , ,re   r   c                   \     e Zd Zdedej        dedef fdZde	de
fdZd	 Zdefd
Z xZS )DeepseekV32IndexerCacherX   r   rH   rF   c                 $   t                                                       t          j        g           g| _        || _        || _        || _        || _        t                      j
        }||j        v rt          d|           | |j        |<   d S )NzDuplicate layer name: )rR   rS   r{   tensorkv_cacherX   rH   rF   r   r   compilation_configstatic_forward_contextr   )r`   rX   r   rH   rF   r  rc   s         rd   rS   z DeepseekV32IndexerCache.__init__4  s     	b))* (
466I'>>>>f>>???<@1&999re   rA   rI   c                 P    t          | j        j        d| j        | j                  S )Nr3   )
block_sizerQ   	head_sizer   )r2   rF   r  rX   r   )r`   rA   s     rd   get_kv_cache_specz)DeepseekV32IndexerCache.get_kv_cache_specB  s/    (3m*	
 
 
 	
re   c                     d S r   r   r`   s    rd   rt   zDeepseekV32IndexerCache.forwardJ  s      re   c                     t           S r   r/   r!  s    rd   get_attn_backendz(DeepseekV32IndexerCache.get_attn_backendL  s    ((re   )ru   rv   rw   ry   r{   r   rz   r   rS   r   r1   r  rt   r.   r#  r}   r~   s   @rd   r  r  3  s        AA$)KA9<ALWA A A A A A
Z 
K 
 
 
 
 )"2 ) ) ) ) ) ) ) )re   r  c                        e Zd Z	 ddedeez  dedededz  dedz  d	e	j
        dz  d
ef fdZde	j
        de	j
        de	j
        fdZ xZS )Indexerr@   rA   rB   rC   r   rG   NrF   r   rH   c	           
         t                                                       || _        || _        |j        | _        |j        | _        |j        | _	        |j
        | _        || _        t          | j        | j	        | j        z  d|| d          | _        t          || j	        d|| d          | _        t!          | j	        d          | _        t          || j        dd | d          | _        | j	        dz  | _        d	| _        d
| _        || _        t/          | j	        | j	        | j        z  dz  z   t0          j        | d|          | _        |j        j        | _        || _        ddlm}	  |	|          | _         tC          | j        | j        | j        | j        | j	        | j        | j         | j                  | _"        d S )NFz.wq_br   z.wkgư>r   z.weights_projrK   ue8m0      z.k_cache)rX   r   rH   rF   r   )get_max_prefill_buffer_size)#rR   rS   rA   rB   
index_topktopk_tokensindex_n_headsn_headindex_head_dimrX   r   rope_dimr   r   wq_bwkr   k_normweights_projsoftmax_scale	scale_fmtquant_block_sizer   r  r{   uint8k_cachemodel_configmax_model_lenrH   &vllm.v1.attention.backends.mla.indexerr*  max_total_seq_lenr&   
indexer_op)r`   rA   rB   rC   r   rG   rF   r   rH   r*  rc   s             rd   rS   zIndexer.__init__Q  s    	&!,*-/&$MDK'%###
 
 
	 #M%>>>
 
 
  4888,K+++
 
 
 "]D0  ##6 
 /]T]d6K%Ka%OO+&&&%	
 
 
 )5CVVVVVV!<!<[!I!I+L!NM"$	
 	
re   rg   qrrI   c                    |                      |          \  }}|                    d| j        | j                  }t	          j        || j        | j        | j        z
  gd          \  }}|                     |          \  }	}|                     |	          }	t	          j        |	| j        | j        | j        z
  gd          \  }
} ||||
	                    d                    \  }}
|
                    d| j        | j                  }|

                    dd| j                  }
t	          j        ||gd          }t	          j        |
                    d          |gd          }	|                    d| j                  }t          || j        d| j        d u          \  }}|                    d| j        | j                  }|                    d| j        d          }|                     |          \  }}|	                    d          |z  | j        z  | j        dz  z  }|                    d          }|                     |||	|          S )Nri   rj   r3   F)column_major_scales	use_ue8m0rK   )r1  r   r.  rX   r{   rl   r0  r2  r3  r	  r  catsqueezer$   r7  r6  r4  r5  r>  )r`   rg   r?  rf   r^   ro   rn   r  r  rp   r  r  q_fp8q_scaleweightss                  rd   rt   zIndexer.forward  s7    yy}}1FF2t{DM22{t}t}<=2
 
 
f ww}%%1KKNN{t}t}<=2
 
 
f  Z	41B1BCC
d ||BT];;||B4=11 ItVn"---It||B''0b999 FF2t}%%2! %nD0	
 
 
w 

2t{DM::,,r4;22&&}55
b!!G+d.@@4;PTCTT 	 //"%%}eQ@@@re   )r@   )ru   rv   rw   r   r   r   ry   r#   r   r{   r|   rz   rS   rt   r}   r~   s   @rd   r%  r%  P  s         H
 H
H
 !#33H
 	H

 H
 )4/H
 "D(H
 #\D0H
 H
 H
 H
 H
 H
 H
T,A"\,A/4|,A	,A ,A ,A ,A ,A ,A ,A ,Are   r%  c                        e Zd ZdZ	 	 	 	 	 ddedeez  deded	ed
edededz  dedededz  de	dz  de
dej        dz  ddf fdZdej        dej        dej        dz  dej        fdZ xZS )DeepseekV2MLAAttentiona  
    Main reference: DeepseekV2 paper, and FlashInfer Implementation
    (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551).

        For more info see MLACommonImpl in:
        vllm/v1/attention/backends/mla/utils.py
    r?   Nr@   rA   rB   rC   rD   r   r   r   r   r   rE   rF   rG   rH   r   rI   c                    t                                                       || _        || _        || _        ||z   | _        || _        || _        |	| _        || _	        t                      }||z  dk    sJ ||z  | _        | j        dz  | _        |
| _        | j        7t          | j        | j        | j        | j        z   gd|| dd          | _        n.t!          | j        | j        | j        z   d|| d          | _        | j        Ot%          | j        |j        	          | _        t+          | j        | j	        | j        z  d|| d
          | _        n.t+          | j        | j	        | j        z  d|| d          | _        t%          | j        |j        	          | _        t+          | j        | j	        | j        | j        z   z  d|| d          | _        t5          | j	        | j        z  | j        d|| d          | _        |j        d         dk    r'|j                            dd          rdnd|j        d<   t=          ||
|j        d          | _        |j        d         dk    rh|j        d         dk    rW|j                            dd          }|j        d         }tA          |tC          |                    }| j        |z  |z  | _        tE          |d          | _#        | j#        r<t=          ||
|j        d          | _$        tK          |||||||| d          | _&        nd | _$        d | _&        tO          | j        | j        | j        | j        | j        | j        nd | j        | j        nd | j        | j        nd | j        | j        nd | j        | j        nd | j&        | j$        | j#        |          }tQ          | j        | j        | j        | j        | j        | j        | j        | j        ||||          | _)        d S )Nr   rK   Fz.fused_qkv_a_projT)rL   rG   rH   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r+  z.indexer)r  r  r^   r]   fused_qkv_a_projr   r   r   r   indexerindexer_rotary_emb	is_sparser   )*rR   rS   rC   r   r   r   r   r   r   rD   r   r   r[   rE   r   rL  r   r   r   r   r   r   r   r   r  r  r   r]   rN   r  r%   r^   r   r  hasattris_v32indexer_rope_embr%  rM  r!   r"   mla_attn)r`   rA   rB   rC   rD   r   r   r   r   r   rE   rF   rG   rH   r   rb   r   r  r   mla_modulesrc   s                       rd   rS   zDeepseekV2MLAAttention.__init__  s   " 	& 0 0+.>>$&("6887"a''''(G3'-'>$'$> !4#4t7L#LM) 333% % %D!! '7 !D$99) 555' ' 'D# '!()9v?R!S!S!SD0 !11) +++  DMM / !11) )))  DK &d&7V=PQQQ-Nd3doEF%(((
 
 
 (NT_,%%%%
 
 
 !+.);; )--.BDII.- ";/ #0"2	
 
 
 ";/9<<&{3FF#3778H%PPN#3H=N$^U>5J5JKKF<&069DLfl33; 	 $, 4 & 6"	% % %D! #####	 	DLL %)D!DL .n;+ "22'  $66040@0L$,,RV&*&6&BT]]"&"2":4;;L#4k 3#
 
 
( 8 L!!O
 
re   rf   rg   r  c                 0    |                      |||          S r   )rS  )r`   rf   rg   r  s       rd   rt   zDeepseekV2MLAAttention.forwardz  s     }}YGGGre   )r?   NNr@   N)ru   rv   rw   rx   r   r   r   ry   r   r#   rz   r{   r|   rS   rt   r}   r~   s   @rd   rJ  rJ    sv        & (,+/2637e
 e
e
 !#33e
 	e

 e
 e
 e
 e
 4Ze
 e
 "%e
 "D(e
 )4/e
 e
 #\D0e
  
!e
 e
 e
 e
 e
 e
NH<H |H ,	H
 
H H H H H H H Hre   rJ  c                        e Zd Z	 	 ddedededz  dej        dz  ddf
 fdZ	 ddej        d	ej        d
ej        dz  dej        dz  dej        f
dZ	 xZ
S )DeepseekV2DecoderLayerNrA   rH   rB   r   rI   c                 n   t                                                       ||j        j        }|j        }|j        }|j        }|j        }|j        | _        t          |dd          }	t          |dd          }
t          |
                    d          d                   }|| _        t          |dd	          }t          |d
d	          }t          |dd	          }t          |dd	          }|j        dk    pt          d ||fD                       }|| _        |rt          }n|j        rt"          }nt$          } |||| j        |j        |||t)          |d          r|j        nd ||	||| d|          | _        |j        0||j        k    r%||
z  d	k    rt3          |||| d          | _        n+t7          |j        |j        |j        || d          | _        t=          |j        |j                  | _         t=          |j        |j                  | _!        t          |dd          | _"        d S )NrE   r?   moe_layer_freqr3   .)sepri   r   r   r   r   r   deepseekc              3   "   K   | ]
}|d k    V  dS r   Nr   .0rk   s     rd   	<genexpr>z2DeepseekV2DecoderLayer.__init__.<locals>.<genexpr>  s7       9
 9
C1H9
 9
 9
 9
 9
 9
re   r   z
.self_attn)rA   rB   rC   rD   r   r   r   r   r   rE   rF   rG   rH   r   z.mlp)rB   r   rG   rH   )rC   r   r   rG   rH   r   r   r   )#rR   rS   r:  	hf_configrF   rG   r   rC   r   ry   rl   	layer_idx
model_typealluse_mhar>   use_mlarJ  r   num_attention_headsrP  r   	self_attnr   first_k_dense_replacer   mlpr   r   r   r   r   input_layernormpost_attention_layernormr   )r`   rA   rH   rB   r   r:  rF   rG   r   rE   rY  rc  r   r   r   r   rf  attn_clsrc   s                     rd   rS   zDeepseekV2DecoderLayer.__init__  s    	> -7F"/"/"/%5!-")&2KT"R"R )91== --b122	" #6+=qAA"6+=qAAV\155
v~q99#z1 
S 9
 9
!13C D9
 9
 9
 6
 6
  	+(HH! 	+-HH*H!#(0--!.5fm.L.LV**RV%$;%%((( 3
 
 
$ #/V999N*a//$ /) 	  DHH %"."(":!,)   DH  'v'9v?RSSS(/F$7)
 )
 )
% &-V5Lc%R%R"""re   rf   rg   residualr  c                 L   |*|                                 }|                     |          }n|                     ||          \  }}||d}| j        s||d<    | j        di |}t	          | j        t
                    s:|j        t          j        k    r%|d| j	        z  z  }| j
        dk    r|d| j	        z  z  }|                     ||          \  }}|                     |          }t	          | j        t                    r"|j        t          j        k    r|d| j	        z  z  }||fS )N)rf   rg   r  r   r   r   )clonerl  rf  ri  
isinstancer>   r   r{   r   r   rc  rm  rk  r   )r`   rf   rg   ro  r  attn_kwargss         rd   rt   zDeepseekV2DecoderLayer.forward  sW    $**,,H 00??MM&*&:&:=(&S&S#M8 #*
 
 | 	=-<K)*&5555 4>+<==	=#u}44
 S4#===M~"" C$"<<< #'"?"?x"X"Xx//dh.. 	>=3F%-3W3W S4#===Mh&&re   NNr   )ru   rv   rw   r   rz   r   r{   r|   rS   rt   r}   r~   s   @rd   rW  rW    s       
 +/37QS QSQS QS !4'	QS
 #\D0QS 
QS QS QS QS QS QSp 04/' /'</' |/' ,%	/'
 ,/' 
/' /' /' /' /' /' /' /'re   rW  c                        e Zd ZdZdddedef fdZdej        dej        fd	Z		 ddej        dej        de
d
z  dej        d
z  dej        e
z  f
dZ xZS )DeepseekV2ModelFr@   rH   rA   rH   c                (   t                                                       j        j        }j        }|| _        t          j        | _        |j	        | _	        t          |d          | _        | j        r9|j        }t          j        j        j        |t          j        | j                  nd t%                      j        r&t)          |j	        |j        || d          | _        nt/                      | _        t1          |j        fd| d          \  | _        | _        | _        t%                      j        r!t=          |j        |j                  | _         nt/                      | _         tC          d	d
g|j                  | _"        d S )Nr+  )r   devicez.embed_tokensrG   rH   c                 (    t          |           S )N)r   )rW  )rH   r   rA   s    rd   <lambda>z*DeepseekV2Model.__init__.<locals>.<lambda>-  s    1V9L   re   z.layersrw  r   rg   ro  )#rR   rS   r:  rb  rG   rB   r,   device_typery  
vocab_sizerP  rQ  r+  r{   r   scheduler_configmax_num_batched_tokensint32r   is_first_rankr(   rC   embed_tokensr8   r;   num_hidden_layersstart_layer	end_layerlayersis_last_rankr   r   normr:   make_empty_intermediate_tensors)r`   rA   rH   rB   rG   r,  r   rc   s    `    @rd   rS   zDeepseekV2Model.__init__  s   )3"/&2 +fl33; 		' +K"'+,Ck{	# # # #'>>' 	1 6!") ///	! ! !D !/ 0 0D8C$     %%%9
 9
 9
5$.$+ >>& 	) 28KLLLDII&((DI/Vj)6+=0
 0
,,,re   	input_idsrI   c                 ,    |                      |          S r   )r  r`   r  s     rd   embed_input_idszDeepseekV2Model.embed_input_ids;  s      +++re   Nrf   intermediate_tensorsinputs_embedsc                    t                      j        r||}n|                     |          }d }n|J |d         }|d         }t          | j        dd           }|t          |d         |d         |          }nd }t          | j        | j        | j	                  D ]}	 |	||||          \  }}t                      j
        st          ||d          S |                     ||          \  }}
|S )Nrg   ro  r  r   beta)r   r   rf   )rg   ro  )r   r  r  r   rB   r   r   r  r  r  r  r-   r  )r`   r  rf   r  r  rg   ro  llama_4_scaling_configr  layerrn   s              rd   rt   zDeepseekV2Model.forward>  s=    >>' 		8( - $ 4 4Y ? ?HH'3330AM+J7H ")6G!N!N!-21G62 4F;#  OO #ODK)94>JJ 	 	E&+e=(O' '#M88 ~~* 	&"/XFF    99]H==qre   r   )ru   rv   rw   fall_back_to_pt_during_loadr   rz   rS   r{   r|   r  r-   rt   r}   r~   s   @rd   rv  rv  	  s        "'AC ,
 ,
 ,
z ,
3 ,
 ,
 ,
 ,
 ,
 ,
\, ,%, , , , , .2+ +<+ <+ 2D8	+
 |d*+ 
+	++ + + + + + + +re   rv  c                   N    e Zd ZU ee         ed<   	 dedz  fdZdededdfdZdS )	DeepseekV2MixtureOfExpertsmoe_mlp_layersexample_moeNc                 B   |Td| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        t          	                    d           d S |j
        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        d S )Nr   z9DeepSeekV2: No DeepseekV2MoE layer found in model.layers.)num_moe_layersnum_expert_groupsnum_logical_expertsnum_physical_expertsnum_local_physical_expertsnum_routed_expertsnum_shared_expertsr   loggerwarningr   r   r   r   r   r   )r`   r  s     rd   extract_moe_parametersz1DeepseekV2MixtureOfExperts.extract_moe_parametersr  s    "#D%&D"'(D$()D%./D+&'D#&'D#)*D&NNVWWWWW'2'DD$(3(FD%.9.RD+&1&BD#&1&BD#)4)HD&&&re   r  r  rI   c                     | j         |k    sJ || _        || _         || j        z
  | _        | j        D ]5}||_        ||_        | j        |_        |j        	                                 6d S r   )
r  r  r  r   r  r   r   r   r   update_expert_map)r`   r  r  moes       rd    update_physical_experts_metadataz;DeepseekV2MixtureOfExperts.update_physical_experts_metadata  s    
 .2LLLLL$8!*D'%9D<T%T"& 	, 	,C+EC(%9C"&*&@C#K))++++		, 	,re   )	ru   rv   rw   listr   __annotations__r  ry   r  r   re   rd   r  r  l  s         ''''I-$2F I I I I&,!, %(, 
	, , , , , ,re   r  c                   \    e Zd ZdddgiZeZdddedef fdZd	 Z	d
e
j        de
j        fdZ	 	 dd
e
j        de
j        dedz  de
j        dz  de
j        ez  f
dZde
j        de
j        dz  fdZdeeeeeef                  fdZdeeee
j        f                  dee         fdZ xZS )DeepseekV2ForCausalLMr   	gate_projup_projr@   rw  rA   rH   c          	      f   t                                                       |j        j        }|j        }|| _        || _        t          |dd          }t          |dd          }|j        dk    pt          d ||fD                       | _	        | j	        rg d| j
        d<   t          |d          o|j        d u| _        | j        rd	d
g| j
        d<   |                     |t          |d                    | _        t#                      j        r1t'          |j        |j        |t          |d                    | _        nt/                      | _        t1          |j                  | _        | j        j        | _        | j        j        | j        j        z
  | _        |                                  d S )Nr   r   r   r\  c              3   "   K   | ]
}|d k    V  dS r^  r   r_  s     rd   ra  z1DeepseekV2ForCausalLM.__init__.<locals>.<genexpr>  s7       >
 >
C1H>
 >
 >
 >
 >
 >
re   )r   k_projv_projr\   r   r   r   rL  model)rA   rH   lm_headrz  )rR   rS   r:  rb  rG   rB   r   rd  re  rf  packed_modules_mappingrP  r   fuse_qkv_a_proj	model_clsr<   r  r   r  r'   r~  rC   r  r8   r    logits_processorr  r  rj  r  set_moe_parameters)r`   rA   rH   rB   rG   r   r   rc   s          rd   rS   zDeepseekV2ForCausalLM.__init__  s   )3"/("6+=qAA"6+=qAA(J6 
# >
 >
!13C D>
 >
 >
 ;
 ;
 < 	U6T6T6TD'
3 FM**Mv/A/M 	  	$?D'(:;
 ^^#L,I,I $ 
 

 >>& 	,)!")#FI66	  DLL *++DL /0A B BJ6 	,
 K)DK,MM 	 	!!!!!re   c                    g | _         t          | j        dd          | _        g | _        g | _        d }| j        j        D ]}t          |t                    rt          |t                    sJ t          |j        t                    rJ|j        }| j                            |j                   | j                            |j        j                   |                     |           d S )Nr   r3   )expert_weightsr   rB   r  
moe_layersr  r  r  rr  r8   rW  rk  r   appendr   r  )r`   r  r  s      rd   r  z(DeepseekV2ForCausalLM.set_moe_parameters  s     !(i!C!C Z& 		: 		:E%00 e%;<<<<<%)]33 :#i#**59555&&uy'8999##K00000re   r  rI   c                 6    | j                             |          S r   )r  r  r  s     rd   r  z%DeepseekV2ForCausalLM.embed_input_ids  s    z)))444re   Nrf   r  r  c                 6    |                      ||||          }|S r   )r  )r`   r  rf   r  r  rg   s         rd   rt   zDeepseekV2ForCausalLM.forward  s)     

y"6
 
 re   rg   c                 <    |                      | j        |          }|S r   )r  r  )r`   rg   logitss      rd   compute_logitsz$DeepseekV2ForCausalLM.compute_logits  s      &&t|]CCre   c                 J    t          j        | ddd| j        j        d          S )Nr  r   r  r   ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   r   )r   make_expert_params_mappingrB   r   r!  s    rd   get_expert_mappingz(DeepseekV2ForCausalLM.get_expert_mapping  s5     8 + +'4"#
 
 
 	
re   rH  c                    t          j                    }ddg}ddg}g d}| j        r|                    |           n|                    |           t	          j        | ddd| j        j        |r| j        j        nd	z   | j	        
          }t          |                                           }t                      }|D ]\  }	}
d|	v rt          | j        |	          }|#|od|	v }|D ]\  }}}||	vrd|	v r|	|vr|r|	                    ||          }|dk    r||vr8|}	|	                    d          r|	|vrTt!          |	|           re||	         }|j        } |||
|            nd}d}|rVt%          | j        dd          pd}d|	v r|
j        dk    rdnd	}|
j        |         }||z  d	k    sJ d| d|             ||z  }t+          |          D ]}|	}|
}|rrt-          ||z  |dz   |z            }|
j        dk    r	|
|         }n|d	k    r|
|d d f         }n|
d d |f         }|	                    dd| j        j        |z              }|D ]}|\  }}}}||vrd}|                    ||          }t!          ||           r7||         }t/          j        t2          dt4          f         |j                  } ||||||d          }|r|s|}	n|                    |            np|r|	                    d          r|	|vr7t9          |	|          }	|	Kt!          |	|           r]||	         }t%          |dt:                    } |||
           |s|                    |	           |S )N)r   r  r   )r   r  r3   )rL  r   r   )rL  r   r3   ))r\   r   ro   )r\   r  rp   )r\   r  rq   r  r   r  r   r  zrotary_emb.inv_freqzmlp.shared_expertszmlp.experts.rL  z.biasFr3   r   zdown_proj.weightzShared expert weight dim z not divisible by num_chunks T.)shard_id	expert_idreturn_successweight_loader)r	   r   rf  extendr   r  rB   r   r   r   dictnamed_parametersset#get_spec_layer_idx_from_weight_namereplaceendswithr9   r  r   ndimr   rangeslicetypingcastr   r   addr*   r)   )r`   rH  $rocm_aiter_moe_shared_expert_enabledstacked_params_mappingmla_params_mappingmha_params_mappingexpert_params_mappingparams_dictloaded_paramsnameloaded_weight
spec_layer"is_fusion_moe_shared_experts_layer
param_nameweight_namer  name_mappedparamr  is_expert_weight
num_chunks	split_dimtotal
chunk_sizej
chunk_nameweight_to_loadchunk_slicemappingr  successs                                  rd   load_weightsz"DeepseekV2ForCausalLM.load_weights  sF   ?AA 	-
 -*"
 09

 
 

 < 	>"))*<===="))*<=== !/ I + +'4 8,,	 #'"<!
 !
 !
 4002233"%%%#* a	( a	(D-$,,<T[$OOJ% 5W:NRV:V / 6L S< S<1
Kd** #d**K0G0G5 "ll;
CC "444!44&D==)) d+.E.E*466 #D) % 3e]H===#(  
5 5!(6H!!L!L!QPQJ /$66=;MPQ;Q;Q  
 */	:E :-222DE D D7AD D 322 "'*!4Jz** R< R<A!%J%2N9 &+A
NQUj<P&Q&Q(-22-:;-GNN&!^^-:;>-JNN-:111k>-JN &*\\0M4;+G!+KMM& &
 $9 << <<GND
KH&j88$ ,0( '1&8&8j&Q&Q2;EE %$ +K 8 )/$S$Y/1D) ) #0-!*'%-&/+/# # # # "#E ?'2 - 1 1+ > > >!E" , % %  ==11 %d+6M6M$  9{KK<$24>> %$ +D 1(/!?4I) ) &e];;;5 (!!$'''re   rt  )ru   rv   rw   r  rv  r  r   rz   rS   r  r{   r|   r  r-   rt   r  r  tuplery   r  r   r  r  r}   r~   s   @rd   r  r    s        	i0  IAC 1" 1" 1"z 1"3 1" 1" 1" 1" 1" 1"f1 1 1*5 5%, 5 5 5 5 <@-1
 
<
 <
 2D8	

 |d*
 
+	+
 
 
 
| 
	   

DsCc/A)B$C 

 

 

 

LHU33D-E$F L3s8 L L L L L L L Lre   r  c                       e Zd ZdS )DeepseekForCausalLMNru   rv   rw   r   re   rd   r  r            Dre   r  c                       e Zd ZdS )DeepseekV3ForCausalLMNr  r   re   rd   r  r    r  re   r  rB   r  c                     t          | d          rL| j        dk    rA| j        }t          | j                  D ]%}|                    d||z    d          r||z   c S &d S )Nnum_nextn_predict_layersr   zmodel.layers.rZ  )rP  r  r  r  
startswith)rB   r  rc  is       rd   r  r    s     	233%+a//,	v677 	% 	%A%%&Fi!m&F&F&FGG % 1}$$$%4re   )r3   r3   )prx   r  collections.abcr   r   	itertoolsr   r{   r   transformersr   r   vllm._aiter_opsr	   vllm.attention.layerr
   vllm.compilation.decoratorsr   vllm.configr   r   r   r   vllm.distributedr   r   r   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   /vllm.model_executor.layers.attention_layer_baser   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   r   !vllm.model_executor.layers.linearr   r   r   r   r   +vllm.model_executor.layers.logits_processorr    vllm.model_executor.layers.mlar!   r"   'vllm.model_executor.layers.quantizationr#   7vllm.model_executor.layers.quantization.utils.fp8_utilsr$   +vllm.model_executor.layers.rotary_embeddingr%   .vllm.model_executor.layers.sparse_attn_indexerr&   3vllm.model_executor.layers.vocab_parallel_embeddingr'   r(   -vllm.model_executor.model_loader.weight_utilsr)   r*    vllm.model_executor.models.utilsr+   vllm.platformsr,   vllm.sequencer-   vllm.v1.attention.backendr.   r<  r0   vllm.v1.kv_cache_interfacer1   r2   
interfacesr4   r5   r6   r7   utilsr8   r9   r:   r;   r<   ru   r  Moduler>   r   r   r  r   ry   r|   r   r   r  r%  rJ  rW  rv  r  r  r  r  rz   r  r   re   rd   <module>r      s  2 2 1  . . . . . . . .              ; ; ; ; ; ; ; ; * * * * * * * * * * * * = = = = = = X X X X X X X X X X X X              $ # # # # # < < < < < < N N N N N N ? ? ? ? ? ? C C C C C C C C              H G G G G G V V V V V V V V F F F F F F      A @ @ @ @ @ L L L L L L               E D D D D D + + + + + + - - - - - - 6 6 6 6 6 6      E D D D D D D D Q Q Q Q Q Q Q Q Q Q Q Q              
X		N N N N N	 N N Nb, , , , ,BI , , ,^Z@ Z@ Z@ Z@ Z@BI Z@ Z@ Z@z0 05 0e 0E 0 0 0 0$&)$9>$KP<$
\$ $ $ $b b b b b") b b bJ) ) ) ) )eho/A ) ) ):wA wA wA wA wAbi wA wA wAtvH vH vH vH vHRY vH vH vHrC' C' C' C' C'RY C' C' C'L _ _ _ _ _bi _ _ _D&, &, &, &, &,!1 &, &, &,R~ ~ ~ ~ ~Iz5|]~ ~ ~B
	 	 	 	 	/ 	 	 		 	 	 	 	1 	 	 	//>A4Z     re   