
    .`iS                     (   d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZmZmZmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z( ddl&m)Z) ddl*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZBmCZC ddlDmEZEmFZFmGZG ddlHmIZJ ddlKmLZL ddlMmNZN ddlOmPZP dd lQmRZR dd!lSmTZT dd"lUmVZVmWZW dd#lXmYZY dd$lZm[Z[ dd%l\m]Z] d&d'l^m_Z_m`Z`maZambZbmcZc d&d(ldmeZemfZfmgZgmhZhmiZimjZjmkZk  e el          Zmenejo        ejo        f         Zp G d) d*ejq                  Zr G d+ d,ejq        e2          Zs G d- d.ejq                  Zt G d/ d0ejq                  Zue G d1 d2ejq                              Zv G d3 d4ea          Zw G d5 d6ejq        e_ebecewe`          Zxd7ejo        d8ejo        d9ejo        d:ejo        d;eyd<dfd=Zzd7ejo        d8ejo        d9ejo        d:ejo        d;eyd<dfd>Z{ eYd?ezd:ge{@           eWj|        dAeVj}        dBeVj}        dCeVj}        dDeVj}        fdE            Z~	 	 dKdHejo        d9ejo        d8ejo        dIejo        dBedCed<enejo        ejo        f         fdJZdS )LzInference-only Qwen3Next model.    )Iterable)isliceN	rearrange)nn)ACT2FN)	Attention)support_torch_compile)CacheConfigModelConfigSpeculativeConfig
VllmConfigget_current_vllm_config)divideget_ep_groupget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)ForwardContextget_forward_context)init_logger)chunk_gated_delta_rule fused_recurrent_gated_delta_rule)SharedFusedMoE)GemmaRMSNorm)RMSNormGated)ColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)	MambaBase)mamba_v2_sharded_weight_loader)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)causal_conv1d_fncausal_conv1d_update)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_namesharded_weight_loader)Qwen2MoeMLP)sequence_parallel_chunk)set_weight_attrs)current_platform)IntermediateTensors)Qwen3NextConfig)tltriton)direct_register_custom_op)AttentionMetadata)GDNAttentionMetadata   )HasInnerStateIsHybridMixtureOfExpertsSupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayerextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                   N     e Zd Zddedef fdZdej        dej        fdZ xZ	S )	Qwen3NextSparseMoeBlock vllm_configprefixc                    t                                                       |j        j        }|j        }|j        }t                      | _        t                      j	        | _
        t                      j        | _        | j
                                        | _        |j        | _        |j        | _        | j        |j        k    r t'          d| j         d|j         d          t)                      }|j        j        }|j        | _        | j        | _        |j        | _        | j        | j        z   | _        | j        | j        z  | _        | j        | j        z  | _        | j        | j        z   | _        t=          |j        |j        d|| d          | _         t=          |j        ddd | d          | _!        |j"        d	k    r3tG          |j        |j"        |j$        |d| j!        | d
          | _%        nd | _%        tM          | j%        | j         | j        |j'        |j        |j(        d|j)        || d| j        | j        | j                  | _*        d S )NzTensor parallel size z' is greater than the number of experts .Fz.gatebiasquant_configrN   r=   z.shared_expert_gater   z.shared_expert)hidden_sizeintermediate_size
hidden_actrS   reduce_resultsexpert_gaterN   z.experts)shared_expertsgatenum_expertstop_krT   rU   rW   renormalizerS   rN   enable_eplbnum_redundant_expertsis_sequence_parallel)+super__init__model_config	hf_configparallel_configrS   r   tp_sizer   device_groupep_grouprank_in_groupep_ranksizeep_sizer[   n_routed_expertsuse_sequence_parallel_moer`   
ValueErrorr   eplb_configr^   n_logical_expertsr_   n_redundant_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr    rT   rZ   shared_expert_gateshared_expert_intermediate_sizeQwen3NextMLPrV   shared_expertr   num_experts_per_tokmoe_intermediate_sizenorm_topk_probexperts)selfrM   rN   configre   rS   rp   	__class__s          y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/qwen3_next.pyrb   z Qwen3NextSparseMoeBlock.__init__i   s   )3%5"/;==$3#~~3}))++ & 2$3$M!<&,,,? ? ?)/);? ? ?   .//!1=*6!%!6#.#D "&"84;S"S(,(?4<(O%%)\D4Q%Q"&)FF 	  %%###
 
 
	 #3111#
 #
 #
 1A55!-"."("H!,)$ 3 000" " "D "&D%--,*$: -%&&&("&":!%!:
 
 
    hidden_statesreturnc                 
   |j         }|j         \  }}|                    d|          }| j        rt          |          }| j        j        r|                     ||          }n/|                     |          \  }}|                     ||          }| j        |d         |d         z   }| j        rt          |d          }|d |         }n%| j	        dk    r| j        
                    |          }|                    |          S )N)r   router_logitsr   r=   )shapeviewr`   r3   r~   is_internal_routerrZ   rz   r   rf   &maybe_all_reduce_tensor_model_parallel)r   r   
orig_shape
num_tokens
hidden_dimfinal_hidden_statesr   _s           r   forwardzQwen3NextSparseMoeBlock.forward   s<   "(
!.!4
J%**2z::$ 	C3MBBM<* 
	"&,,+= #/ # #
  $yy77M1"&,,+= #/ # # )"5a"8;Nq;Q"Q$ 	"B#Q# # #6kzk"B\A"&,"U"U## # #''
333r   rL   )
__name__
__module____qualname__r   strrb   torchTensorr   __classcell__r   s   @r   rK   rK   h   s        P
 P
J P
 P
 P
 P
 P
 P
 P
d"4U\ "4el "4 "4 "4 "4 "4 "4 "4 "4r   rK   c                   X    e Zd Zedefd            Zdeej        ej        f         fdZ	deee
df         ee
df         f         fdZ	 	 	 	 	 dded	edz  d
edz  dedz  dedz  deddf fdZd Zd Zdej        dej        fdZdej        dej        dej        dej        fdZ xZS )Qwen3NextGatedDeltaNetr   c                     dS )Ngdn_attention r   s    r   
mamba_typez!Qwen3NextGatedDeltaNet.mamba_type   s    r   c                 T    t          j        | j        j        | j        j                  S Nr'   gated_delta_net_state_dtyperc   dtypecache_configmamba_cache_dtyper   s    r   get_state_dtypez&Qwen3NextGatedDeltaNet.get_state_dtype   s(    (D#T%6%H
 
 	
r   .c           	      |    t          j        | j        | j        | j        | j        | j        | j        | j                  S r   )	r(   gated_delta_net_state_shaperf   num_k_headsnum_v_heads
head_k_dim
head_v_dimconv_kernel_sizenum_specr   s    r   get_state_shapez&Qwen3NextGatedDeltaNet.get_state_shape   s=    (DLOO!M
 
 	
r   NrL   r   rc   r   rS   speculative_configrN   c           	         t                                                       t                      | _        t	                      | _        |j        | _        |j        | _        |j	        | _
        |j        | _        |j        | _        | j        | j
        z  | _        | j        | j        z  | _        |j        | _        t'          |          | _        |j        | _        t.          |j                 | _        |j        | _        || _        || _        || _        || _        || _        || _         | j         r| j         j!        nd| _"        | j        dz  | j        z   | _#        tI          | j        | j#        d| d          | _%        | j%        j&        j'        (                    d          | j%        j&        _'        | j        dz  | j        dz  z   | _)        | j        dz  | _*        tI          | j        | j)        d|| d          | _+        tI          | j        | j*        d|| d	          | _,        | j        ddf}| j        ddf}t[          | j%        j&        d
           t]          | j%        j&        d
t_          |||g| j        | j                  i           ta          j1        te          j3        | j        | j        z                      | _4        ta          j1        te          j5        tm          | j        | j                                      | _7        t]          | j7        d
tq          d          i           t]          | j4        d
tq          d          i           ts          | j        | j        d dtu          j;                    |j<                  | _=        t}          | j        | j        dd|| d          | _?        t                      jA        }	||	jB        v rt          d|           | |	jB        |<   d S )Nr      Fz.conv1d)
input_sizeoutput_sizerR   rN   r=   z.in_proj_qkvz)r   r   rR   rS   rN   z.in_proj_baweight_loaderT)eps
group_sizenorm_before_gatedevicer   z	.out_proj)rR   input_is_parallelrS   rN   zDuplicate layer name: )Dra   rb   r   rf   r   tp_rankrT   linear_num_value_headsr   linear_num_key_headsr   linear_key_head_dimr   linear_value_head_dimr   key_dim	value_dimlinear_conv_kernel_dimr   rE   	layer_idxrV   
activationr   actrms_norm_epslayer_norm_epsilonrN   r   rc   r   rS   r   num_speculative_tokensr   conv_dimr   conv1dweightdata	unsqueezeprojection_size_qkvzprojection_size_bain_proj_qkvz
in_proj_badelattrr4   r$   r   	Parameterr   onesdt_biasemptyr   A_logr1   r   r5   current_devicer   normr!   out_projr   compilation_configstatic_forward_contextro   )r   r   rc   r   rS   r   rN   query_key_settingsvalue_settingsr   r   s             r   rb   zQwen3NextGatedDeltaNet.__init__   s    	;==577!-!8!6 4 6)994+;; & =,V44 +&+,"("5((("4 &D#:: 	 q(4>9*,%%%	
 
 
 #'+"4"9"C"CA"F"F %)L1$4t~7I$I!"&"2Q"60'1%+++
 
 
 /'/%)))
 
 
 #lAu5.!U3"O444K!?**&
 LL" "
	
 	
 	
& |Jt'4<788
 
 \Kt'66 
 

 	o7LQ7O7O%PQQQ9Nq9Q9Q'RSSS O'!#244,
 
 
	 *N"%'''
 
 
 566I'>>>>f>>???<@1&999r   c                    |                                 dd         | j        | j        z  | j        | j        z   | j        | j        z   | j        z  | j        z  z   fz   }|                                 dd         | j        | j        z  d| j        z  | j        z  fz   } |j        | } |j        | }| j        | j        | j        | j        z  | j        z  | j        | j        z  | j        z  g}| j        | j        z  | j        | j        z  g}t          j        ||d          \  }}}	}
t          j        ||d          \  }}|		                    |	                     d          d| j                  }	|
	                    |
                     d          d| j                  }
|	                    |                     d          | j        | j        z            }|	                    |                     d          | j        | j        z            }|||	|
||fS )zQ
        Derives `query`, `key` and `value` tensors from `mixed_qkvzba`.
        Nr   r   dimr   )
rk   r   rf   r   r   r   r   r   splitreshape)r   
mixed_qkvzmixed_banew_tensor_shape_qkvznew_tensor_shape_basplit_arg_list_qkvzsplit_arg_list_baquerykeyvaluezbas                r   fix_query_key_value_orderingz3Qwen3NextGatedDeltaNet.fix_query_key_value_orderingr  s    !+ 1 1#2# 6,/"?T_4"##$$	:
 	!
 )oo//4,  D$448
 

 %Z_&;<
 8="56 OO!11DOC!11DOC	
  00 00
 "'Z9LRS!T!T!TUAX'8a@@@A ejjmmRAAIIaffQiiT_55IIaffQii!1T\!ABBIIaffQii!1T\!ABBc5!Q))r   c                 j    |dS t          j        | j         j        z   j         j        z   j         j        z  gd          \  }}}t           fd||f          \  }}t          |d j                  }|                                |                                |                                fS )N)NNNr   r   c                 2    t          | dj                  S )Nl (h d) -> 1 l h dd)r   r   )xr   s    r   <lambda>z<Qwen3NextGatedDeltaNet.rearrange_mixed_qkv.<locals>.<lambda>  s    i#74?KKK r   r   r   )	r   r   r   rf   r   mapr   r   
contiguous)r   	mixed_qkvr   r   r   s   `    r   rearrange_mixed_qkvz*Qwen3NextGatedDeltaNet.rearrange_mixed_qkv  s    ##!K,,$,.
 
 
 
sE KKKKCL
 

s %!5III!!3>>#3#3U5E5E5G5GGGr   r   outputc                 4   |                     d          }|                     |          \  }}|                     |          \  }}|                     ||          \  }}}	}
}}t	          d |||	f          \  }}}	t          j        |||	fd          }t          j        || j        | j	        z  | j
        f|j        |j                  }t
          j        j                            ||||| j                   |
j        }|                    d|j        d                   }|
                    d|
j        d                   }
|                     ||
          }|                    |          }t)          |d          }|                     |          \  |d|<   }dS )z
        Forward pass with three parts:
        1. Input projection
        2. Core attention (custom op)
        3. Output projection
        r   c                 "    t          | d          S )Nzl p d -> l (p d)r   )r   s    r   r   z0Qwen3NextGatedDeltaNet.forward.<locals>.<lambda>  s    i#566 r   r   r   r   r   z... h d -> ... (h d)N)rk   r   r   r   r   r   catzerosr   rf   r   r   r   opsvllmgdn_attention_corerN   r   r   r   r   r   )r   r   r   r   projected_states_qkvzr   projected_states_bar   r   r   r   r   r   r   core_attn_out
z_shape_ogs                   r   r   zQwen3NextGatedDeltaNet.forward  s    #''**

 $(#4#4]#C#C q!%!?!?Q%)%F%F!#6&
 &
"sE1a  66U8K
 
sE Iuc51r:::	 )T\94?K% '
 
 
 		))K	
 	
 	
 W
%--b-2Eb2IJJIIb!'"+&&		-33%--j99!-1GHH!%}!=!={
{QQQr   r   r   r   r  c                 	   t                      }|j        }|dS t          |t                    sJ || j                 }t          |t
                    sJ |j        }|j        }|j        }	|j	        }
|j
        }|j        }|j        }|j        }| j        |j                 }|d                             dd          }|d         }|j        }|j        }|d|         }|d|         }|d|         }| j        j                            | j        j                            d          | j        j                            d                    }|
H|j        dk    r|j        dk    r|}d}n1|                    d|          }|                    d|          }nd}|}|
Rt5          |||| j        j        | j        |dddf         d|j                 |||                    d          d
  
        }|j        dk    rR|                    dd          }t=          ||| j        j        | j        ||||	|		  	                            dd          }n@|j        dk    r3t5          |||| j        j        | j        |d|j                 d
          }nd}|                     |          \  }}}|                     |          \  }}}tA          | j!        ||| j"                  \  }}|
x|j        dk    r|j        dk    r	|} |}!d}"d}#na|                    d|          } |                    d|          }!|                    d|          }"|                    d|          }#nd} d}!|}"|}#|
.tG          |||| |!|d
|d|j        dz            ||d
          \  }$}%nd\  }$}%|j        dk    r\||         $                                }&d|&| df<   tK          ||||"|#|&d
|	dd

  
        \  }'}%|%&                    |j'                  ||<   n=|j        dk    r-tG          ||||"|#|d
|	d|j        dz            |d

  
        \  }'}%nd\  }'}%|
|'}tQ          j)        d|g|$j*        dd         R |'j'        |'j+                  }(|(,                    d||$           |(,                    d||'           |(-                    d          |d|<   dS |
|$-                    d          |d|<   dS |'-                    d          |d|<   dS )zC
        Core attention computation (called by custom op).
        Nr   r   r=   r   F)conv_state_indicesnum_accepted_tokensquery_start_locmax_query_lenvalidate_data)r   conv_stateshas_initial_statecache_indicesr  metadataT)r  r  )qkvgbetainitial_stateinplace_final_state
cu_seqlensssm_state_indicesr  use_qk_l2norm_in_kernelNN.)
r  r  r  r  r  r  output_final_stater  
head_firstr!  )
r  r  r  r  r  r  r  r  r   r!  r  ).r   attn_metadata
isinstancedictrN   r<   r  spec_query_start_locnon_spec_query_start_locspec_sequence_masksspec_token_indxnon_spec_token_indxspec_state_indices_tensornon_spec_state_indices_tensorkv_cachevirtual_engine	transposenum_actual_tokensr  r   r   r   rk   num_prefillsnum_decodesindex_selectr*   rR   r   num_spec_decodesr)   r   fused_gdn_gatingr   r   r   r   r   tor   r   r   r   r   index_copy_squeeze))r   r   r   r   r  forward_contextr%  r  r(  r)  r*  r+  r,  r-  r.  self_kv_cache
conv_state	ssm_stater2  r  conv_weightsmixed_qkv_specmixed_qkv_non_specmixed_qkv_non_spec_T
query_speckey_spec
value_specquery_non_speckey_non_specvalue_non_specr  r  g_spec	beta_spec
g_non_specbeta_non_speccore_attn_out_speclast_recurrent_stater  core_attn_out_non_spec
merged_outs)                                            r   _forward_corez$Qwen3NextGatedDeltaNet._forward_core  sv    .//+:+H F-.....%dk2-)=>>>>>);,A#0#I +?'7+?$1$K!(5(S%o&DE"1%//B77
!!$	);+?0001	   !   ! {)..K##A&&(:(?(?(B(B
 
 *)Q..=3LPQ3Q3Q!*%)""!*!7!7?!K!K%.%;%;A?R%S%S""!N!* *1 #<QQQT#B4m44$ %8 47<<R@@#  N  %))#5#?#?1#E#E  "2$ ?&"3; 8&
" 
" 
" i1oo  &**!5" #@5m55$ #
" 
" 
" "&+/+C+CN+S+S(
Hj7;7O7O8
 8
4n #4:q!T\BB4*)Q..=3LPQ3Q3Q 	!
 $?;; --aAA	^^A/BCC
 $ 1 1!5H I IFIJ M
 *7W'$(/0T-2PST2T0TU";$7(,8 8 84 4 4 8B4 4 %))%&CDOOQQM56M,,c12 '  "+#'3 (,  &$ 8L7N7N8 8I344 &**0$"$ &"+(,77-3a77  'D,0   9"$8$8" <F8"$8 */E/Q%E(:(@(DEE,2-4  J
 ""1o7IJJJ""1&9;QRRR0:0B0B10E0EM,,,--- ,0B0J0J10M0MM,,,---0F0N0Nq0Q0QM,,,---r   )NNNNrL   )r   r   r   propertyr   r   tupler   r   r   intr   r7   r   r   r+   r   rb   r   r   r   r   rQ  r   r   s   @r   r   r      s       C    X
u{EK'?!@ 
 
 
 

	
uS#Xc3h'G!H 	
 	
 	
 	
 ,0+/267;{A {A{A "D({A "D(	{A
 )4/{A .4{A {A 
{A {A {A {A {A {Az1* 1* 1*fH H H&7>|7> 7> 7> 7> 7>rJR<JR <JR <	JR
 |JR JR JR JR JR JR JR JRr   r   c                        e Zd Z	 	 	 	 ddededz  dedz  dedz  deddf fd	Zd
e	j
        de	j
        de	j
        fdZ xZS )Qwen3NextAttentionNrL   r   rc   r   rS   rN   r   c                    t                                                       || _        |j        | _        t	                      }|j        | _        | j        |z  dk    sJ | j        |z  | _        |j        | _	        | j	        |k    r| j	        |z  dk    sJ n|| j	        z  dk    sJ t          d| j	        |z            | _        |j        p| j        | j        z  | _        | j        | j        z  | _        | j        | j        z  | _        | j        dz  | _        t!          |dd           | _        t!          |dd          | _        t'          |j        | j        | j        d| j        z   z  | j	        t!          |dd          || d	
          | _        t+          | j        | j        z  |j        d|| d
          | _        t/          | j        |j        |j        | j                  | _        t7          | j        | j        | j        f| j        ||| dd| j        rt9          |          | j        dni | _        t=          | j        |j                  | _         t=          | j        |j                  | _!        d S )Nr   r=   g      dual_chunk_attention_configattn_output_gateTqkv_biasFz	.qkv_projrQ   z.o_proj)	head_sizemax_positionrope_parametersrX  z.attn)num_kv_headsr   rS   rN   )r   rX  r   )"ra   rb   r   rT   r   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxr^  head_dimq_sizekv_sizescalinggetattrrX  rY  r   qkv_projr!   o_projr,   max_position_embeddingsr]  
rotary_embr	   rE   attnQwen3NextRMSNormr   q_normk_norm)r   r   rc   r   rS   rN   rf   r   s          r   rb   zQwen3NextAttention.__init__  s    	!-688%9#g-2222-8"("<"g-- *W499999 T4499994#:g#EFFOD,<,Nnt}4(4=8}d*+214,
 ,
( !(0BD I I)M A(=$=>#U33%'''
 
 
 ( 4=0%%%%
 
 
 #m7"2(,(H	
 
 
 NML
 *%%###
 
 /	088/3/O  
 
 
	  't}&:MNNN&t}&:MNNNr   	positionsr   r   c                    |                      |          \  }}| j        r|                    | j        dz  | j        | j        gd          \  }}}|j        d d         }	 |j        g |	| j        dR  }t          j	        |dd          \  }
} |
j
        g |	dR  }
 |j
        g |	dR  }n-|                    | j        | j        | j        gd          \  }
}}|                     |
                    d| j        | j                                                d| j        | j        z            }
|                     |                    d| j        | j                                                d| j        | j        z            }|                     ||
|          \  }
}|                     |
||          }| j        rt          j        |          }||z  }|                     |          \  |d d <   }d S )Nr   r   r   )rk  rY  r   rg  rh  r   r   rb  r   chunkr   rq  rf  rr  r^  rn  ro  sigmoidrl  )r   rs  r   r   qkvr   q_gater  r  r   r  rZ   attn_outputs                r   r   zQwen3NextAttention.forward  s	    }--Q  
	S99q$,=2 %  LFAq  crc*J V[A*AdnAbAAAFk&!444GAt	*:*r***A4<00R000DDiidlDL IriRRGAq!KKr4>4=AABBGG.
 
 KKr4#4dmDDEEJJ!DM1
 
 y!Q//1ii1a((  	-=&&D%,K{{;//qqq	111r   )NNNrL   )r   r   r   r7   r   r   r+   r   rb   r   r   r   r   r   s   @r   rV  rV    s         ,0+/26LO LOLO "D(LO "D(	LO
 )4/LO LO 
LO LO LO LO LO LO\#0<#0 #0 |	#0 #0 #0 #0 #0 #0 #0 #0r   rV  c            	       t     e Zd Z	 ddedededdf fdZ	 ddej        d	ej        dz  d
ej        defdZ	 xZ
S )Qwen3NextDecoderLayerrL   rM   
layer_typerN   r   Nc           	      |   t                                                       |j        j        }|j        }|j        }|j        }|j        }|| _        t          |          | _	        | j        dk    rt          |||||| d          | _        n?| j        dk    rt          ||||| d          | _        nt          d| j                   t          |d          sg n|j        }	| j	        |	vr;|j        d	k    r0| j	        d
z   |j        z  d	k    rt'          || d          | _        n+t+          |j        |j        |j        || d          | _        t3          |j        |j                  | _        t3          |j        |j                  | _        t;          |dd          | _        | j        rt>          j         !                    t?          j"        d
d
|j        |j#                            | _$        t>          j         !                    t?          j"        d
d
|j        |j#                            | _%        d S d S )Nlinear_attentionz.linear_attn)rc   r   rS   r   rN   full_attentionz
.self_attn)rc   r   rS   rN   zInvalid layer_type mlp_only_layersr   r=   z.mlprM   rN   )rT   rU   rV   rS   rN   r_  layer_scaleF)r   )&ra   rb   rc   rd   r   rS   r   r|  rE   r   r   linear_attnrV  	self_attnro   hasattrr  r[   decoder_sparse_steprK   mlpry   rT   rU   rV   rp  r   input_layernormpost_attention_layernormrj  r  r   r   r   r  r   attn_layer_scaleffn_layer_scale)r   rM   r|  rN   r   rc   r   rS   r   r  r   s             r   rb   zQwen3NextDecoderLayer.__init__4  s    	)3"/"/"/(;$,V44?0005)))#5 ...     D _ 000/))) ,,,  DNN D4?DDEEE f&788TBBf>T 	 N/11""!#v'AAQFF.'   DHH
 $"."(":!,)   DH  0F$7 
  
  
 )9F$7)
 )
 )
% #6=%@@ 	$)H$6$6& ,	  % %D! $)8#5#5& ,	  $ $D   	 	r   r   residualrs  kwargsc                 4   ||}|                      |          }n|                      ||          \  }}t          j        |          }| j        dk    r|                     ||           n3| j        dk    r|                     |||           nt          d          |}| j        rit          |j	                  dk    r,|| j
                            |j                  d         dz   z  }n%|| j
                            |j                  dz   z  }|                     ||          \  }}|                     |          }| j        rt          |j	                  dk    r,|| j                            |j                  d         dz   z  }nt          |j	                  t          | j        j	                  k    s9J d	t          |j	                   d
t          | j        j	                               || j                            |j                  dz   z  }||fS )Nr~  )r   r   r  )r   r   rs  zInvalid layer_typer   r   r=   zshape must be the same z, )r  r   
empty_liker|  r  r  ro   r  lenr   r  r8  r   r  r  r  )r   r   r  rs  r  self_attention_outputs         r   r   zQwen3NextDecoderLayer.forward  sf    $H 00??MM&*&:&:=(&S&S#M8 % 0 ? ??000+,      _ 000NN+,#      1222- 	=&''1,, -),,]-@AA!DqH! !.),,]-@AAAE!
 #'"?"?x"X"Xx// 	=&''1,, -(++M,?@@CaG! =.//3t7K7Q3R3RRRR9c-2E.F.F 9 94/5669 9 SRR !.(++M,?@@1D! h&&r   r   r   )r   r   r   r   r   rb   r   r   objectr   r   r   s   @r   r{  r{  3  s        
 	Q QQ Q 	Q
 
Q Q Q Q Q Qn #'	9' 9'|9' ,%9' <	9'
 9' 9' 9' 9' 9' 9' 9' 9'r   r{  c                       e Zd Zdddedef fdZdej        dej        fdZ	 	 ddej        d
ej        de	d	z  dej        d	z  dej        f
dZ
deeeeeef                  fdZdeeeej        f                  dee         fdZ xZS )Qwen3NextModelrL   rN   rM   rN   c                :   t                                                       j        j        j        }|j        }|j        | _        | _        j        | _        t          | j        j
                  | _        dt          ffd}t          j        || d          \  | _        | _        | _        t%          ddgj
                  | _        t)                      j        r"t-          j
        j                  | _        d S t3                      | _        d S )NrN   c                 X    t          j        t          |                    |           S )N)r|  rN   )r{  layer_typesrE   )rN   r   rM   s    r   	get_layerz*Qwen3NextModel.__init__.<locals>.get_layer  s4    (!-.A&.I.IJ   r   z.layersr  r   r  r_  )ra   rb   rc   rd   re   rp   r_   r   
vocab_sizer.   rT   embed_tokensr   rH   num_hidden_layersstart_layer	end_layerlayersrG   make_empty_intermediate_tensorsr   is_last_rankrp  r   r   rD   )r   rM   rN   re   rp   r  r   r   s    `    @r   rb   zQwen3NextModel.__init__  s3   "-":"D%5%1%0%F" +2O
 

	c 	 	 	 	 	 	 	 9D$i68J8J8J9
 9
 9
5$.$+ 0Wj)6+=0
 0
, >>& 	)();ATUUUDIII&((DIIIr   	input_idsr   c                 ,    |                      |          S r   )r  r   r  s     r   embed_input_idszQwen3NextModel.embed_input_ids  s      +++r   Nrs  intermediate_tensorsinputs_embedsc                 r   t                      j        r||}n|                     |          }d }n|J |d         }|d         }t          | j        | j        | j                  D ]} ||||          \  }}t                      j        st          ||d          S | 	                    ||          \  }}|S )Nr   r  )rs  r   r  )r   r  )
r   is_first_rankr  r   r  r  r  r  r6   r   )	r   r  rs  r  r  r   r  layerr   s	            r   r   zQwen3NextModel.forward  s     >>' 		8( - $ 4 4Y ? ?HH'3330AM+J7HDK)94>JJ 	 	E&+e#+!' ' '#M88 ~~* 	&"/XFF    99]H==qr   c                 T    t          j        | ddd| j        j        | j                  S )N	gate_proj	down_projup_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer[   r_   )r   make_expert_params_mappingr   r[   r_   r   s    r   get_expert_mappingz!Qwen3NextModel.get_expert_mapping  s8     8 + +'/"&"<
 
 
 	
r   weightsc           	      T   g d}t          |                                           }t                      }|                                 }|D ]\  }}d|v r|                    d          r!|                    d          rt          ||          }|I|D ]t\  }}	}
|	|vrd|v r|                    |	|          }|                    d          r||vr@t          ||           rQ||vrV||         }|j	        } ||||
            n|D ]}|\  }}	}}
|	|vr|                    |	|          }t          ||           r5|                    d          s|                    d          r||vrd||vri||         }|j	        } |||||
|            n{|                    d          r||vrgt          ||           ry||vr t                              d	| d
           ||         }t          |dt                    } |||           |                    |           |S )N))rk  q_projr  )rk  k_projr  )rk  v_projr  )gate_up_projr  r   )r  r  r=   zrotary_emb.inv_freqmtp.scalezmlp.expertsz.bias_bias)shard_id	expert_idz
Parameter z' not found in params_dict, skip loadingr   )r'  named_parameterssetr  
startswithendswithr0   replacerF   r   loggerwarning_oncerj  r/   add)r   r  stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer  paramr   mappingr  s                  r   load_weightszQwen3NextModel.load_weights  s    "
 "
 "
 4002233"%%% $ 7 7 9 9#* L	$ L	$D-$,,v&&  }}W%% 0{CC<5K >8 >81
Kd** D((||K<<==)) d+.E.E*466 {**#D) % 3e]H===4 (8 (8GCJ@JY"$.. <<Z@@D.tT:: !  g..!26--2H2H!k11 ;.. '-E$)$7M!M%!)"+    E }}W-- !$k2I2I .tT:: ! ;..++VVVV   !'-E$+0E% %M "M%777d####r   r"  )r   r   r   r   r   rb   r   r   r  r6   r   listrS  rT  r  r   r  r  r   r   s   @r   r  r    sS       AC #) #) #)z #)3 #) #) #) #) #) #)J, ,%, , , , , <@-1 < < 2D8	
 |d* 
   @

DsCc/A)B$C 

 

 

 

ZHU33D-E$F Z3s8 Z Z Z Z Z Z Z Zr   r  c                   (    e Zd ZdededdfdZd ZdS )QwenNextMixtureOfExpertsnum_physical_expertsnum_local_physical_expertsr   Nc                     | j         |k    sJ || _        || _         || j        z
  | _        | j        j        D ]V}t          |j        t                    r:|j        }||_	        ||_
        | j        |_        |j                                         Wd S r   )r  r  num_logical_expertsr_   modelr  r&  r  rK   rt   rs   rr   r~   update_expert_map)r   r  r  r  moes        r    update_physical_experts_metadataz9QwenNextMixtureOfExperts.update_physical_experts_metadataw  s    
 .2LLLLL$8!*D'%9D<T%T"Z& 	0 	0E%)%<== 0i/I,)=&*.*D'--///	0 	0r   c                    g | _         g | _        d }| j        j        D ]\}t	          |t
                    rEt	          |j        t                    r+|j        }| j                            |j        j	                   ]|t          d          t          | j                  | _        d| _        d| _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        d S )Nz-No Qwen3Next layer found in the model.layers.r=   r   )expert_weights
moe_layersr  r  r&  r{  r  rK   appendr~   RuntimeErrorr  num_moe_layersnum_expert_groupsnum_shared_expertsrq   r  rs   r  rt   r  rm   num_routed_expertsrr   r_   )r   example_moer  s      r   set_moe_parametersz+QwenNextMixtureOfExperts.set_moe_parameters  s     Z& 	: 	:E%!677 :J	2= = : $i&&uy'8999NOOO "$/22!""##.#@ $/$B!*5*N'"-">%0%D"""r   )r   r   r   rT  r  r  r   r   r   r  r  v  sX        0!0 %(0 
	0 0 0 0"E E E E Er   r  c                       e Zd Zg dddgdZdddedef fd	Zd
ej        dej        fdZ		 	 dd
ej        dej        de
dz  dej        dz  def
dZedddeej        ej        f         fd            Zedddeeeef         eeef         f         fd            Zedeeef         fd            Zdej        dej        dz  fdZdeeeej        f                  dee         fdZdeeeeeef                  fdZ xZS )Qwen3NextForCausalLM)r  r  r  r  r  )rk  r  rL   r  rM   rN   c                ,   |j         j        }|| _        |j         | _         |j        }|j        }|j        dk    rt          d          |j        | _        t                      	                                 || _
        || _        t          |t          |d                    | _        t          |j        |j        t          |d                    | _        t%          |j                  | _        | j        j        | _        |                                  d S )NallzhQwen3Next currently does not support 'all' prefix caching, please use '--mamba-cache-mode=align' insteadr  r  lm_headr  )rc   rd   rM   r   scheduler_configmamba_cache_modeNotImplementedErrorrS   ra   rb   r   r  rI   r  r-   r  rT   r  r"   logits_processorr  r  )r   rM   rN   r   r   r  r   s         r   rb   zQwen3NextForCausalLM.__init__  s   )3&'4"/&7(E11%@   (4 0##L,I,I
 
 

 &	22
 
 

 !00A B BJ6 	,
 	!!!!!r   r  r   c                 6    | j                             |          S r   )r  r  r  s     r   r  z$Qwen3NextForCausalLM.embed_input_ids  s    z)))444r   Nrs  r  r  r  c                 6    |                      ||||          }|S r   )r  )r   r  rs  r  r  r  r   s          r   r   zQwen3NextForCausalLM.forward  s)     

y"6
 
 r   r   c                 T    t          j        |j        j        |j        j                  S r   r   )clsrM   s     r   !get_mamba_state_dtype_from_configz6Qwen3NextForCausalLM.get_mamba_state_dtype_from_config  s*    
 )D$*K,D,V
 
 	
r   c           	          |j         }|j        j        }|j        }|j        r|j        j        nd}t          j        ||j        |j	        |j
        |j        |j        |          S )Nr   )re   rc   rd   tensor_parallel_sizer   r   r(   r   r   r   r   r   r   )r  rM   re   rd   rf   r   s         r   !get_mamba_state_shape_from_configz6Qwen3NextForCausalLM.get_mamba_state_shape_from_config  sz     &5,6	!6 -K*AA 	
 )D*,)+,
 
 	
r   c                 (    t          j                    S r   )r&   gated_delta_net_state_copy_func)r  s    r   get_mamba_state_copy_funcz.Qwen3NextForCausalLM.get_mamba_state_copy_func  s    +KMMMr   r   c                 8    |                      | j        |          S r   )r  r  )r   r   s     r   compute_logitsz#Qwen3NextForCausalLM.compute_logits	  s     $$T\=AAAr   r  c                 P    t          | dg          }|                    |          S )Nr  )skip_prefixes)rC   r  )r   r  loaders      r   r  z!Qwen3NextForCausalLM.load_weights  s4    "!(
 
 
 ""7+++r   c                 4    | j                                         S r   )r  r  r   s    r   r  z'Qwen3NextForCausalLM.get_expert_mapping  s    z,,...r   r"  )r   r   r   packed_modules_mappingr   r   rb   r   r   r  r6   r  r   classmethodrS  r   r  rT  r  r%   r  r  r   r  r  r  r  r   r   s   @r   r  r    sg       
 
 

 %i0  BD  "  "  "z  "3  "  "  "  "  "  "D5 5%, 5 5 5 5 <@-1 < < 2D8	
 |d*     
!
 
u{EK'	(
 
 
 [
 
&
	uS#Xc3h/	0
 
 
 [
* N%0BDV0V*W N N N [NB|B 
	B B B B,HU33D-E$F ,3s8 , , , ,/DsCc/A)B$C / / / / / / / /r   r  r   r   r   r  
layer_namer   c                 n    t                      }|j        |         }|                    | |||           dS )z
    Custom op for the core attention computation.
    Only handles the convolution + recurrent attention part.
    Input/output projections are handled outside this op.
    )r   r   r   r  N)r   no_compile_layersrQ  )r   r   r   r  r  r;  r   s          r   r  r    sO     ':&;&;O,Z8D

#	      r   c                     dS )z&Fake implementation for torch.compile.Nr   )r   r   r   r  r  s        r   gdn_attention_core_faker  0  s	     Fr   r  )op_nameop_funcmutates_args	fake_impl	NUM_HEADSr  	threshold	BLK_HEADSc                 j   t          j        d          t          j        d          t          j        d          }}}||
z  t          j        d|
          z   }||z  |z  ||z  z   |z   }||k     }t          j        ||z   |          }t          j        ||z   |          }t          j        ||z   |          }t          j        ||z   |          }|                    t           j                  |                    t           j                  z   }t          j        ||z  |	k    d|z  t          j        dt          j        ||z            z             z  |          }t          j        |                    t           j                             |z  }t          j	        | |z   |                    | j
        j                  |           t          j        |                    t           j                            }t          j	        ||z   |                    |j
        j                  |           d S )Nr   r=   r   )mask)r8   
program_idarangeloadr8  float32wherelogexpstorer   
element_tyrv  )r  beta_outputr   r   r   r   seq_lenr  r  r  r  i_bi_si_dhead_offoffr  	blk_A_logblk_ablk_bblk_biasr   
softplus_xblk_gblk_beta_outputs                            r   fused_gdn_gating_kernelr)  C  s    M!$$bmA&6&6a8H8HcCY1i!8!88H
-)
#cIo
5
@CiD(t444IGAG$'''EGAG$'''Eww)555Hx{{2:666AqIDBF1rvdQh7G7G3G,H,HH! J VILL,,---
:EHQWehhqw122>>>>j"*!5!566OHc?--k.?.JKKRV     r         ?      4@r   r   c                 *   |j         \  }}d}||t          j        |d          f}	t          j        d||t          j        |j                  }
t          j        d|||j        |j                  }t          |	         |
|| |||||||dd           |
|fS )z
    Fused computation of g and beta for Gated Delta Net.
    g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
    beta_output = b.sigmoid()
    TODO maybe use torch.compile to replace this triton kernel
    r=      r  )	num_warps)	r   r9   cdivr   r   r  r   r   r)  )r   r   r   r   r  r  batchrb  r  gridr  r  s               r   r7  r7  g  s     wE9G7FK	1556DAuiu}QXNNNA+a	RRRKD!				    k>r   )r*  r+  )__doc__collections.abcr   	itertoolsr   r   einopsr   r   transformers.activationsr   vllm.attention.layerr	   vllm.compilation.decoratorsr
   vllm.configr   r   r   r   r   vllm.distributedr   r   r   r   r   r   vllm.forward_contextr   r   vllm.loggerr   "vllm.model_executor.layers.fla.opsr   r   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   rp  r   !vllm.model_executor.layers.linearr   r   r    r!   +vllm.model_executor.layers.logits_processorr"   )vllm.model_executor.layers.mamba.abstractr#   -vllm.model_executor.layers.mamba.mamba_mixer2r$   ,vllm.model_executor.layers.mamba.mamba_utilsr%   r&   r'   r(   2vllm.model_executor.layers.mamba.ops.causal_conv1dr)   r*   'vllm.model_executor.layers.quantizationr+   +vllm.model_executor.layers.rotary_embeddingr,   3vllm.model_executor.layers.vocab_parallel_embeddingr-   r.   -vllm.model_executor.model_loader.weight_utilsr/   r0   r1   $vllm.model_executor.models.qwen2_moer2   ry    vllm.model_executor.models.utilsr3   vllm.model_executor.utilsr4   vllm.platformsr5   vllm.sequencer6   vllm.transformers_utils.configsr7   vllm.triton_utilsr8   r9   vllm.utils.torch_utilsr:   vllm.v1.attention.backendr;   #vllm.v1.attention.backends.gdn_attnr<   
interfacesr>   r?   r@   rA   rB   utilsrC   rD   rE   rF   rG   rH   rI   r   r  rS  r   KVCacheModulerK   r   rV  r{  r  r  r  r   r  r  jit	constexprr)  floatr7  r   r   r   <module>r[     s   & % $ $ $ $ $ $                    + + + + + + * * * * * * = = = = = =                             E D D D D D D D # # # # # #        @ ? ? ? ? ?      > = = = = =            H G G G G G ? ? ? ? ? ? X X X X X X                   G F F F F F @ @ @ @ @ @                
 M L L L L L D D D D D D 6 6 6 6 6 6 + + + + + + - - - - - - ; ; ; ; ; ; ( ( ( ( ( ( ( ( < < < < < < 7 7 7 7 7 7 D D D D D D                               
X		
el*
+u4 u4 u4 u4 u4bi u4 u4 u4p[R [R [R [R [RRY	 [R [R [R|r0 r0 r0 r0 r0 r0 r0 r0jM' M' M' M' M'BI M' M' M'` o o o o oRY o o od)E )E )E )E )E/ )E )E )EXu/ u/ u/ u/ u/Iu/ u/ u/p|| | <	
  
   ,|| | <	
  
      !"%	      |  ,  |  |       P ! !<!|! |! \	!
 ! ! 5<%&! ! ! ! ! !r   