
    .`i2f                     `   d Z ddlZddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z- ddl.m/Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z=  ee>          Z? G d dej@                  ZA G d dej@                  ZB G d dej@                  ZC eddddd            G d! d"ej@                              ZD G d# d$ej@        e-e,          ZEdS )%z?Inference-only AfMoE model compatible with HuggingFace weights.    N)CallableIterable)islice)nn)	Attention)support_torch_compile)CacheConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_group$get_tensor_model_parallel_world_size)init_logger)SharedFusedMoE)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)SupportsLoRA
SupportsPP)LlamaMLP)AutoWeightsLoaderPPMissingLayerWeightsMapperextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefix)IntermediateTensors)AttentionTypec                   ^     e Zd Z	 	 	 ddedz  dedef fdZdej        d	ej        fd
Z	 xZ
S )AfmoeMoEN Fquant_configprefixenable_eplbc           	      v   t                                                       t                      | _        |j        | _        |j        | _        |j        | _        t                      j        | _	        | j	        
                                | _        | j	                                        | _        |j        | _        |j        | _        |j        dk    rt'          d|j         d          t)          j        |j        |j        dt.          j                  | _        t)          j        t/          j        |j        t.          j                            | _        t;                      }|j        j        }|| _         |j!        | _"        | j        | _#        | j#        | j"        z   | _$        | j$        | j        z  | _%        | j        | j%        z  | _&        | j&        | j%        z   | _'        d | _(        |j        dk    r6|j)        |j        z  }tU          |j        ||j        |d| d	          | _(        tW          di d
| j(        d|j        d|j,        d|j        d|j)        ddd| j        dk    r| j        ndd|ddd|j-        d|j.        d| dd| j        d| j        d| j        d| j         d| j"        | _/        d S )NsiluzUnsupported activation: z!. Only silu is supported for now.F)biasdtyper2   r   z.shared_experts)hidden_sizeintermediate_size
hidden_actr,   reduce_resultsr-   shared_expertsnum_expertstop_kr4   r5   r7   renormalizesigmoidr,   use_grouped_topkTnum_expert_group
topk_groupr-   z.expertsscoring_funcrouted_scaling_factore_score_correction_biasr.   num_redundant_experts )0super__init__r   tp_sizeroute_scale
score_func
route_normr   device_groupep_grouprankep_ranksizeep_sizer9   n_routed_expertsnum_shared_expertsn_shared_expertsr6   
ValueErrorr   Linearr4   torchfloat32gate	Parameteremptyexpert_biasr   parallel_configeplb_configr.   rC   n_redundant_expertsn_logical_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr8   moe_intermediate_sizeAfmoeMLPr   num_experts_per_tokn_groupr?   experts)	selfconfigr,   r-   r.   vllm_configr]   r5   	__class__s	           t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/afmoe.pyrF   zAfmoeMoE.__init__:   s8    	;==!- + +$3}))++}))++%+%7%+%>&&26+< 2 2 2   I-	
 
 
	 <K*%-@@@
 

 .//!1=&#.#D !%!6"&"84;S"S(,(?4<(O%%)\D4Q%Q"&)FF 	  #$q(( & <v?X X"*"."3!,)$ 111# # #D & 
 
 
..
**
 ,,
 **	

 %::
 !5
 ,0?i+G+GU
 &
 "T
 $^^
 ((
 &&&&
 
 #'"2"2
 %)$4$4
  ((!
" #'":":#
    hidden_statesreturnc                 t   |j         \  }}|                    d|          }|                     |                    t          j                            }|                     ||          }| j        |\  }}||z   }n|}| j        dk    r| j        	                    |          }|                    ||          S )Nr3   )ro   router_logits   )
shapeviewrX   torV   rW   rh   r8   rG   &maybe_all_reduce_tensor_model_parallel)ri   ro   
num_tokens
hidden_dimrs   fused_moe_outshared_outputfinal_hidden_statess           rm   forwardzAfmoeMoE.forward   s    !.!4
J%**2z::		-"2"2"2"G"GHH'} % 
 
 *1>.M."5"E"/<!"&,"U"U## # #''
J???rn   )Nr+   F)__name__
__module____qualname__r   strboolrF   rV   Tensorr~   __classcell__rl   s   @rm   r*   r*   9   s         37!S
 S
 )4/S
 	S

 S
 S
 S
 S
 S
 S
j@U\ @el @ @ @ @ @ @ @ @rn   r*   c                        e Zd Zddddddej        fdedededed	ed
edz  dededz  dedz  de	de	ddf fdZ
dej        dej        dej        fdZ xZS )AfmoeAttention   Ngh㈵>r+   	layer_idxr4   	num_headsnum_kv_headsmax_position_embeddingshead_dimrms_norm_epscache_configr,   r-   	attn_typerp   c                    t                                                       || _        || _        t	                      }|| _        | j        |z  dk    sJ | j        |z  | _        || _        | j        |k    r| j        |z  dk    sJ n|| j        z  dk    sJ t          d| j        |z            | _	        |p	|| j        z  | _
        | j        | j
        z  | _        | j	        | j
        z  | _        | j
        dz  | _        || _        |j        |         dk    | _        | j        r|j        nd | _        t%          | j        | j
        | j        | j        d|
| d          | _        t)          | j        | j
        z  | j        d|
| d          | _        t-          || j        | j
        z  d|
| d	          | _        t1          | j
        |j        
          | _        t1          | j
        |j        
          | _        | j        r#t9          | j
        ||j        d          | _        nd | _        t?          | j        | j
        | j        | j	        |	|
| j        | d|	  	        | _         d S )Nr   rt   g      sliding_attentionFz	.qkv_proj)r1   r,   r-   z.o_projz
.gate_projepsT)max_positionrope_parametersis_neox_stylez.attn)r   r   r,   per_layer_sliding_windowr-   r   )!rE   rF   r   r4   r   total_num_headsr   total_num_kv_headsmaxr   r   q_sizekv_sizescalingr   layer_typesis_local_attentionsliding_windowr   qkv_projr   o_projr   	gate_projr   r   q_normk_normr   r   
rotary_embr   attn)ri   rj   r   r4   r   r   r   r   r   r   r,   r-   r   rG   rl   s                 rm   rF   zAfmoeAttention.__init__   s    	"&688(#g-2222-8"."g-- *W499999 T4499994#:g#EFF I[D4H%Hnt}4(4=8}d*'>$ #)"4Y"?CV"V7;7NXf33TX)M #%'''
 
 
 ( 4=0%%%%
 
 
 . 4=0%(((
 
 
 dm1DEEEdm1DEEE " 	#&4 & 6"	  DOO #DONML*%%%)%8###

 

 

			rn   	positionsro   c                    |                      |          \  }}|                     |          \  }}|                    | j        | j        | j        gd          \  }}}|                     |                    d| j        | j                                                |j	                  }| 
                    |                    d| j        | j                                                |j	                  }| j        r!| j        |                     |||          \  }}|                     |||          }	|	t          j        |          z  }	|                     |	          \  }
}|
S )Nrr   )dim)r   r   splitr   r   r   reshaper   r   ru   r   r   r   r   r   rV   r<   r   )ri   r   ro   qkv_rX   qkvattn_outputoutputs              rm   r~   zAfmoeAttention.forward  s:   
 }--Q..//a))T[$,E2)NN1a KK		"dndmDDEEMMagVVKK		"d&7GGHHPPG
 

 " 	4t'B??9a33DAqii1a(( "EM$$7$77KK,,	rn   )r   r   r   r(   DECODERintfloatr	   r   r   rF   rV   r   r~   r   r   s   @rm   r   r      s,        (.##+/26&.^
 ^
 ^
 	^

 ^
 ^
 "%^
 *^
 ^
 "D(^
 )4/^
 ^
 ^
 
^
 ^
 ^
 ^
 ^
 ^
@< | 
	       rn   r   c                        e Zd Z	 	 	 	 ddedz  dedz  dededdf
 fd	Zd
ej	        dej	        dej	        dz  de
ej	        ej	        f         fdZ xZS )AfmoeDecoderLayerNr+   Fr   r,   r-   r.   rp   c                    t                                                       |j        | _        t          |dd          }t	          |          | _        t          || j        | j        |j        |j        ||j	        |j
        ||| d          | _        | j        |j        k    | _        | j        rt          ||| d|          | _        n+t!          |j        |j        |j        || d          | _        t'          |j        |j
                  | _        t'          |j        |j
                  | _        t'          |j        |j
                  | _        t'          |j        |j
                  | _        d S )	Nr   r   z
.self_attn)rj   r   r4   r   r   r   r   r   r   r,   r-   z.mlp)rj   r,   r-   r.   )r4   r5   r6   r,   r-   r   )rE   rF   r4   getattrr"   r   r   num_attention_headsnum_key_value_headsr   r   	self_attnnum_dense_layersmoe_enabledr*   mlpre   r5   r6   r   input_layernormpost_attention_layernormpre_mlp_layernormpost_mlp_layernorm)ri   rj   r   r,   r-   r.   r   rl   s          rm   rF   zAfmoeDecoderLayer.__init__#  s    	!-")&2KV"T"T -V44'n(03$;_,%%(((
 
 
  >V-DD 	) '	  DHH  "."(":!,)   DH  'v'9v?RSSS(/F$7)
 )
 )
% "));AT!U!U!U")&*<&BU"V"V"Vrn   r   ro   residualc                 N   ||}|                      |          }n|                      ||          \  }}|                     ||          }|                     |          }|                     ||          \  }}|                     |          }|                     |          }||fS )N)r   ro   )r   r   r   r   r   r   )ri   r   ro   r   s       rm   r~   zAfmoeDecoderLayer.forwardZ  s     $H 00??MM&*&:&:=(&S&S#M8' ' 
 
 55mDD #'"8"88#
 #
x ////>>h&&rn   )NNr+   F)r   r   r   r	   r   r   r   rF   rV   r   tupler~   r   r   s   @rm   r   r   "  s         ,026!5W 5W "D(5W )4/	5W
 5W 5W 
5W 5W 5W 5W 5W 5Wn'<' |' ,%	'
 
u|U\)	*' ' ' ' ' ' ' 'rn   r   rr   )	input_idsr   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                   H    e Zd Zdddedef fdZdej        dej        fdZ	 	 ddej        d
ej        de	d	z  dej        d	z  dej        e	z  f
dZ
dedej        dej        de	fdZdeeeeeef                  fdZdeeeej        f                  dee         fdZ xZS )
AfmoeModelr+   r-   rk   r-   c                   t                                                       |j        j        |j        |j        |j        j        | _        j	        | _	        j
        | _
        t                      j        r%t          j	        j        | d          | _        nt!                      | _        t#          j        fd| d          \  | _        | _        | _        t                      j        r!t/          j        j                  | _        nt!                      | _        t5          ddgj                  | _        d S )Nz.embed_tokensr   c                 ,    t          |           S )N)rj   r   r,   r-   r.   )r   )r-   r   rj   r.   r,   s    rm   <lambda>z%AfmoeModel.__init__.<locals>.<lambda>  s&    ,))'   rn   z.layersr   ro   r   )rE   rF   model_config	hf_configr   r,   r\   r.   rj   
vocab_sizemup_enabledr   is_first_rankr   r4   embed_tokensr    r%   num_hidden_layersstart_layer	end_layerlayersis_last_rankr   r   normr$   make_empty_intermediate_tensors)ri   rk   r-   r   rj   r.   r,   rl   s      @@@@rm   rF   zAfmoeModel.__init__  sf   )3"/"/!1= +!->>' 	1 6!6#5>V>V>V! ! !D !/ 0 0D8C$       %%%
9
 
9
 
9
5$.$+ >>& 	) 28KLLLDII&((DI/Vj)6+=0
 0
,,,rn   r   rp   c                 ,    |                      |          S N)r   ri   r   s     rm   embed_input_idszAfmoeModel.embed_input_ids  s      +++rn   Nr   r   r   c                    t                      j        r6||}n|                     |          }| j        r|| j        j        dz  z  }d }n|J |d         }|d         }t          | j        | j        | j	                  D ]} ||||          \  }}t                      j
        st          ||d          S |                     ||          \  }}|S )Ng      ?ro   r   ro   r   )r   r   r   r   rj   r4   r   r   r   r   r   r'   r   )	ri   r   r   r   r   ro   r   layerr   s	            rm   r~   zAfmoeModel.forward  s    >>' 	8( - $ 4 4Y ? ?  O -1H#1M NHH'3330AM+J7HDK)94>JJ 	P 	PE&+eI}h&O&O#M88~~* 	&"/XFF    99]H==qrn   
batch_sizer2   devicec                     t          t          j        || j        j        f||          t          j        || j        j        f||          d          S )N)r2   r   r   )r'   rV   zerosrj   r4   )ri   r   r2   r   s       rm   r   z*AfmoeModel.make_empty_intermediate_tensors  sl     #!&!89v" " " "K!89v  	 	
 	
 		
rn   c                 H    t          j        | ddd| j        j                  S )Nr   	down_projup_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer9   )r   make_expert_params_mappingrj   r9   ri   s    rm   get_expert_mappingzAfmoeModel.get_expert_mapping  s2     8 + +'/
 
 
 	
rn   weightsc           
         g d}t          |                                           }t                      }|                                 }|D ]\  }}|D ]v\  }}	}
|	|vsd|v rd|v r||vr|                    |	|          }|                    d          r||vrHt          ||           rY||         }|j        } ||||
            nd}|D ]}|\  }}	}}
|	|vrd}|                    |	|          }t          ||           r7||         }t          j	        t          dt          f         |j                  } |||||
|d          }|r|} np|r
|                    d          r||vr%t          ||          }|9t          ||           rK||         }t          |d	t                    } |||           |                    |           |S )
N))r   q_projr   )r   k_projr   )r   v_projr   )gate_up_projr   r   )r   r   rt   zself_attn.gate_projzmlp.experts.z.biasFT.)shard_id	expert_idreturn_successweight_loader)dictnamed_parameterssetr   replaceendswithr#   r   typingcastr   r   r   r   r   add)ri   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer   paramr   is_expert_weightmappingr   name_mappedsuccesss                     rm   load_weightszAfmoeModel.load_weights  s   "
 "
 "
 4002233"%%% $ 7 7 9 9#* U	$ U	$D-5K S8 S81
Kt++1F$1N1N #d**K0G0G||K<<==)) d+.E.E*466 #D) % 3e]H===#( 4 98 98GCJ@JY"$..  (,$ #',,{J"G"GK.{DAA ! '4E %+K d+U-@% %M ,m%#!)"+'+  G  * ( ! ! }}W-- !$k2I2I  5T;GGD| .tT:: ! '-E$+0E% %M "M%777d####rn   NN)r   r   r   r
   r   rF   rV   r   r   r'   r~   r   r2   r   r   listr   r   r   r  r  r   r   s   @rm   r   r   v  s        BD &
 &
 &
z &
3 &
 &
 &
 &
 &
 &
P, ,%, , , , , <@-1   <  <  2D8	 
 |d*  
+	+       D

&+k
;@<
	
 
 
 
	
DsCc/A)B$C 	
 	
 	
 	
eHU33D-E$F e3s8 e e e e e e e ern   r   c                       e Zd Zg dddgdZ eddi          ZdZd	d
dedef fdZ	de
j        de
j        de
j        ddfdZde
j        de
j        fdZ	 	 dde
j        de
j        dedz  de
j        dz  de
j        ez  f
dZde
j        de
j        dz  fdZdeeee
j        f                  dee         fdZdeeeeeef                  fdZ xZS ) AfmoeForCausalLM)r   r   r   r   r   )r   r   z.router.gate.weightz.gate.weight)orig_to_new_suffixFr+   r   rk   r-   c                   t                                                       |j        j        }|j        }|| _        || _        t          |t          |d                    | _        t                      j
        r"t          |j        |j        |          | _        nt                      | _        t!          |j                  | _        | j        j        | _        g | _        |j        |j        z
  | _        |j        | _        g | _        d }| j        j        D ]a}t7          |t                    rt7          |t8                    sJ |j        r+|j        }| j                            |j        j                    b|| j        dk    rtC          d          |J|j"        | _#        |j$        | _%        |j&        | _'        |j(        | _)        |j*        | _+        |j,        | _-        d S d S )Nmodel)rk   r-   )r,   r   z(No AfmoeMoE layer found in model.layers.).rE   rF   r   r   r,   rj   r   r&   r  r   r   r   r   r4   lm_headr    r   logits_processorr   expert_weightsr   r   num_moe_layersrg   num_expert_groups
moe_layersr   
isinstancer   r   r   appendrh   RuntimeErrorr_   num_logical_expertsr`   num_physical_expertsra   num_local_physical_expertsrQ   num_routed_expertsrS   rR   r^   rC   )ri   rk   r-   rj   r,   example_moer   rl   s          rm   rF   zAfmoeForCausalLM.__init__b  s   )3"/(#L,I,I
 
 

 >>& 	,)!6#5L  DLL *++DL /0A B BJ6 	, ! %69PP!'02Z& 	: 	:E%00 e%677777  :#i&&uy'89994#6#:#:IJJJ"'2'DD$(3(FD%.9.RD+&1&BD#&1&BD#)4)HD&&& #"rn   expert_load_viewlogical_to_physical_maplogical_replica_countrp   Nc                     t          | j                  D ]J\  }}| j                            |                                           |                    ||||           Kd S )N)moe_layer_idxr*  r+  r,  )	enumerater!  r  r#  get_expert_weightsset_eplb_state)ri   r*  r+  r,  r   r   s         rm   r1  zAfmoeForCausalLM.set_eplb_state  s}     !*$/ : : 	 	Iu&&u'?'?'A'ABBB  '!1(?&;	 !    	 	rn   r   c                 6    | j                             |          S r   )r  r   r   s     rm   r   z AfmoeForCausalLM.embed_input_ids  s    z)))444rn   r   r   r   c                 6    |                      ||||          }|S r   )r  )ri   r   r   r   r   ro   s         rm   r~   zAfmoeForCausalLM.forward  s)     

y"6
 
 rn   ro   c                 <    |                      | j        |          }|S r   )r  r  )ri   ro   logitss      rm   compute_logitszAfmoeForCausalLM.compute_logits  s    &&t|]CCrn   r   c                 X    t          |           }|                    || j                  S )N)mapper)r   r  hf_to_vllm_mapper)ri   r   loaders      rm   r  zAfmoeForCausalLM.load_weights  s+    "4((""743I"JJJrn   c                 4    | j                                         S r   )r  r   r   s    rm   r   z#AfmoeForCausalLM.get_expert_mapping  s    z,,...rn   r  )r   r   r   packed_modules_mappingr!   r9  fall_back_to_pt_during_loadr
   r   rF   rV   r   r1  r   r'   r~   r6  r   r   r  r  r  r   r   r   r   s   @rm   r  r  M  s       
 
 
 

 
 &!>
   #(AC -I -I -Iz -I3 -I -I -I -I -I -I^, "'  %|	
 
    5 5%, 5 5 5 5 <@-1
 
<
 <
 2D8	

 |d*
 
+	+
 
 
 
EL U\D=P    KHU33D-E$F K3s8 K K K K/DsCc/A)B$C / / / / / / / /rn   r  )F__doc__r  collections.abcr   r   	itertoolsr   rV   r   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   r
   r   vllm.distributedr   r   r   vllm.loggerr   5vllm.model_executor.layers.fused_moe.shared_fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   %vllm.model_executor.models.interfacesr   r    vllm.model_executor.models.llamar   re    vllm.model_executor.models.utilsr   r    r!   r"   r#   r$   r%   r&   vllm.sequencer'   vllm.v1.attention.backendr(   r   loggerModuler*   r   r   r   r  rD   rn   rm   <module>rU     s   F E  . . . . . . . .              * * * * * * = = = = = = H H H H H H H H H H         
 $ # # # # # P P P P P P 8 8 8 8 8 8         
 H G G G G G F F F F F F @ @ @ @ @ @               K J J J J J J J A A A A A A	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 . - - - - - 3 3 3 3 3 3	X		j@ j@ j@ j@ j@ry j@ j@ j@Zy y y y yRY y y yxQ' Q' Q' Q' Q'	 Q' Q' Q'h  !	   L L L L L L L L^l/ l/ l/ l/ l/ry*l l/ l/ l/ l/ l/rn   