
    .`i[                     h   d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/  G d de          Z0 G d dej1        j2                  Z3d+dZ4dej5        dej5        de6de7fd Z8 G d! d"ej9                  Z: G d# d$ej9                  Z; G d% d&ej9                  Z<e G d' d(ej9                              Z= G d) d*ej9        e(e)          Z>dS ),zInference-only PhiMoE model.    )Iterable)isliceN)nn)PretrainedConfig)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)FusedMoE)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                   ^     e Zd ZdZdgZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )PhiMoEConfigphimoepast_key_values }      8         Nsilu   {Gz?h㈵>Tr      F           MbP?c                    || _         |	| _        || _        || _        || _        || _        || _        || _        || _        ||}|||z  }|| _	        || _
        || _        |
| _        || _        || _        ||                    dd          }d|d}|| _        || _        || _        || _        || _        || _         t-                      j        d||||d| d S )N
rope_thetag    .Adefault)	rope_typer3   )pad_token_idbos_token_ideos_token_idtie_word_embeddings )
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headssliding_windowattention_biaslm_head_biasnum_key_value_headshead_dim
hidden_actinitializer_rangerms_norm_eps	use_cachepopattention_dropoutnum_experts_per_toknum_local_expertsoutput_router_logitsrouter_aux_loss_coefrouter_jitter_noisesuper__init__)selfr;   r=   r>   r?   r@   rD   rE   rF   r<   rG   rH   rI   r6   r7   r8   r9   rope_parametersrA   rK   rL   rM   rN   rO   rP   rB   rC   kwargsr3   	__class__s                                u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/phimoe.pyrR   zPhiMoEConfig.__init__G   s1   < %'>$&!2!2#6 ,,(&"5"&99H#6  $!2(""L#66J,5ZPPO!2#6 !2$8!$8!#6  	
%%% 3		
 	

 	
 	
 	
 	
 	
    )r%   r&   r'   r(   r(   r)   Nr*   r+   r,   r-   TNr   r.   FNNr/   r.   r0   Fr1   r/   FF)__name__
__module____qualname__
model_typekeys_to_ignore_at_inferencerR   __classcell__rV   s   @rW   r"   r"   C   s        J#4"5  )!""7C
 C
 C
 C
 C
 C
 C
 C
 C
 C
rX   r"   c                       e Zd Zedej        dej        dej        dej        dej        f
d            Zedej        fd            Zd	S )
mpscores
multiplierselected_expertsmasked_gatesmask_for_onec                 :    |                      |||           ||z  S N)save_for_backward)ctxrb   rc   rd   re   rf   s         rW   forwardz
mp.forward   s(     	j*:LIIIL((rX   grad_at_outputc                     | j         \  }}}||z  }||                    d          z  }|                    d||           |d d d d fS )N)dimindexsrc)saved_tensorsmulscatter_add_)rj   rl   rc   rd   re   grad_at_scores_expandeds         rW   backwardzmp.backward   sx    
 695F2
$l'*4".1C1CB1G1G"G,," 	- 	
 	
 	
 $
 	
rX   N)rY   rZ   r[   staticmethodtorchTensorrk   rv   r:   rX   rW   ra   ra      s        	)	) L	)  ,		)
 l	) l	) 	) 	) \	) 

 
 
 \
 
 
rX   ra   {Gz?c                    t          j                    5  |                     dd          \  }}|                                                     |          }|| z
  |z  d|z  k    }d d d            n# 1 swxY w Y   |                     |t          d                    }|}t          j        |d          }|                    d|          }|}t          j	        | d|t          d                    }	t          j                    5  |	                    dd          \  }}|                                                     |          }|| z
  |z  d|z  k    }d d d            n# 1 swxY w Y   |	                    |t          d                    }
|}t          j        |
d          }
|
                    d|          }t          j
        ||fd          }t          j
        ||fd          }||fS )	Nrn   T)ro   keepdim)minr.   z-infro   )ro   rp   )rx   no_gradmaxabsclampmasked_fillfloatsoftmaxgatherscatterconcat)rb   
jitter_epsmask_logits_thresholdmax_indfactorre   rd   multiplier_orc   masked_scoresmasked_gates_top2selected_experts_top2multiplier_top2s                rW   sparsemixerr      s    
 
 
)/D)I)I&w##(=#>>"7&"@F!J
N!
	
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 %%&;U6]]KKL =2666L&&25E&FFLJ M
f	 M 
 
 
)6):):r4):)P)P&w##(=#>>"7&"@F!J
N!
	
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 &112GvWW#&7R@@@'..2=R.SSOz?;DDDJ|%57L$MSUVVV 	 s%   AA22A69A6AE&&E*-E*hidden_statesgating_outputtopkrenormalizec                     | j         d         |j         d         k    s
J d            |dk    s
J d            |du s
J d            t          |          \  }}||fS )Nr   zNumber of tokens mismatchr.   zOnly top-2 routing is supportedFz Renormalization is not supported)shaper   )r   r   r   r   topk_weightstopk_idss         rW   phimoe_routing_functionr      sy     q!]%8%;;;;=X;;;19997999%!C(77L(!!rX   c                        e Zd ZdZ	 	 	 	 ddededededej        dz  d	edz  d
edz  def fdZ	dej
        dej
        fdZ xZS )PhiMoEa  A tensor-parallel MoE implementation for PhiMoE that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    N num_expertstop_kr=   r>   params_dtypequant_configtp_sizeprefixc	                     t                                                       || _        t          ||d|d | d          | _        t          |||||dd||t          | d          | _        d S )NFz.gate)biasr   r   r   Tz.experts)r   r   r=   r>   r   reduce_resultsr   r   r   custom_routing_functionr   )rQ   rR   r=   r   gater   r   experts)
rS   r   r   r=   r>   r   r   r   r   rV   s
            rW   rR   zPhiMoE.__init__   s     	& %%###
 
 
	  ##/%%$;&&&
 
 
rX   r   returnc                     |j         }|                    d| j                  }|                     |          \  }}|                     ||          }|                    |          S )Nrn   )r   viewr=   r   r   )rS   r   
orig_shaperouter_logits_final_hidden_statess         rW   rk   zPhiMoE.forward$  s`    "(
%**2t/?@@99]33q"ll=-HH"''
333rX   )NNNr   )rY   rZ   r[   __doc__intrx   dtyper   strrR   ry   rk   r^   r_   s   @rW   r   r      s          ,026"$
 $
$
 $
 	$

 $
 kD($
 )4/$
 t$
 $
 $
 $
 $
 $
 $
L4U\ 4el 4 4 4 4 4 4 4 4rX   r   c                        e Zd Z	 	 	 	 	 ddedededededz  d	ed
edz  dedz  deddf fdZde	j
        de	j
        de	j
        fdZ xZS )PhiMoEAttentionNr+   r   r=   	num_headsnum_kv_headsrT   rE   max_positioncache_configr   r   r   c
           
      2   t                                                       || _        t                      }
|| _        | j        |
z  dk    sJ | j        |
z  | _        || _        | j        |
k    r| j        |
z  dk    sJ n|
| j        z  dk    sJ t          d| j        |
z            | _        |||z  }|| _	        | j        | j	        z  | _
        | j        | j	        z  | _        | j	        dz  | _        t          || j	        | j        | j        d||	 d          | _        t          | j        | j	        z  |d||	 d          | _        t#          | j	        ||d          | _        t'          | j        | j	        | j        | j        |||	 d	
          | _        d S )Nr   r   g      Tz	.qkv_proj)r   r   r   z.o_proj)r   rT   is_neox_stylez.attn)r   r   r   r   )rQ   rR   r=   r   total_num_headsr   total_num_kv_headsr   r   rE   q_sizekv_sizescalingr   qkv_projr   o_projr   
rotary_embr   attn)rS   r=   r   r   rT   rE   r   r   r   r   r   rV   s              rW   rR   zPhiMoEAttention.__init__/  s    	&688(#g-2222-8"."g-- *W499999 T4499994#:g#EFF"i/H nt}4(4=8}d*)M #%'''
 
 
 ( 4=0%%%%
 
 
 #M%+	
 
 
 NML*%%###
 
 
			rX   	positionsr   c                 "   |                      |          \  }}|                    | j        | j        | j        gd          \  }}}|                     |||          \  }}|                     |||          }|                     |          \  }	}|	S )Nrn   r~   )r   splitr   r   r   r   r   )
rS   r   r   qkvr   qkvattn_outputoutputs
             rW   rk   zPhiMoEAttention.forwardr  s    
 }--Q))T[$,E2)NN1ay!Q//1ii1a((KK,,	rX   )Nr+   NNr   )rY   rZ   r[   r   dictr	   r   r   rR   rx   ry   rk   r^   r_   s   @rW   r   r   .  s         $%+/26A
 A
A
 A
 	A

 A
 *A
 A
 "D(A
 )4/A
 A
 
A
 A
 A
 A
 A
 A
F
<
 |
 
	
 
 
 
 
 
 
 
rX   r   c                        e Zd Z	 	 	 ddededz  dedz  deddf
 fdZd	ej	        d
ej	        dej	        dz  dej	        fdZ
 xZS )PhiMoEDecoderLayerNr   configr   r   r   r   c                    t                                                       |j        | _        t          | j        |j        |j        |j        t          |d| j        |j        z            |||j        | d	  	        | _	        t          |j        |j        |j        |j        || d          | _        t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        d S )NrE   z
.self_attn)	r=   r   r   r   rE   r   r   rT   r   z.block_sparse_moe)r   r   r=   r>   r   r   Tepselementwise_affine)rQ   rR   r=   r   r@   r<   rD   getattrrT   	self_attnr   rM   rL   r>   block_sparse_moer   	LayerNormrH   input_layernormpost_attention_layernorm)rS   r   r   r   r   rV   s        rW   rR   zPhiMoEDecoderLayer.__init__  s#    	!-((073
D$48R$R  &%"2(((
 
 
 !'0,*$6%///!
 !
 !
  "|F$7D 
  
  
 )+F$7D)
 )
 )
%%%rX   r   r   residualc                     |}|                      |          }|                     ||          }||z   }|}|                     |          }|                     |          }||z   }||fS )N)r   r   )r   r   r   r   )rS   r   r   r   s       rW   rk   zPhiMoEDecoderLayer.forward  s     ! ,,];;' ' 
 
 &0 !55mDD--m<<%0h&&rX   )NNr   )rY   rZ   r[   r"   r	   r   r   rR   rx   ry   rk   r^   r_   s   @rW   r   r     s         ,026$
 $
$
 "D($
 )4/	$

 $
 
$
 $
 $
 $
 $
 $
L'<' |' ,%	'
 
' ' ' ' ' ' ' 'rX   r   c                       e Zd Zdddedef fdZdej        dej        fdZ	 ddej        d
ej        de	d	z  dej        d	z  dej        e	z  f
dZ
deeeeeef                  fdZdeeeej        f                  dee         fdZ xZS )PhiMoEModelr   r   vllm_configr   c                   t                                                       |j        j        |j        |j        j        | _        | _        | _        t          | j        j	                  | _
        t          j        fd| d          \  | _        | _        | _        t!          j        j	        j        d          | _        t)          ddgj	                  | _        d S )Nc                 *    t          |           S )Nr   )r   )r   r   r   r   s    rW   <lambda>z&PhiMoEModel.__init__.<locals>.<lambda>  s     -l6   rX   z.layersr   Tr   r   r   )rQ   rR   model_config	hf_configr   r   r;   r   r   r=   embed_tokensr   r?   start_layer	end_layerlayersr   r   rH   normr   make_empty_intermediate_tensors)rS   r   r   r   r   r   rV   s      @@@rW   rR   zPhiMoEModel.__init__  s   )3"/"/ +(2O
 
 9D$      %%%9
 9
 9
5$.$+ LF$7D
 
 
	 0Wj)6+=0
 0
,,,rX   	input_idsr   c                 ,    |                      |          S rh   )r   rS   r   s     rW   embed_input_idszPhiMoEModel.embed_input_ids  s      +++rX   Nr   intermediate_tensorsinputs_embedsc                 h   t                      j        r||}n|                     |          }d }n|J |d         }|d         }t          | j        | j        | j                  D ]} ||||          \  }}t                      j        st          ||d          S | 	                    |          }|S )Nr   r   )r   r   )
r   is_first_rankr   r   r   r   r   is_last_rankr   r   )rS   r   r   r   r   r   r   layers           rW   rk   zPhiMoEModel.forward  s     >>' 		8( - $ 4 4Y ? ?HH'3330AM+J7HDK)94>JJ 	 	E&+e' '#M88 ~~* 	&"/XFF   		-00rX   c                 H    t          j        | ddd| j        j                  S )Nw1w2w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   )r   make_expert_params_mappingr   rM   rS   s    rW   get_expert_mappingzPhiMoEModel.get_expert_mapping  s0    2 $ $"5
 
 
 	
rX   weightsc           	      :   g d}t          |                                           }t                      }|                                 }|D ]\  }}| j        ~| j                            |          x}rb||         }	t          |	dt                    }
|                                dk    r|n|d         } |
|	|           |	                    |           |D ]i\  }}}||vr|
                    ||          }|                    d          r||vr;t          ||           rL||         }	|	j        }
 |
|	||            n|D ]U}|\  }}}}||vr|
                    ||          }t          ||           r5||         }	|	j        }
 |
|	||||            nk|                    d          r||vrjt          ||           r|t          ||          }|||         }	t          |	dt                    }
 |
|	|           |	                    |           |S )N))r   q_projr   )r   k_projr   )r   v_projr   weight_loaderr   z.bias)shard_id	expert_id)r   named_parameterssetr  r   get_cache_scaler   r   ro   addreplaceendswithr   r  r   )rS   r  stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
scale_nameparamr  
param_nameweight_namer	  mappingr
  s                   rW   load_weightszPhiMoEModel.load_weights  s   "
 "
 "
 4002233"%%% $ 7 7 9 9#* @	$ @	$D- ,"/??EEE
 - $J/ '@U V V%2%6%6%8%8A%=%=MM=QRCS  e]333!!*---5K 18 181
Kd**||K<<==)) d+.E.E*466 #D) % 3e]H===4 "8 "8GCJ@JY"$.. <<Z@@D.tT:: ! '-E$)$7M!M%!)"+    E }}W-- !$k2I2I .tT:: ! 4T;GGD| '-E$+0E% %M "M%777d####rX   rh   )rY   rZ   r[   r
   r   rR   rx   ry   r   r   rk   listtupler   r  r   r  r  r^   r_   s   @rW   r   r     sT       AC 
 
 
z 
3 
 
 
 
 
 
>, ,%, , , , , .2 < < 2D8	
 |d* 
+	+   B
DsCc/A)B$C 
 
 
 
LHU33D-E$F L3s8 L L L L L L L LrX   r   c                   Z    e Zd ZdZdg diZdddZddd	ed
ef fdZde	j
        de	j
        fdZ	 	 dde	j
        de	j
        dedz  de	j
        dz  de	j
        ez  f
dZde	j
        de	j
        fdZdeeee	j
        f                  dee         fdZdeeeeeef                  fdZ xZS )PhiMoEForCausalLMFr   )r  r  r  input_embeddingsoutput_embeddings)r   lm_headr   r   r   r   c          
         t                                                       |j        j        }|| _        |j        | _        t          |t          |d                    | _        t          |j
        |j        d dt          |d                    | _        t          |j
                  | _        | j        j        | _        d S )Nmodel)r   r   Tr#  )r   r   r   )rQ   rR   r   r   r   r   r   r    r%  r   r;   r=   r#  r   logits_processorr   )rS   r   r   r   rV   s       rW   rR   zPhiMoEForCausalLM.__init__n  s    )3'4 #L,I,I
 
 

 &	22
 
 
 !00A B B J6 	,,,rX   r   r   c                 6    | j                             |          S rh   )r%  r   r   s     rW   r   z!PhiMoEForCausalLM.embed_input_ids  s    z)))444rX   Nr   r   r   c                 6    |                      ||||          }|S rh   )r%  )rS   r   r   r   r   r   s         rW   rk   zPhiMoEForCausalLM.forward  s)     

y"6
 
 rX   r   c                 <    |                      | j        |          }|S rh   )r&  r#  )rS   r   logitss      rW   compute_logitsz PhiMoEForCausalLM.compute_logits  s    &&t|]CCrX   r  c                 J    t          |           }|                    |          S rh   )r   r  )rS   r  loaders      rW   r  zPhiMoEForCausalLM.load_weights  s#    "4((""7+++rX   c                 4    | j                                         S rh   )r%  r  r  s    rW   r  z$PhiMoEForCausalLM.get_expert_mapping  s    z,,...rX   )NN)rY   rZ   r[   fall_back_to_pt_during_loadpacked_modules_mappingembedding_modulesr
   r   rR   rx   ry   r   r   rk   r+  r   r  r  r  r  r   r  r^   r_   s   @rW   r   r   ]  s       "' 	 
 
 
 +& 
 BD 
 
 
z 
3 
 
 
 
 
 
25 5%, 5 5 5 5 <@-1
 
<
 <
 2D8	

 |d*
 
+	+
 
 
 
EL U\    ,HU33D-E$F ,3s8 , , , ,/DsCc/A)B$C / / / / / / / /rX   r   )rz   )?r   collections.abcr   	itertoolsr   rx   r    transformers.configuration_utilsr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   r
   vllm.distributedr   r   $vllm.model_executor.layers.fused_moer   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   
interfacesr   r   utilsr   r   r   r   r    r"   autogradFunctionra   r   ry   r   boolr   Moduler   r   r   r   r   r:   rX   rW   <module>rG     s  2 # " $ $ $ $ $ $              = = = = = = * * * * * * = = = = = = / / / / / / / / O O O O O O O O 9 9 9 9 9 9         
 H G G G G G F F F F F F @ @ @ @ @ @               . - - - - - 0 0 0 0 0 0 0 0             G
 G
 G
 G
 G
# G
 G
 G
T#
 #
 #
 #
 #
	  #
 #
 #
L1 1 1 1h"<"<" " 	" " " "64 64 64 64 64RY 64 64 64rN N N N Nbi N N Nb>' >' >' >' >' >' >' >'B Y Y Y Y Y") Y Y YxB/ B/ B/ B/ B/	< B/ B/ B/ B/ B/rX   