
    .`i.                        d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*  G d dej+                  Z, G d dej+                  Z-e
 G d dej+                              Z. G d dej+        e%e&          Z/dS )z|Inference-only GraniteMoeShared model.

The architecture is the same as granitemoe but with the addition of shared
experts.
    )Iterable)isliceN)nn)GraniteMoeSharedConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_group)
SiluAndMul)RMSNorm)MergedColumnParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)IntermediateTensors   )GraniteMoeAttentionGraniteMoeModelGraniteMoeMoE)SupportsLoRA
SupportsPP)AutoWeightsLoadermake_layersmaybe_prefixc                   \     e Zd Z	 	 d
dededz  def fdZdej        dej        fd	Z	 xZ
S )GraniteMoeSharedMLPN configquant_configprefixc                    t                                                       |j        | _        |j        | _        t          | j        | j        gdz  d|| d          | _        t          | j        | j        d|| d          | _        |j	        dk    rt          d|j	         d	          t                      | _        d S )
N   Fz.input_linear)
input_sizeoutput_sizesbiasr!   r"   z.output_linear)r'   r!   r"   siluzUnsupported activation: z!. Only silu is supported for now.)super__init__hidden_sizer%   shared_intermediate_sizer   input_linearr   output_linear
hidden_act
ValueErrorr   act_fn)selfr    r!   r"   	__class__s       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/granitemoeshared.pyr*   zGraniteMoeSharedMLP.__init__'   s     	 ,!:6*+a/%+++
 
 
 /O%,,,
 
 
 &&26+< 2 2 2   !ll    hidden_statesreturnc                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r-   r1   r.   )r2   r6   _s      r4   forwardzGraniteMoeSharedMLP.forwardF   sI    ,,];;qM22--m<<qr5   )Nr   )__name__
__module____qualname__r   r   strr*   torchTensorr;   __classcell__r3   s   @r4   r   r   &   s         37	# #&# )4/# 	# # # # # #>U\ el        r5   r   c                   z     e Zd Z	 	 	 ddededz  dedz  deddf
 fdZd	ej	        d
ej	        dej	        fdZ
 xZS )GraniteMoeSharedDecoderLayerNr   r    cache_configr!   r"   r7   c                 8   t                                                       |j        | _        t          | j        |j        |j        |j        |j        ||| d|j        	  	        | _	        t          |j        |j        |j        |j        || d          | _        t          |dd          dk    rd nt!          ||| d          | _        t%          |j        |j        	          | _        t%          |j        |j        	          | _        |j        | _        d S )
Nz
.self_attn)	r+   	num_headsmax_positionnum_kv_headsrope_parametersrF   r!   r"   attention_multiplierz.block_sparse_moe)num_expertstop_kr+   intermediate_sizer!   r"   r,   r   z.shared_mlpr!   r"   eps)r)   r*   r+   r   num_attention_headsmax_position_embeddingsnum_key_value_headsrK   rL   	self_attnr   num_local_expertsnum_experts_per_tokrO   block_sparse_moegetattrr   
shared_mlpr   rms_norm_epsinput_layernormpost_attention_layernormresidual_multiplier)r2   r    rF   r!   r"   r3   s        r4   r*   z%GraniteMoeSharedDecoderLayer.__init__N   sN    	!-,(073"2%%(((!'!<

 

 

 !.0,*$6%///!
 !
 !
 v91==BB D$\V:P:P:P   	  'v'9v?RSSS(/F$7)
 )
 )
% $*#=   r5   	positionsr6   c                    |}|                      |          }|                     ||          }||| j        z  z   }|}|                     |          }| j        |                     |          }nB|                                }|                     |          }||                     |          z   }~||| j        z  z   }|S )N)r`   r6   )r]   rV   r_   r^   r[   rY   clone)r2   r`   r6   residualmoe_hidden_statess        r4   r;   z$GraniteMoeSharedDecoderLayer.forwardy   s     !,,];;' ' 
 
 !=43K#KK 55mDD?" 11-@@MM !. 3 3 5 5 $ 5 56G H H-0N0NNM! =43K#KKr5   )NNr   )r<   r=   r>   r   r   r   r?   r*   r@   rA   r;   rB   rC   s   @r4   rE   rE   M   s         ,026)> )>&)> "D()> )4/	)>
 )> 
)> )> )> )> )> )>V< | 
	       r5   rE   c                        e Zd Zdddedef fdZdej        dej        fdZ	 ddej        d
ej        de	d	z  dej        d	z  dej        f
dZ
deeeej        f                  dee         fdZ xZS )GraniteMoeSharedModelr   r"   vllm_configr"   c                   t                                                       |j        j        |j        |j        | _        | _        j        | _        j	        | _	        t          | j	        j                  | _        j        | _        t          j        fd| d          \  | _        | _        | _        t'          j        j                  | _        d S )N)r!   c                 *    t          |           S )NrP   )rE   )r"   rF   r    r!   s    r4   <lambda>z0GraniteMoeSharedModel.__init__.<locals>.<lambda>   s     7<   r5   z.layersrg   rQ   )r)   r*   model_config	hf_configrF   r!   r    pad_token_idpadding_idx
vocab_sizer   r+   embed_tokensembedding_multiplierr   num_hidden_layersstart_layer	end_layerlayersr   r\   norm)r2   rh   r"   rF   r    r!   r3   s      @@@r4   r*   zGraniteMoeSharedModel.__init__   s   )3"/"/(!. +2O%
 
 

 %+$?!8C$      %%%9
 9
 9
5$.$+ F.F4GHHH			r5   	input_idsr7   c                 ,    |                      |          S r9   )rq   r2   rx   s     r4   embed_input_idsz%GraniteMoeSharedModel.embed_input_ids   s      +++r5   Nr`   intermediate_tensorsinputs_embedsc                 ^   t                      j        r%||}n|                     |          }|| j        z  }n|J |d         }t	          | j        | j        | j                  D ]} |||          }t                      j        st          d|i          S | 
                    |          }|S )Nr6   )r
   is_first_rankr{   rr   r   rv   rt   ru   is_last_rankr   rw   )r2   rx   r`   r|   r}   r6   layers          r4   r;   zGraniteMoeSharedModel.forward   s     >>' 	B( - $ 4 4Y ? ?T66MM'3330AMDK)94>JJ 	< 	<E!E)];;MM~~* 	&#]  
 		-00r5   weightsc                    i }|D ]Q\  }}|                     d          rt          |                    d                    D ]l}|                    dd| d          }|                    dd| d          }||                             dd          \  }}	||vsJ ||vsJ |||<   |	||<   m|                     d          rTt          |                    d                    D ]/}|                    dd| d	          }
||         }|
|vsJ |||
<   0|                     d
          r#|                    d
d          }||vsJ |||<   L|||<   St          j        | |                                          S )Nz%.block_sparse_moe.input_linear.weightr   z.block_sparse_moe.experts.z
.w1.weightz
.w3.weightr$   )dimz&.block_sparse_moe.output_linear.weightz
.w2.weightz%.block_sparse_moe.router.layer.weightz.block_sparse_moe.gate.weight)endswithrangesizereplacechunkr   _load_weightsitems)r2   r   new_weightsnpew1_namew3_namew1_paramw3_paramw2_namew2_param	gate_names                r4   load_weightsz"GraniteMoeSharedModel.load_weights   s    !	# !	#DAqzzABB  #qvvayy)) 4 4Aii?BQBBB G  ii?BQBBB G *+1A1)=)=&Hh"+5555"+5555+3K(+3K((4 DEE #qvvayy)) 4 4Aii@BQBBB G  !tH"+5555+3K((4 CDD #II;3 	 !3333)*I&&!"A,T;3D3D3F3FGGGr5   r9   )r<   r=   r>   r	   r?   r*   r@   rA   r{   r   r;   r   tuplesetr   rB   rC   s   @r4   rf   rf      s)       AC I I Iz I3 I I I I I I<, ,%, , , , , .2 < < 2D8	
 |d* 
   6$HHU33D-E$F $H3s8 $H $H $H $H $H $H $H $Hr5   rf   c                   Z    e Zd ZdZdg diZdddZddd	ed
ef fdZde	j
        de	j
        fdZ	 	 dde	j
        de	j
        dedz  de	j
        dz  de	j
        f
dZde	j
        de	j
        dz  fdZdede	j        de	j        defdZdeeee	j
        f                  dee         fdZ xZS )GraniteMoeSharedForCausalLMFqkv_proj)q_projk_projv_projinput_embeddingsoutput_embeddings)rq   lm_headr   rg   rh   r"   c          	         t                                                       |j        j        }|j        }|| _        t          |t          |d                    | _        t          |j
        |j        |t          |d                    | _        |j        r| j        j        j        | j        _        t!          |j
        |j
        d| j        j        z            | _        d S )Nmodel)rh   r"   r   rP   r   )scale)r)   r*   rl   rm   r!   r    rf   r   r   r   rp   r+   r   tie_word_embeddingsrq   weightr   logits_scalinglogits_processor)r2   rh   r"   r    r!   r3   s        r4   r*   z$GraniteMoeSharedForCausalLM.__init__  s    )3"/*#L,I,I
 
 

 &%	22	
 
 
 % 	A"&*"9"@DL /dk00!
 !
 !
r5   rx   r7   c                 6    | j                             |          S r9   )r   r{   rz   s     r4   r{   z+GraniteMoeSharedForCausalLM.embed_input_ids%  s    z)))444r5   Nr`   r|   r}   c                 6    |                      ||||          }|S r9   )r   )r2   rx   r`   r|   r}   r6   s         r4   r;   z#GraniteMoeSharedForCausalLM.forward(  s)     

y"6
 
 r5   r6   c                 <    |                      | j        |          }|S r9   )r   r   )r2   r6   logitss      r4   compute_logitsz*GraniteMoeSharedForCausalLM.compute_logits4  s    &&t|]CCr5   
batch_sizedtypedevicec                 f    t          dt          j        || j        j        f||          i          S )Nr6   )r   r   )r   r@   zerosr    r+   )r2   r   r   r   s       r4   make_empty_intermediate_tensorsz;GraniteMoeSharedForCausalLM.make_empty_intermediate_tensors8  sB     #!89v" " "
 
 	
r5   r   c                 l    t          | | j        j        rdgnd           }|                    |          S )Nzlm_head.)skip_prefixes)r   r    r   r   )r2   r   loaders      r4   r   z(GraniteMoeSharedForCausalLM.load_weightsC  sC    "+/;+JTJ<<PT
 
 
 ""7+++r5   )NN)r<   r=   r>   fall_back_to_pt_during_loadpacked_modules_mappingembedding_modulesr	   r?   r*   r@   rA   r{   r   r;   r   intr   r   r   r   r   r   r   rB   rC   s   @r4   r   r      s       "' 	 
 
 
 +& 
 BD 
 
 
z 
3 
 
 
 
 
 
45 5%, 5 5 5 5 <@-1
 
<
 <
 2D8	

 |d*
 

 
 
 
EL U\D=P    	
	
&+k	
;@<	
		
 	
 	
 	
,HU33D-E$F ,3s8 , , , , , , , ,r5   r   )0__doc__collections.abcr   	itertoolsr   r@   r   $transformers.models.granitemoesharedr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   vllm.sequencer   
granitemoer   r   r   
interfacesr   r   utilsr   r   r   Moduler   rE   rf   r    r5   r4   <module>r      s    % $ $ $ $ $              G G G G G G = = = = = = / / / / / / / / ) ) ) ) ) ) < < < < < < 8 8 8 8 8 8        H G G G G G F F F F F F        . - - - - - K K K K K K K K K K 0 0 0 0 0 0 0 0 ? ? ? ? ? ? ? ? ? ?$ $ $ $ $") $ $ $NE E E E E29 E E EP aH aH aH aH aHBI aH aH aHHN, N, N, N, N,")\: N, N, N, N, N,r5   