
    .`i~R                         d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3  G d dej4                  Z5 G d dej4                  Z6 G d dej4                  Z7e G d dej4                              Z8 G d  d!ej4        e-e.          Z9dS )"z Inference-only GraniteMoe model.    )Iterable)islice)AnyN)nn)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)FusedMoE)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)sequence_parallel_chunk)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parametermake_layersmaybe_prefixc                        e Zd ZdZ	 	 	 	 	 ddedededed	ej        dz  d
edz  dedz  def fdZ	dej
        dej
        fdZ xZS )GraniteMoeMoEa
  A tensor-parallel MoE implementation for GraniteMoe that shards each
    expert across all ranks.
    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    NF num_expertstop_khidden_sizeintermediate_sizeparams_dtypequant_configtp_sizeprefixc
                     t                                                       || _        || _        t	          ||d|d |	 d          | _        t          |||||dd|||	 d| j                  | _        d S )NFz.gate)biasr*   r+   r-   Tz.experts)r&   r'   r(   r)   r*   reduce_resultsrenormalizer+   r,   r-   is_sequence_parallel)super__init__r(   r2   r   gater   experts)selfr&   r'   r(   r)   r*   r+   r,   r2   r-   	__class__s             y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/granitemoe.pyr4   zGraniteMoeMoE.__init__K   s     	&$8! %%###
 
 
	  ##/%%&&&!%!:
 
 
    hidden_statesreturnc                 J   |j         }|                    d| j                  }| j        rt	          |          }|                     |          \  }}|                     ||          }| j        r"t          |d          }|d         }|d |         }|                    |          S )Nr   )shapeviewr(   r2   r   r5   r6   r   )r7   r;   
orig_shaperouter_logits_final_hidden_states
num_tokenss          r9   forwardzGraniteMoeMoE.forwards   s    "(
%**2t/?@@$ 	C3MBBM  99]33q"ll=-HH$ 	C"B#Q# # $AJ"5kzk"B"''
333r:   )NNNFr%   )__name__
__module____qualname____doc__inttorchdtyper   strr4   TensorrF   __classcell__r8   s   @r9   r$   r$   C   s          ,026""&
 &
&
 &
 	&

 &
 kD(&
 )4/&
 t&
 &
 &
 &
 &
 &
 &
P4U\ 4el 4 4 4 4 4 4 4 4r:   r$   c                        e Zd Z	 	 	 	 	 	 ddededededeeef         dz  d	edz  d
edz  de	dz  deddf fdZ
dej        dej        dej        fdZ xZS )GraniteMoeAttention   Nr%   r(   	num_headsnum_kv_headsmax_positionrope_parameterscache_configr+   attention_multiplierr-   r<   c
           
      <   t                                                       || _        t                      }
|| _        | j        |
z  dk    sJ | j        |
z  | _        || _        | j        |
k    r| j        |
z  dk    sJ n|
| j        z  dk    sJ t          d| j        |
z            | _        || j        z  | _	        | j        | j	        z  | _
        | j        | j	        z  | _        ||n	| j	        dz  | _        t          || j	        | j        | j        d||	 d          | _        t          | j        | j	        z  |d||	 d          | _        t#          | j	        ||d	          | _        t'          | j        | j	        | j        | j        |||	 d
          | _        d S )Nr   r   r>   Fz	.qkv_proj)r/   r+   r-   z.o_projT)rW   rX   is_neox_stylez.attn)rV   rY   r+   r-   )r3   r4   r(   r   total_num_headsrU   total_num_kv_headsmaxrV   head_dimq_sizekv_sizescalingr   qkv_projr   o_projr   
rotary_embr   attn)r7   r(   rU   rV   rW   rX   rY   r+   rZ   r-   r,   r8   s              r9   r4   zGraniteMoeAttention.__init__   s    	&688(#g-2222-8"."g-- *W499999 T4499994#:g#EFF#t';;nt}4(4=8 $/ ! " 	 *M #%'''
 
 
 ( 4=0%%%%
 
 
 #M%+	
 
 
 NML*%%###
 
 
			r:   	positionsr;   c                 "   |                      |          \  }}|                    | j        | j        | j        gd          \  }}}|                     |||          \  }}|                     |||          }|                     |          \  }	}|	S )Nr>   dim)rd   splitra   rb   rf   rg   re   )
r7   rh   r;   qkvrC   qkvattn_outputoutputs
             r9   rF   zGraniteMoeAttention.forward   s    
 }--Q))T[$,E2)NN1ay!Q//1ii1a((KK,,	r:   )rT   NNNNr%   )rG   rH   rI   rK   dictrN   r   r	   r   floatr4   rL   rO   rF   rP   rQ   s   @r9   rS   rS      s        &15+/26-1C
 C
C
 C
 	C

 C
 c3h$.C
 "D(C
 )4/C
 $dlC
 C
 
C
 C
 C
 C
 C
 C
J
<
 |
 
	
 
 
 
 
 
 
 
r:   rS   c                   b     e Zd Z	 d
dededdf fdZdej        dej        dej        fd	Z xZ	S )GraniteMoeDecoderLayerr%   vllm_configr-   r<   Nc                 $   t                                                       |j        j        }|j        }|j        }|j        }|j        | _        t          | j        |j	        |j
        |j        |j        ||| d|j        	  	        | _        t          |j        |j        |j        |j        ||j        | d          | _        t+          |j        |j                  | _        t+          |j        |j                  | _        |j        | _        d S )Nz
.self_attn)	r(   rU   rW   rV   rX   rY   r+   r-   rZ   z.block_sparse_moe)r&   r'   r(   r)   r+   r2   r-   eps)r3   r4   model_config	hf_configrY   r+   parallel_configr(   rS   num_attention_headsmax_position_embeddingsnum_key_value_headsrX   rZ   	self_attnr$   num_local_expertsnum_experts_per_tokr)   use_sequence_parallel_moeblock_sparse_moer   rms_norm_epsinput_layernormpost_attention_layernormresidual_multiplier)r7   rw   r-   configrY   r+   r}   r8   s          r9   r4   zGraniteMoeDecoderLayer.__init__   s-   
 	)3"/"/%5!-,(073"2%%(((!'!<

 

 

 !.0,*$6%!0!J///!
 !
 !
  'v'9v?RSSS(/F$7)
 )
 )
% $*#=   r:   rh   r;   c                     |}|                      |          }|                     ||          }||| j        z  z   }|}|                     |          }|                     |          }||| j        z  z   }|S )N)rh   r;   )r   r   r   r   r   )r7   rh   r;   residuals       r9   rF   zGraniteMoeDecoderLayer.forward  s     !,,];;' ' 
 
 !=43K#KK 55mDD--m<< =43K#KKr:   )r%   )
rG   rH   rI   r
   rN   r4   rL   rO   rF   rP   rQ   s   @r9   rv   rv      s         '> '>'> '> 
	'> '> '> '> '> '>R< | 
	       r:   rv   c                   *    e Zd Zdddedef fdZdej        dej        fdZ	 ddej        d
ej        de	d	z  dej        d	z  dej        f
dZ
deeeej        f                  dee         fdZdeeeej        f                  dee         fdZ xZS )GraniteMoeModelr%   r-   rw   r-   c                   t                                                       j        j        }j        }|| _        || _        |j        | _        t          | j        |j                  | _	        |j
        | _
        t          |j        fd| d          \  | _        | _        | _        t!          |j        |j                  | _        d S )Nc                 &    t          |           S )Nr   )rv   )r-   rw   s    r9   <lambda>z*GraniteMoeModel.__init__.<locals>.<lambda>0  s    1+fMMM r:   z.layersr   ry   )r3   r4   r{   r|   r+   r   
vocab_sizer   r(   embed_tokensembedding_multiplierr!   num_hidden_layersstart_layer	end_layerlayersr   r   normr7   rw   r-   r   r+   r8   s    `   r9   r4   zGraniteMoeModel.__init__  s    )3"/( +2O
 
 %+$?!8C$MMMM%%%9
 9
 9
5$.$+ F.F4GHHH			r:   	input_idsr<   c                 ,    |                      |          S N)r   r7   r   s     r9   embed_input_idszGraniteMoeModel.embed_input_ids6  s      +++r:   Nrh   intermediate_tensorsinputs_embedsc                 ^   t                      j        r%||}n|                     |          }|| j        z  }n|J |d         }t	          | j        | j        | j                  D ]} |||          }t                      j        st          d|i          S | 
                    |          }|S )Nr;   )r   is_first_rankr   r   r   r   r   r   is_last_rankr   r   )r7   r   rh   r   r   r;   layers          r9   rF   zGraniteMoeModel.forward9  s     >>' 	B( - $ 4 4Y ? ?T66MM'3330AMDK)94>JJ 	< 	<E!E)];;MM~~* 	&#]  
 		-00r:   weightsc           	      \   g d}t          j        | ddd| j        j                  }t	          |                                           }t                      }|D ]Q\  }}| j        ~| j                            |          x}rb||         }	t          |	dt                    }
|                                dk    r|n|d         } |
|	|           |                    |           |D ]\  }}}||vr|                    ||          }|                    d	          s|                    d
          r||vrPt          ||           ra|                    d          rt!          ||          }|||         }	|	j        }
 |
|	||            n|D ]}|\  }}}}||vr|                    ||          }t          ||           r5|                    d	          s|                    d
          r||vrd||         }	|	j        }
 |
|	||||            n|                    d	          s|                    d
          r||vrt          ||           rt!          ||          }|||         }	t          |	dt                    }
 |
|	|           |                    |           S|S )z
        This function is copied from `MixtralModel.load_weights`, mainly to
        decouple from mixtral, avoiding impact on support like BNB
        quantization.
        ))rd   q_projrn   )rd   k_projro   )rd   v_projrp   w1w2w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer&   Nweight_loaderr   z.bias_biasscale)shard_id	expert_id)r   make_expert_params_mappingr   r   rs   named_parameterssetr+   get_cache_scalegetattrr   rk   addreplaceendswithr    r   r   )r7   r   stacked_params_mappingexpert_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_nameweight_namer   mappingr   s                   r9   _load_weightszGraniteMoeModel._load_weightsT  sn   "
 "
 "
 !) C $ $"5!
 !
 !
 4002233"%%%#* M	$ M	$D- ,"/??EEE
 - $J/ '@U V V%2%6%6%8%8A%=%=MM=QRCS  e]333!!*---5K >8 >81
Kd**||K<< MM'**.2mmG.D.D+--*466 ==)) !4T;GGD| #D) % 3e]H===4 (8 (8GCJ@JY"$.. <<Z@@D.tT:: ! g..!26--2H2H!k11 '-E$)$7M!M%!)"+    E g..!26--2H2H!k11 .tT:: ! 4T;GGD| '-E$+0E% %M "M%777d####r:   c                    i }|D ]Q\  }}|                     d          rt          |                    d                    D ]l}|                    dd| d          }|                    dd| d          }||                             dd          \  }}	||vsJ ||vsJ |||<   |	||<   m|                     d          rTt          |                    d                    D ]/}|                    dd| d	          }
||         }|
|vsJ |||
<   0|                     d
          r#|                    d
d          }||vsJ |||<   L|||<   S|                     |                                          S )Nz%.block_sparse_moe.input_linear.weightr   z.block_sparse_moe.experts.z
.w1.weightz
.w3.weight   rj   z&.block_sparse_moe.output_linear.weightz
.w2.weightz%.block_sparse_moe.router.layer.weightz.block_sparse_moe.gate.weight)r   rangesizer   chunkr   items)r7   r   new_weightsnpew1_namew3_namew1_paramw3_paramw2_namew2_param	gate_names                r9   load_weightszGraniteMoeModel.load_weights  s    !	# !	#DAqzzABB  #qvvayy)) 4 4Aii?BQBBB G  ii?BQBBB G *+1A1)=)=&Hh"+5555"+5555+3K(+3K((4 DEE #qvvayy)) 4 4Aii@BQBBB G  !tH"+5555+3K((4 CDD #II;3 	 !3333)*I&&!"A!!+"3"3"5"5666r:   r   )rG   rH   rI   r
   rN   r4   rL   rO   r   r   rF   r   tupler   r   r   rP   rQ   s   @r9   r   r     s^       AC I I Iz I3 I I I I I I2, ,%, , , , , .2 < < 2D8	
 |d* 
   6gXeC4E.F%G gCPSH g g g gR$7HU33D-E$F $73s8 $7 $7 $7 $7 $7 $7 $7 $7r:   r   c                   Z    e Zd ZdZdg diZdddZddd	ed
ef fdZde	j
        de	j
        fdZ	 	 dde	j
        de	j
        dedz  de	j
        dz  de	j
        f
dZde	j
        de	j
        dz  fdZdede	j        de	j        defdZdeeee	j
        f                  dee         fdZ xZS )GraniteMoeForCausalLMFrd   )r   r   r   input_embeddingsoutput_embeddings)r   lm_headr%   r   rw   r-   c          	         t                                                       |j        j        }|j        }|| _        t          |t          |d                    | _        t          |j
        |j        |t          |d                    | _        |j        r| j        j        j        | j        _        t!          |j
        d| j        j        z            | _        d S )Nmodel)rw   r-   r   )r+   r-   r   )r   )r3   r4   r{   r|   r+   r   r   r"   r   r   r   r(   r   tie_word_embeddingsr   weightr   logits_scalinglogits_processorr   s        r9   r4   zGraniteMoeForCausalLM.__init__  s    )3"/$#L,I,I
 
 

 &%	22	
 
 
 % 	A"&*"9"@DL /dk00!
 !
 !
r:   r   r<   c                 6    | j                             |          S r   )r   r   r   s     r9   r   z%GraniteMoeForCausalLM.embed_input_ids  s    z)))444r:   Nrh   r   r   c                 6    |                      ||||          }|S r   )r   )r7   r   rh   r   r   r;   s         r9   rF   zGraniteMoeForCausalLM.forward  s)     

y"6
 
 r:   r;   c                 <    |                      | j        |          }|S r   )r   r   )r7   r;   logitss      r9   compute_logitsz$GraniteMoeForCausalLM.compute_logits  s    &&t|]CCr:   
batch_sizerM   devicec                 f    t          dt          j        || j        j        f||          i          S )Nr;   )rM   r   )r   rL   zerosr   r(   )r7   r   rM   r   s       r9   make_empty_intermediate_tensorsz5GraniteMoeForCausalLM.make_empty_intermediate_tensors!  sB     #!89v" " "
 
 	
r:   r   c                 l    t          | | j        j        rdgnd           }|                    |          S )Nzlm_head.)skip_prefixes)r   r   r   r   )r7   r   loaders      r9   r   z"GraniteMoeForCausalLM.load_weights,  sC    "+/;+JTJ<<PT
 
 
 ""7+++r:   )NN)rG   rH   rI   fall_back_to_pt_during_loadpacked_modules_mappingembedding_modulesr
   rN   r4   rL   rO   r   r   rF   r   rK   rM   r   r   r   r   r   r   rP   rQ   s   @r9   r   r     s       "' 	 
 
 
 +& 
 BD 
 
 
z 
3 
 
 
 
 
 
25 5%, 5 5 5 5 <@-1
 
<
 <
 2D8	

 |d*
 

 
 
 
EL U\D=P    	
	
&+k	
;@<	
		
 	
 	
 	
,HU33D-E$F ,3s8 , , , , , , , ,r:   r   ):rJ   collections.abcr   	itertoolsr   typingr   rL   r   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   r
   vllm.distributedr   r   r   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r    vllm.model_executor.models.utilsr   vllm.sequencer   
interfacesr   r   utilsr   r    r!   r"   Moduler$   rS   rv   r   r    r:   r9   <module>r     s-  2 ' & $ $ $ $ $ $                    * * * * * * = = = = = = / / / / / / / /         
 : 9 9 9 9 9 8 8 8 8 8 8         
 H G G G G G F F F F F F @ @ @ @ @ @               E D D D D D - - - - - - 0 0 0 0 0 0 0 0 X X X X X X X X X X X XC4 C4 C4 C4 C4BI C4 C4 C4LP P P P P") P P Pf< < < < <RY < < <~ E7 E7 E7 E7 E7bi E7 E7 E7PM, M, M, M, M,BI|Z M, M, M, M, M,r:   