
    .`ie                     >   d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z<  G d dej=                  Z> G d dej=                  Z? G d dej=                  Z@e?e>d ZAe
 G d! d"ej=                              ZB G d# d$ej=        e1e3e5e2e6e4	  	        ZCdS )%z&Inference-only GraniteMoeHybrid model.    )IterableN)nn)GraniteMoeHybridConfig)	Attention)support_torch_compile)CacheConfigModelConfig
VllmConfig)$get_tensor_model_parallel_world_size)get_pp_group)RMSNorm)QKVParallelLinearRowParallelLinear)LogitsProcessor)MambaMixer2)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )GraniteMoeMoE)GraniteMoeSharedMLP)HasInnerStateIsHybridSupportsLoRASupportsMambaPrefixCaching
SupportsPPSupportsQuant)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                        e Zd Z	 	 	 	 ddedededz  dedz  dedz  ded	df fd
Z	de
j        de
j        dz  fdZ xZS )!GraniteMoeHybridMambaDecoderLayerN config	layer_idxmodel_configcache_configquant_configprefixreturnc                    t                                                       || _        |j        | _        |j        | _        t          |j        |j        |j        |j        |j        z  |j	        |j
        |j        |j        |j        |j        |j        |||| d          | _        d | _        t%          |dd          dk    r1t'          |j        |j        |j        |j        || d          | _        t%          |dd          dk    rd nt/          ||| d	          | _        t3          |j        |j        
          | _        t3          |j        |j        
          | _        d S )Nz.mixer)hidden_sizessm_state_sizeconv_kernel_sizeintermediate_sizeuse_conv_biasuse_biasn_groups	num_headshead_dimrms_norm_eps
activationr/   r0   r1   r2   num_local_expertsr   .block_sparse_moenum_expertstop_kr5   r8   r1   r2   shared_intermediate_size.shared_mlpr1   r2   eps)super__init__r-   r5   residual_multiplierr   mamba_d_statemamba_d_convmamba_expandmamba_conv_biasmamba_proj_biasmamba_n_groupsmamba_n_headsmamba_d_headr>   
hidden_actmambablock_sparse_moegetattrr   r@   num_experts_per_tokr8   r   
shared_mlpr   input_layernormpost_attention_layernormselfr-   r.   r/   r0   r1   r2   	__class__s          /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/granitemoehybrid.pyrK   z*GraniteMoeHybridMambaDecoderLayer.__init__8   s    	!-#)#=  *!/#0$1F4FF 0+**(,(%%%$$$
 
 

$ !%6.22Q66$1"40"."(":) 333% % %D! v91==BB D$\V:P:P:P   	  'v'9v?RSSS(/F$7)
 )
 )
%%%    hidden_statesresidualc                    |}|                      |          }|                     |          }||| j        z  z   }|}|                     |          }| j        | j        |                     |          }n_| j        C|                                }|                     |          }||                     |          z   }~n|                     |          }||| j        z  z   }||fS N)r[   rV   rL   r\   rZ   rW   clone)r^   rb   rc   kwargsoutputmoe_hidden_statess         r`   forwardz)GraniteMoeHybridMambaDecoderLayer.forwardp   s     !,,];;M** 6D,D#DD 55mDD?"$0 $ 5 5m D D $0$1$7$7$9$9!$($9$9:K$L$L! 1DOOM4R4R R%% $ > > =43K#KKh&&ra   NNNr,   __name__
__module____qualname__r   intr	   r   r   strrK   torchTensorrj   __classcell__r_   s   @r`   r+   r+   7   s        
 ,0+/266
 6
&6
 6
 "D(	6

 "D(6
 )4/6
 6
 
6
 6
 6
 6
 6
 6
p'|' ,%' ' ' ' ' ' ' 'ra   r+   c                        e Zd Z	 	 	 	 ddedededz  dedz  dedz  ded	df fd
Z	de
j        de
j        de
j        dz  d	e
j        fdZ xZS )%GraniteMoeHybridAttentionDecoderLayerNr,   r-   r.   r/   r0   r1   r2   r3   c           	      *   t                                                       |j        | _        |j        | _        t	          |||| d          | _        d | _        t          |dd          dk    r1t          |j	        |j
        |j        |j        || d          | _        t          |dd          dk    rd nt          ||| d	          | _        t          |j        |j        
          | _        t          |j        |j        
          | _        d S )Nz
.self_attn)r0   r1   r2   r@   r   rA   rB   rE   rF   rG   rH   )rJ   rK   r5   rL   GraniteMoeHybridAttention	self_attnrW   rX   r   r@   rY   r8   r   rZ   r   r>   r[   r\   r]   s          r`   rK   z.GraniteMoeHybridAttentionDecoderLayer.__init__   sK    	!-#)#= 2%%(((	
 
 
 !%6.22Q66$1"40"."(":) 333% % %D! v91==BB D$\V:P:P:P   	  'v'9v?RSSS(/F$7)
 )
 )
%%%ra   	positionsrb   rc   c                    |}|                      |          }|                     ||          }||| j        z  z   }|}|                     |          }| j        | j        |                     |          }n_| j        C|                                }|                     |          }||                     |          z   }~n|                     |          }||| j        z  z   }||fS )N)r{   rb   )r[   rz   rL   r\   rZ   rW   rf   )r^   r{   rb   rc   ri   s        r`   rj   z-GraniteMoeHybridAttentionDecoderLayer.forward   s    !,,];;' ' 
 
 !=43K#KK 55mDD?"$0 $ 5 5m D D $0$1$7$7$9$9!$($9$9:K$L$L! 1DOOM4R4R R%% $ > > =43K#KKh&&ra   rk   rl   ru   s   @r`   rw   rw      s        
 ,0+/26*
 *
&*
 *
 "D(	*

 "D(*
 )4/*
 *
 
*
 *
 *
 *
 *
 *
X '< ' | ' ,%	 '
 
 '  '  '  '  '  '  '  'ra   rw   c                        e Zd Z	 	 	 	 ddededz  dedz  dedz  deddf fd	Zd
e	j
        de	j
        de	j
        fdZ xZS )ry   Nr,   r-   r/   r0   r1   r2   r3   c           
      ~   t                                                       d| _        |j        | _        |j        | _        |j        | _        |j        | _        | j        | j        z  | _        |j	        | _
        t                      }| j        |z  dk    sJ | j        |z  | _        | j
        |k    r| j
        |z  dk    sJ n|| j
        z  dk    sJ t          d| j
        |z            | _	        t          | j        | j        | j        | j
        | j        || d          | _        t!          | j        | j        | j        || d          | _        |j        dk    r(t'          | j        |j        |j        d          | _        nd | _        t/          | j        | j        | j        | j	        ||| d	
          | _        d S )NTr   r   	.qkv_proj)biasr1   r2   z.o_projrope)max_positionrope_parametersis_neox_stylez.attn)num_kv_headsr0   r1   r2   )rJ   rK   causalr5   attention_biasattention_multipliernum_attention_headstotal_num_headsr=   num_key_value_headstotal_num_kv_headsr   r<   maxr   qkv_projr   o_projposition_embedding_typer   max_position_embeddingsr   
rotary_embr   attn)r^   r-   r/   r0   r1   r2   tp_sizer_   s          r`   rK   z"GraniteMoeHybridAttention.__init__   s    	!-$3$*$?!%9(D,@@"("< 788#g-2222-8"g-- *W499999 T449999#&q$*AW*L#M#M )M #$%'''
 
 
 ($%%%%
 
 
 )V33&#; & 6"	  DOO #DONM%1%%###
 
 
			ra   r{   rb   c                 l   |                      |          \  }}|                    | j        | j        z  | j        | j        z  | j        | j        z  gd          \  }}}| j        |                     |||          \  }}|                     |||          }~~~|                     |          d         }|S )Ndimr   )r   splitr<   r=   r   r   r   r   )r^   r{   rb   qkv_querykeyvalues           r`   rj   z!GraniteMoeHybridAttention.forward%  s    
 }--QII.(4=8(4=8
  & 
 
sE ?&E3??JE3		%e443M2215ra   rk   )rm   rn   ro   r   r	   r   r   rq   rK   rr   rs   rj   rt   ru   s   @r`   ry   ry      s         ,0+/26C
 C
&C
 "D(C
 "D(	C

 )4/C
 C
 
C
 C
 C
 C
 C
 C
J< | 
	       ra   ry   )	attentionrV   c                       e Zd Zdddedef fdZdej        dej        fdZ	 	 ddej        d
ej        de	d	z  dej        d	z  dej        f
dZ
deeeeeef                  fdZdeeeej        f                  dee         fdZ xZS )GraniteMoeHybridModelr,   r2   vllm_configr2   c                   t                                                       |j        j        |j        |j        |j        | _        | _        j        | _        t          | j        j	                  | _
        j        | _        dt          ffd}t          j        || d          \  | _        | _        | _        t%          ddgj	                  | _        t)          j	        j                  | _        d S )Nr2   c                     t          |                     dd          d                   }t          j        |                  } |||           S )N.r   rG   )rp   rsplitALL_DECODER_LAYER_TYPESlayer_types)r2   r.   layer_classr0   r-   r/   r1   s      r`   	get_layerz1GraniteMoeHybridModel.__init__.<locals>.get_layerY  s`    FMM#q11!455I1&2DY2OPK;)   ra   z.layersr   rb   rc   rH   )rJ   rK   r/   	hf_configr0   r1   r-   
vocab_sizer   r5   embed_tokensembedding_multiplierrq   r(   num_hidden_layersstart_layer	end_layerlayersr'   make_empty_intermediate_tensorsr   r>   norm)	r^   r   r2   r   r0   r-   r/   r1   r_   s	       @@@@r`   rK   zGraniteMoeHybridModel.__init__F  s1   )3"/"/"/( +2O
 
 %+$?!
	c 
	 
	 
	 
	 
	 
	 
	 
	 
	 9D$i68J8J8J9
 9
 9
5$.$+ 0Wj)6+=0
 0
, F.F4GHHH			ra   	input_idsr3   c                 ,    |                      |          S re   )r   r^   r   s     r`   embed_input_idsz%GraniteMoeHybridModel.embed_input_idsn  s      +++ra   Nr{   intermediate_tensorsinputs_embedsc                    t                      j        r'||}n|                     |          }|| j        z  }d }n!|t	          d          |d         }|d         }d}t          | j                  D ]0\  }}	t          |	t                    r|dz  } |	|||          \  }}1t                      j	        st          ||d          S |                     |          }|S )Nz%Intermediate tensors may not be None!rb   rc   r   r   )r{   rb   rc   )rb   rc   )r   is_first_rankr   r   RuntimeError	enumerater   
isinstancerw   is_last_rankr   r   )
r^   r   r{   r   r   rb   rc   num_attnilayers
             r`   rj   zGraniteMoeHybridModel.forwardq  s    >>' 	8( - $ 4 4Y ? ? -0I IHH#+"#JKKK0AM+J7H!$+.. 	 	HAu%!FGG A&+e#=8' ' '#M88 ~~* 	&"/XFF   		-00ra   c                 f    ddd| j         j        }fdt          |          D             S )N	gate_proj	down_projup_projc           	      \    g | ](}d fdfdffD ]\  }}|fv rdndd| d| d||f)S )w1w2w3zblock_sparse_moe.experts.w13_zblock_sparse_moe.experts.w2_zblock_sparse_moe.experts.r    ).0	expert_idshard_idweight_nameckpt_down_proj_nameckpt_gate_proj_nameckpt_up_proj_names       r`   
<listcomp>z<GraniteMoeHybridModel.get_expert_mapping.<locals>.<listcomp>  s     
 
 
 *+*+()*
 
 &+ #68I"JJJ 0/3FIFFFFF
 
 
 
ra   )r-   r@   range)r^   rC   r   r   r   s     @@@r`   get_expert_mappingz(GraniteMoeHybridModel.get_expert_mapping  sb     *)%k3
 
 
 
 
 
 #;//
 
 
 	
ra   weightsc           	           g d}t                                                     t                                                       fd} fd}fd} fd}|D ]\  }}d|v r|                    dd          } j        b j                            |          x}	rF|}
|
                                dk    r|
n|
d         }
 ||	|
                               |	            |||          r|	                    d	          s|	                    d
          rt          |                    d                    D ]}|                    d	d| d          }|                    d	d| d          }||                             dd          \  }} ||                    dd          ||d|            ||                    dd          ||d|           |	                    d          s|	                    d          rmt          |                    d                    D ]H}|                    dd| d          }||         } ||                    dd          ||d|           I|	                    d          r$|                    dd          } |||           Sd}|D ].\  }}}||v r$ ||                    ||          ||           d}/|s |||           S ) N))r   z.q_projq)r   z.k_projk)r   z.v_projvc                     |          }t          |dt                    } |||                               |            d S Nweight_loaderrX   r   add)npparamr   loaded_paramsparams_dicts       r`   _loadz1GraniteMoeHybridModel.load_weights.<locals>._load  sJ    NE#E?<QRRMM%###a     ra   c                     t          |           sB|          }t          |dt                    } ||||                               |            d S d S r   )r&   rX   r   r   )r   r   r   r   r   r   r   r^   s        r`   _load_shardz7GraniteMoeHybridModel.load_weights.<locals>._load_shard  sh    *1d33 %#A '@U V VeQ111!!!$$$$$	% %ra   c                     |          }t          |dt                    } ||||||                               |            d S )Nr   r   r   r   )	r   r   namer   r   r   r   r   r   s	          r`   _load_expertz8GraniteMoeHybridModel.load_weights.<locals>._load_expert  sS    NE#E?<QRRMM%D8yQQQQa     ra   c           	          D ]_}|\  }}}}|| vr|                      ||          }t          |          r5|         }|j        }	d}
|	 |	|||||d          }
|
r|c S `d S )NFT)r   r   return_success)replacer&   r   )r   loaded_weightmapping
param_namer   r   r   name_mappedr   r   successexpert_params_mappingr   r^   s              r`   _load_quant_expertz>GraniteMoeHybridModel.load_weights.<locals>._load_quant_expert  s    0 ' '?F<
KHd**"ll;
CC +;== #K0 % 3 ,+m%#!)"+'+  G  '&&&&'4ra   A_logAr   z%.block_sparse_moe.input_linear.weightz+.block_sparse_moe.input_linear.weight_scalez.block_sparse_moe.experts.z
.w1.weightz
.w3.weight   r   z.input_linear.z.experts.w13_r   r   r   z&.block_sparse_moe.output_linear.weightz,.block_sparse_moe.output_linear.weight_scalez
.w2.weightz.output_linear.z.experts.w2_r   z%.block_sparse_moe.router.layer.weightz.block_sparse_moe.gate.weightF)r   T)dictnamed_parameterssetr   r   r1   get_cache_scaler   r   endswithr   sizechunk)r^   r   stacked_params_mappingr   r   r   r   r   r   
scale_namer   ew1_namew3_namew1_paramw3_paramw2_namew2_param	gate_nameloadedr   r   r   r   r   r   s   `                      @@@r`   load_weightsz"GraniteMoeHybridModel.load_weights  sY   "
 "
 "
 4002233"%%% $ 7 7 9 9	! 	! 	! 	! 	! 	!	% 	% 	% 	% 	% 	% 	%	! 	! 	! 	! 	! 	!	 	 	 	 	 	 	>  S	  S	 DAq!||IIgs++ ,"/??BBB
 - !"%2%6%6%8%8A%=%=MM=QRCS  j-000!!*---!!!Q''  zzABB : ajj=G G :  qvvayy))  Aii?BQBBB G  ii?BQBBB G *+1A1)=)=&Hh L		"2ODD !%"#    !L		"2ODD !%"#    #0 DEE  >J J   qvvayy))  Aii@BQBBB G  !tH L		"3^DD !%"#     CDD  II;3 	 i####9O & &5JX"a''#IIk:>>H    "&  E!QKKKra   NN)rm   rn   ro   r
   rq   rK   rr   rs   r   r   rj   listtuplerp   r   r   r   r  rt   ru   s   @r`   r   r   D  s]       AC &I &I &Iz &I3 &I &I &I &I &I &IP, ,%, , , , , <@-1" "<" <" 2D8	"
 |d*" 
" " " "H
DsCc/A)B$C 
 
 
 
6SHU33D-E$F S3s8 S S S S S S S Sra   r   c            
           e Zd Zg ddgdgdgdZdddZed	d
deej        ej        f         fd            Z	ed	d
deee
e
f         ee
e
e
f         f         fd            Zedeeef         fd            Zddd	edef fdZdej        dej        fdZ	 	 ddej        dej        dedz  dej        dz  fdZdej        dej        dz  fdZdeeeej        f                  dee         fdZ xZS )GraniteMoeHybridForCausalLM)q_projk_projv_projconv1din_projinput_linear)r   r  r  r  input_embeddingsoutput_embeddings)r   lm_headr   r
   r3   c                 j    t          j        |j        j        |j        j        |j        j                  S re   )r   mamba2_state_dtyper/   dtyper0   mamba_cache_dtypemamba_ssm_cache_dtype)clsr   s     r`   !get_mamba_state_dtype_from_configz=GraniteMoeHybridForCausalLM.get_mamba_state_dtype_from_config^  s4    
 );$*$6$:
 
 	
ra   c           	          |j         }|j        j        }|j        |j        z  }t          j        ||j        |j        |j	        |j
        |j        |j                  S )a3  Calculate shapes for Mamba's convolutional and state caches.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
            - temporal_state_shape: Shape for state space model cache
        )r8   tp_world_sizer;   r<   r=   
state_sizeconv_kernel)parallel_configr/   r   rO   r5   r   mamba2_state_shapetensor_parallel_sizerR   rS   rT   rM   rN   )r!  r   r'  r   r8   s        r`   !get_mamba_state_shape_from_configz=GraniteMoeHybridForCausalLM.get_mamba_state_shape_from_configi  si     &5,6	%2Y5JJ(;/)>--+ .!.
 
 
 	
ra   c                 (    t          j                    S re   )r   mamba2_state_copy_func)r!  s    r`   get_mamba_state_copy_funcz5GraniteMoeHybridForCausalLM.get_mamba_state_copy_func  s    +BDDDra   r,   r   r2   c          	      <   t                                                       |j        j        }|| _        |j        | _        |j        }|j        | _        || _        || _        t          |t          |d                    | _
        t          |j        |j        | j        t          |d                    | _        |j        r| j
        j        j        | j        _        t%          |j        |j        d| j        j        z            | _        | j
        j        | _        d S )Nmodel)r   r2   r  rG   r   )scale)rJ   rK   r/   r   r   scheduler_configr1   r-   r   r)   r/  r   r   r5   r  tie_word_embeddingsr   weightr   logits_scalinglogits_processorr   )r^   r   r2   r-   r1  r_   s        r`   rK   z$GraniteMoeHybridForCausalLM.__init__  s   )3&'4&7'4 0*#L,I,I
 
 

 &*	22	
 
 
 % 	A"&*"9"@DL /dk00!
 !
 !
 J6 	,,,ra   r   c                 6    | j                             |          S re   )r/  r   r   s     r`   r   z+GraniteMoeHybridForCausalLM.embed_input_ids  s    z)))444ra   Nr{   r   r   c                 6    |                      ||||          }|S re   )r/  )r^   r   r{   r   r   rg   rb   s          r`   rj   z#GraniteMoeHybridForCausalLM.forward  s)     

y"6
 
 ra   rb   c                 <    |                      | j        |          }|S re   )r5  r  )r^   rb   logitss      r`   compute_logitsz*GraniteMoeHybridForCausalLM.compute_logits  s      &&t|]CCra   r   c                 J    t          |           }|                    |          S re   )r%   r  )r^   r   loaders      r`   r  z(GraniteMoeHybridForCausalLM.load_weights  s#    "4((""7+++ra   r  )rm   rn   ro   packed_modules_mappingembedding_modulesclassmethodr  rr   r  r"  rp   r*  r   r-  r
   rq   rK   rs   r   r   rj   r:  r   r   r  rt   ru   s   @r`   r  r  F  sK       
 
 

 *;'(	 	 +& 
 
!
 
u{EK'	(
 
 
 [
 
!
 
uS#Xc3m 44	5
 
 
 [
8 E%0BDV0V*W E E E [E BD 
 
 
z 
3 
 
 
 
 
 
B5 5%, 5 5 5 5 <@-1 < < 2D8	
 |d*   | 
	   ,HU33D-E$F ,3s8 , , , , , , , ,ra   r  )D__doc__collections.abcr   rr   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   r
   vllm.distributedr   vllm.distributed.parallel_stater   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   -vllm.model_executor.layers.mamba.mamba_mixer2r   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
granitemoer   granitemoesharedr   
interfacesr   r    r!   r"   r#   r$   utilsr%   r&   r'   r(   r)   Moduler+   rw   ry   r   r   r  r   ra   r`   <module>rW     s   - , % $ $ $ $ $        / / / / / / * * * * * * = = = = = = < < < < < < < < < < A A A A A A 8 8 8 8 8 8 8 8 8 8 8 8 R R R R R R R R G G G G G G E E E E E E            G F F F F F @ @ @ @ @ @        P O O O O O - - - - - - % % % % % % 1 1 1 1 1 1                            U' U' U' U' U'	 U' U' U'pM' M' M' M' M'BI M' M' M'`\ \ \ \ \	 \ \ \@ 7.   ~ ~ ~ ~ ~BI ~ ~ ~B, , , , ,I, , , , ,ra   