
    .`iH                     "   d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5  ee6          Z7 G d dej8                  Z9 G d dej8                  Z: G d dej8                  Z;e G d d ej8                              Z< G d! d"ej8        e/e.          Z=dS )#z?Inference-only OLMoE model compatible with HuggingFace weights.    )Iterable)partial)isliceN)nn)	Attention)support_torch_compile)
VllmConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)split_tensor_along_last_dim)init_logger)FusedMoE)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                        e Zd ZdZ	 	 	 	 ddededededej        dz  d	edz  d
edz  def fdZ	dej
        dej
        fdZ xZS )OlmoeMoEa  A tensor-parallel MoE implementation for Olmoe that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    N num_expertstop_khidden_sizeintermediate_sizeparams_dtypequant_configtp_sizeprefixc	                     t                                                       || _        t          ||dd | d          | _        t          ||||dd||| d	  	        | _        d S )NFz.gatebiasr,   r.   Tz.experts)	r'   r(   r)   r*   reduce_resultsrenormalizer,   r-   r.   )super__init__r)   r   gater   experts)
selfr'   r(   r)   r*   r+   r,   r-   r.   	__class__s
            t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/olmoe.pyr5   zOlmoeMoE.__init__I   s     	& %###
 
 
	  ##/%&&&

 

 

    hidden_statesreturnc                     |j         }|j         d         }|                    d|          }|                     |          \  }}|                     ||          }|                    |          S )N)r<   router_logits)shapeviewr6   r7   )r8   r<   
orig_shape
hidden_dimr@   _final_hidden_statess          r:   forwardzOlmoeMoE.forwardl   sv    "(
"(,
%**2z::99]33q"ll'} + 
 
 #''
333r;   )NNNr&   )__name__
__module____qualname____doc__inttorchdtyper   strr5   TensorrG   __classcell__r9   s   @r:   r%   r%   @   s          ,026"!
 !
!
 !
 	!

 !
 kD(!
 )4/!
 t!
 !
 !
 !
 !
 !
 !
F
4U\ 
4el 
4 
4 
4 
4 
4 
4 
4 
4r;   r%   c                        e Zd Zdddededdf fdZdej        d	ej        deej        ej        f         fd
Z	dej        dej        dej        fdZ
 xZS )OlmoeAttentionr&   r.   vllm_configr.   r=   Nc          
         t                                                       |j        j        }|j        }|j        }|j        | _        t          |dd          }|j        }|j	        }t                      }	|| _        | j        |	z  dk    sJ | j        |	z  | _        || _        | j        |	k    r| j        |	z  dk    sJ n|	| j        z  dk    sJ t          d| j        |	z            | _        | j        | j        z  | _        | j        | j        z  | _        | j        | j        z  | _        | j        dz  | _        || _        t+          | j        | j        | j        | j        d|| d          | _        |	| _        t1                      | _        t5          | j        | j        z  d	
          | _        t5          | j        | j        z  d	
          | _        t;          | j        | j        z  | j        d|| d          | _        t?          | j        ||j         d          | _!        tE          | j        | j        | j        | j        ||| d          | _#        d S )Nmax_position_embeddingsi   r   r   g      Fz	.qkv_projr0   h㈵>epsz.o_projT)max_positionrope_parametersis_neox_stylez.attn)num_kv_headscache_configr,   r.   )$r4   r5   model_config	hf_configr`   r,   r)   getattrnum_attention_headsnum_key_value_headsr   total_num_heads	num_headstotal_num_kv_headsmaxr_   head_dimq_sizekv_sizescalingrX   r   qkv_projr-   r   tp_rankr   q_normk_normr   o_projr   r]   
rotary_embr   attn)r8   rV   r.   configr`   r,   rX   rg   r_   r-   r9   s             r:   r5   zOlmoeAttention.__init__z   s   )3"/"/!-")&2KT"R"R.	1688(#g-2222-8"."g-- *W499999 T4499994#:g#EFF(D,@@nt}4(4=8}d*'>$)M #%'''
 
 
 577d2T]BMMMd5E4PPP' 4=0%%%%
 
 
 #M0"2	
 
 
 NML*%%###
 
 
			r;   qkc                    | j         dk    rBt          |                                          }t          |                                          }|                     |          }|                     |          }| j         dk    rGt          t          | j                   } ||          | j                 } ||          | j                 }||fS )Nr   )num_partitions)r-   r   
contiguousrp   rq   r   r   ro   )r8   rv   rw   splitters       r:   _apply_qk_normzOlmoeAttention._apply_qk_norm   s     <!0@@A0@@AKKNNKKNN<!:4<XXXHDL)ADL)A!tr;   	positionsr<   c                 T   |                      |          \  }}|                    | j        | j        | j        gd          \  }}}|                     ||          \  }}|                     |||          \  }}|                     |||          }|                     |          \  }	}|	S )Nr?   )dim)rn   splitrk   rl   r|   rs   rt   rr   )
r8   r}   r<   qkvrE   rv   rw   vattn_outputoutputs
             r:   rG   zOlmoeAttention.forward   s    
 }--Q))T[$,E2)NN1a""1a((1y!Q//1ii1a((KK,,	r;   )rH   rI   rJ   r	   rO   r5   rM   rP   tupler|   rG   rQ   rR   s   @r:   rT   rT   y   s        AC D
 D
 D
z D
3 D
 D
 D
 D
 D
 D
 D
L"',	u|U\)	*   < | 
	       r;   rT   c                   x     e Zd Zdddededdf fdZdej        d	ej        d
ej        dz  dej        fdZ xZ	S )OlmoeDecoderLayerr&   rU   rV   r.   r=   Nc          	         t                                                       |j        j        }|j        }|j        | _        t          || d          | _        t          |j	        |j
        |j        |j        || d          | _        t          |j        d          | _        t          |j        d          | _        d S )Nz
.self_attnrV   r.   z.mlp)r'   r(   r)   r*   r,   r.   rY   rZ   )r4   r5   ra   rb   r,   r)   rT   	self_attnr%   r'   num_experts_per_tokr*   mlpr   input_layernormpost_attention_layernorm)r8   rV   r.   ru   r,   r9   s        r:   r5   zOlmoeDecoderLayer.__init__   s    )3"/!-'#(((
 
 

 *,*$6%???
 
 
  'v'9tDDD(/0B(M(M(M%%%r;   r}   r<   residualc                     ||}|                      |          }n|                      ||          \  }}|                     ||          }|                     ||          \  }}|                     |          }||fS )N)r}   r<   )r   r   r   r   )r8   r}   r<   r   s       r:   rG   zOlmoeDecoderLayer.forward   s     $H 00??MM&*&:&:=(&S&S#M8' ' 
 
 #'"?"?x"X"Xx//h&&r;   )
rH   rI   rJ   r	   rO   r5   rM   rP   rG   rQ   rR   s   @r:   r   r      s        AC N N Nz N3 N N N N N N N.'<' |' ,%	'
 
' ' ' ' ' ' ' 'r;   r   c                   6    e Zd Zdeddededeej                 f fdZ	de
j        de
j        fd	Z	 dde
j        de
j        ded
z  de
j        d
z  de
j        ez  f
dZdeeeeeef                  fdZdeeee
j        f                  dee         fdZ xZS )
OlmoeModelr&   r.   
layer_typerV   r.   r   c                   t                                                       j        j        }|j        | _        || _        t          |j        |j                  | _        t          |j
        fd| d          \  | _        | _        | _        t          |j        d          | _        t!          ddg|j                  | _        d S )Nc                      |           S )Nr    )r.   r   rV   s    r:   <lambda>z%OlmoeModel.__init__.<locals>.<lambda>!  s    ::+fMMM r;   z.layersrU   rY   rZ   r<   r   )r4   r5   ra   rb   
vocab_sizeru   r   r)   embed_tokensr"   num_hidden_layersstart_layer	end_layerlayersr   normr!   make_empty_intermediate_tensors)r8   rV   r.   r   ru   r9   s    ` ` r:   r5   zOlmoeModel.__init__  s     	)3 +2
 
 9D$MMMMM%%%9
 9
 9
5$.$+
 F.D999	/Vj)6+=0
 0
,,,r;   	input_idsr=   c                 ,    |                      |          S N)r   r8   r   s     r:   embed_input_idszOlmoeModel.embed_input_ids*  s      +++r;   Nr}   intermediate_tensorsinputs_embedsc                    t                      j        r||}n|                     |          }d }n|J |d         }|d         }t          | j        | j        | j                  D ]} ||||          \  }}t                      j        st          ||d          S || 	                    ||          \  }}n| 	                    |          }|S )Nr<   r   )r<   r   )
r
   is_first_rankr   r   r   r   r   is_last_rankr   r   )	r8   r   r}   r   r   r<   r   layerrE   s	            r:   rG   zOlmoeModel.forward-  s    >>' 		8( - $ 4 4Y ? ?HH'3330AM+J7HDK)94>JJ 	 	E&+e' '#M88 ~~* 	&"/XFF   #yyAAM11 IIm44Mr;   c                 H    t          j        | ddd| j        j                  S )N	gate_proj	down_projup_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer'   )r   make_expert_params_mappingru   r'   r8   s    r:   get_expert_mappingzOlmoeModel.get_expert_mappingQ  s2     2 + +'/
 
 
 	
r;   weightsc           	         g d}t          |                                           }t                      }|                                 }|D ]\  }}|D ]s\  }}	}
|	|vrd|v r|                    |	|          }|                    d          r||vr@t          ||           rQ||vrV||         }|j        } ||||
            n|D ]U}|\  }}	}}
|	|vr|                    |	|          }t          ||           r5||         }|j        } |||||
|            n|                    d          r||vrt          ||           r|                    d          r:|                    dd          }||vrt          	                    d||           L|}||         }t          |d	t                    } |||           |                    |           |S )
N))rn   q_projrv   )rn   k_projrw   )rn   v_projr   zmlp.expertsz.bias)shard_id	expert_idkv_scalez	.kv_scalez.attn.kv_scalez{Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.weight_loader)dictnamed_parameterssetr   replaceendswithr    r   loggerwarning_oncerc   r   add)r8   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer   paramr   mappingr   remapped_kv_scale_names                   r:   load_weightszOlmoeModel.load_weights\  s   "
 "
 "
 4002233"%%% $ 7 7 9 9#* I	$ I	$D-5K G8 G81
Kd** !D((||K<<==)) d+.E.E*466 {**#D) % 3e]H===4 ,8 ,8GCJ@JY"$.. <<Z@@D.tT:: ! '-E$)$7M!M%!)"+    E }}W-- !$k2I2I .tT:: ! }}Z00 :15')92 2. 2DD"// !^ $ 6  
 %#9D'-E$+0E% %M "M%777d####r;   r   )rH   rI   rJ   r   r	   rO   typer   Moduler5   rM   rP   r   r   rG   listr   rL   r   r   r   r   rQ   rR   s   @r:   r   r     so        &7
 
 
  
 	

 O
 
 
 
 
 
8, ,%, , , , , .2" "<" <" 2D8	"
 |d*" 
+	+" " " "H	
DsCc/A)B$C 	
 	
 	
 	
UHU33D-E$F U3s8 U U U U U U U Ur;   r   c                   h    e Zd Zdg diZdeddededeej	                 f fdZ
d	ej        d
ej        fdZ	 	 dd	ej        dej        dedz  dej        dz  d
ej        ez  f
dZdej        d
ej        fdZdeeeej        f                  d
ee         fdZd
eeeeeef                  fdZ xZS )OlmoeForCausalLMrn   )r   r   r   r&   r   rV   r.   r   c          	         t                                                       |j        j        }|j        }|| _        || _        t          |t          |d          |          | _        t          |j
        |j        |t          |d                    | _        t          |j
                  | _        | j        j        | _        d S )Nmodel)rV   r.   r   lm_head)r,   r.   )r4   r5   ra   rb   r,   ru   r   r#   r   r   r   r)   r   r   logits_processorr   )r8   rV   r.   r   ru   r,   r9   s         r:   r5   zOlmoeForCausalLM.__init__  s     	)3"/(#00!
 
 


 &%	22	
 
 
 !00A B B J6 	,,,r;   r   r=   c                 6    | j                             |          S r   )r   r   r   s     r:   r   z OlmoeForCausalLM.embed_input_ids  s    z)))444r;   Nr}   r   r   c                 6    |                      ||||          }|S r   )r   )r8   r   r}   r   r   r<   s         r:   rG   zOlmoeForCausalLM.forward  s)     

y"6
 
 r;   r<   c                 <    |                      | j        |          }|S r   )r   r   )r8   r<   logitss      r:   compute_logitszOlmoeForCausalLM.compute_logits  s    &&t|]CCr;   r   c                 J    t          |           }|                    |          S r   )r   r   )r8   r   loaders      r:   r   zOlmoeForCausalLM.load_weights  s#    "4((""7+++r;   c                 4    | j                                         S r   )r   r   r   s    r:   r   z#OlmoeForCausalLM.get_expert_mapping  s    z,,...r;   )NN)rH   rI   rJ   packed_modules_mappingr   r	   rO   r   r   r   r5   rM   rP   r   r   rG   r   r   r   r   r   r   rL   r   rQ   rR   s   @r:   r   r     s        
 
 
 &7
 
 
  
 	

 O
 
 
 
 
 
:5 5%, 5 5 5 5 <@-1
 
<
 <
 2D8	

 |d*
 
+	+
 
 
 
EL U\    ,HU33D-E$F ,3s8 , , , ,/DsCc/A)B$C / / / / / / / /r;   r   )>rK   collections.abcr   	functoolsr   	itertoolsr   rM   r   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   vllm.distributedr
   r   r   r   vllm.distributed.utilsr   vllm.loggerr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   r   utilsr   r    r!   r"   r#   rH   r   r   r%   rT   r   r   r   r   r;   r:   <module>r      sO   F E $ $ $ $ $ $                    * * * * * * = = = = = = " " " " " "            ? > > > > > # # # # # # 9 9 9 9 9 9 8 8 8 8 8 8         
 H G G G G G F F F F F F @ @ @ @ @ @        P O O O O O - - - - - - 0 0 0 0 0 0 0 0              
X		64 64 64 64 64ry 64 64 64r` ` ` ` `RY ` ` `F-' -' -' -' -'	 -' -' -'` d d d d d d d dN>/ >/ >/ >/ >/ry*l >/ >/ >/ >/ >/r;   