
    .`iU                     l   d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z<  ee=          Z> G d dej?                  Z@ G d dej?                  ZA G d d ej?                  ZB G d! d"ej?                  ZCe
 G d# d$ej?                              ZD G d% d&ej?        e5e6          ZEdS )'z&Inference-only Snowflake Arctic model.    )Iterable)isliceN)nn)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)init_logger)
SiluAndMul)fused_experts
fused_topk)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)set_weight_attrs)current_platform)IntermediateTensors)ArcticConfig   )
SupportsPPSupportsQuant)extract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                   P     e Zd Z	 	 	 	 	 ddededed	edz  d
edef fdZd Z	 xZ
S )	ArcticMLPFNT config	expert_idis_residual_mlpquant_configreduce_resultsprefixc           	         t                                                       |j        | _        || _        |s|j        n| j        | _        t          | j        | j        gdz  d|| d          | _        t          | j        | j        d||| d          | _	        |j
        dk    rt          d|j
         d	          t                      | _        d S )
N   Fz.w13biasr0   r2   z.w2r6   r1   r0   r2   siluzUnsupported activation: z!. Only silu is supported for now.)super__init__hidden_sizer.   intermediate_sizeffn_dimr   w13r   w2
hidden_act
ValueErrorr   act_fn)selfr-   r.   r/   r0   r1   r2   	__class__s          u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/arctic.pyr:   zArcticMLP.__init__8   s    	!-" -<QF$$AQ 	 .\NQ%???
 
 
 $L)%>>>
 
 
 &&26+< 2 2 2   !ll    c                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r>   rB   r?   )rC   hidden_statesgate_up_s       rE   forwardzArcticMLP.forward_   sD    XXm,,
G,,77=11qrF   )r+   FNTr,   )__name__
__module____qualname__r    intboolr   strr:   rL   __classcell__rD   s   @rE   r*   r*   7   s          %26#%# %#%# %# 	%#
 )4/%# %# %# %# %# %# %# %#N      rF   r*   c                        e Zd ZdZ	 	 	 	 	 ddededz  dej        dz  dedz  d	e	d
e
f fdZdej        dej        de
defdZdej        dej        fdZdej        fdZ xZS )	ArcticMoEz<
    Model-parallel implementation of Arctic MoE Layer.
    NTr,   r-   tp_sizeparams_dtyper0   r1   r2   c           	         t                                                       t          |          }|pt                      | _        |j        | _        |j        | _        || _        |j	        | _
        |j        | j        z  | _        |dz   |j        z  dk    | _        || _        |t          j                    }|| _        | j        st%          |||| d          | _        d S t)          | j        | j        d| j        || d          | _        t-          j        t          j        | j        d| j        z  | j        t2          j        | j        	                    | _        t-          j        t          j        | j        | j        | j        t2          j        | j        	                    | _        t;          | j        d
| j        i           t;          | j        d
| j        i           d S )Nr!   r   z.mlpr0   r1   r2   Fz.gate)r6   rX   r0   r2   r4   )devicedtypeweight_loader)r9   r:   r$   r   rW   r;   num_local_expertsnum_expertslayer_idnum_experts_per_toktop_kr<   moe_layer_frequencyis_moe_layerr1   torchget_default_dtyperX   r*   mlpr   gater   	Parameteremptyr   device_typewsw2sr   r]   )	rC   r-   rW   rX   r0   r1   r2   r`   rD   s	           rE   r:   zArcticMoE.__init__k   s    	&v..H"F"H"H!-!3 /
!'!9T\!I%\V-GG1L, 244L(  -	 )- 	  DHHH )  !.) '''  DI l$..$+7+   DG |$$*+7+   DH #T%7   #T%7    rF   paramloaded_weightweight_namer.   c                 |   t                      }|j        }| j        }t          ||z  |dz   |z            }|                    d          r||d d f         ||d|d d f<   |                    d          r||d d f         |||d|z  d d f<   |                    d          r|d d |f         ||d d d d f<   d S d S )Nr!   z	w1.weightr   z	w3.weightr4   z	w2.weight)r   datar<   sliceendswith)	rC   rn   ro   rp   r.   tp_rank
param_data
shard_sizeshards	            rE   r]   zArcticMoE.weight_loader   s    122Z
+
g
*Wq[J,FGG,, 	M5B5!!!85LJy!J,12,, 	DQqqqEJy*q:~"=qqq@A ,, 	B*75*AJy!!!QQQ'''	B 	BrF   rI   returnc                 v   |j         \  }}|                    d| j                  }|                     |          \  }}| j        dk    }t          ||| j        |          \  }}}	t          || j        | j        ||d          }
| j	        r| j
        dk    rt          |
          }
|
                    ||          S )Nr+   r!   )renormalizeT)inplace)shapeviewr;   rh   rb   r   r   rl   rm   r1   rW   r   )rC   rI   
num_tokensr;   router_logitsrK   do_normalizetopk_weightstopk_idstoken_expert_indicesfinal_hidden_statess              rE   local_moe_fusedzArcticMoE.local_moe_fused   s    "/"5
K%**2t/?@@99]33qzA~7A=$*,8
 8
 8
4h 4 ,GH
 
 
  	X4<!#3#3"BCV"W"W"''
K@@@rF   c                 j    | j         r|                     |          }n|                     |          }|S rH   )rd   r   rg   )rC   rI   r   s      rE   rL   zArcticMoE.forward   s<     	:"&"6"6}"E"E"&((="9"9""rF   )NNNTr,   )rM   rN   rO   __doc__r    rP   re   r\   r   rQ   rR   r:   r   ri   Tensorr]   r   rL   rS   rT   s   @rE   rV   rV   f   sD         #+/26#G GG tG kD(	G
 )4/G G G G G G G GRB|B |B 	B
 B B B B(AU\ Ael A A A A*#U\ # # # # # # # #rF   rV   c            	       v     e Zd Z	 	 	 ddededz  dedz  def fdZdej	        d	ej	        d
ej	        fdZ
 xZS )ArcticAttentionNr,   r-   cache_configr0   r2   c           
         t                                                       || _        |j        | _        t	                      }|j        | _        | j        |z  dk    sJ | j        |z  | _        |j        | _	        | j	        |k    r| j	        |z  dk    sJ n|| j	        z  dk    sJ t          d| j	        |z            | _        | j        | j        z  | _        | j        | j        z  | _        | j        | j        z  | _        |j        | _        | j        dz  | _        t#          | j        | j        | j        | j	        d|| d          | _        t'          | j        | j        z  | j        dd|| d	          | _        t+          | j        | j        |j        d
          | _        t1          | j        | j        | j        | j        ||| d          | _        d S )Nr   r!   g      Fz	.qkv_projr5   Tz.o_projr7   )max_positionrope_parametersis_neox_stylez.attn)num_kv_headsr   r0   r2   )r9   r:   r-   r;   r   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxr   head_dimq_sizekv_sizemax_position_embeddingsscalingr   qkv_projr   o_projr   r   
rotary_embr   attn)rC   r-   r   r0   r2   rW   rD   s         rE   r:   zArcticAttention.__init__   s    	!-688%9#g-2222-8"("<"g--*W499999T4499994#:g#EFF(D,@@nt}4(4=8'-'E$}d*)M #%'''
 
 
 ( 4=0%%%%
 
 
 #M5"2	
 
 
 NML*%%###
 
 
			rF   	positionsrI   ry   c                 "   |                      |          \  }}|                    | j        | j        | j        gd          \  }}}|                     |||          \  }}|                     |||          }|                     |          \  }	}|	S )Nr+   )dim)r   splitr   r   r   r   r   )
rC   r   rI   qkvrK   qkvattn_outputoutputs
             rE   rL   zArcticAttention.forward%  s    
 }--Q))T[$,E2)NN1ay!Q//1ii1a((KK,,	rF   NNr,   rM   rN   rO   r    r   r   rR   r:   re   r   rL   rS   rT   s   @rE   r   r      s         ,026=
 =
=
 "D(=
 )4/	=

 =
 =
 =
 =
 =
 =
~
<
 |
 
	
 
 
 
 
 
 
 
rF   r   c                   z     e Zd Z	 	 	 ddededz  dedz  deddf
 fdZd	ej	        d
ej	        dej	        fdZ
 xZS )ArcticDecoderLayerNr,   r-   r   r0   r2   ry   c                 <   t                                                       |j        | _        t          |          }|dz   |j        z  dk    }|j        o|| _        t          |||| d          | _        t          ||| j         | d          | _	        t          |j        |j                  | _        t          |j        |j                  | _        | j        r=t          |j        |j                  | _        t          |dd	| d
          | _        d S d S )Nr!   r   z
.self_attnr0   r2   z.block_sparse_moerZ   epsTFz.residual_mlp)r/   r1   r2   )r9   r:   r;   r$   rc   use_residualr   	self_attnrV   block_sparse_moer   rms_norm_epsinput_layernormpost_attention_layernormresidual_layernormr*   residual_mlp)rC   r-   r   r0   r2   	layer_idxrd   rD   s          rE   r:   zArcticDecoderLayer.__init__3  sd    	!-'//	!A)CCqH"/@L(%(((	
 
 
 !*% $ 11///	!
 !
 !
  'v'9v?RSSS(/F$7)
 )
 )
%  		&-"(;' ' 'D# !* $$ ///	! ! !D			 		rF   r   rI   c                    |}|                      |          }|                     ||          }||z   }|}| j        rp|                     |          }|                     |          }|}|                     |          }|                     |          }||z   }t          |          }||z   }n/|                     |          }|                     |          }||z   }|S )N)r   rI   )r   r   r   r   r   r   r   r   )rC   r   rI   residual_inputresidual_attnr   s         rE   rL   zArcticDecoderLayer.forward\  s    
 ',,];;' ' 
 
 '6% 	: 33MBBM --m<<M(L 99.IIM 11-@@M(=8M<]KKM)M9MM 99-HHM 11-@@M)M9MrF   r   r   rT   s   @rE   r   r   2  s         ,026' '' "D(' )4/	'
 ' 
' ' ' ' ' 'R< | 
	       rF   r   c                        e Zd Zdddedef fdZdej        dej        fdZ	 ddej        d
ej        de	d	z  dej        d	z  dej        e	z  f
dZ
 xZS )ArcticModelr,   r2   vllm_configr2   c                   t                                                       |j        j        |j        |j        j        | _        t          | j        j        | j                  | _	        t          j        fd| d          \  | _        | _        | _        j        | _        t!          j        j                  | _        t'          dgj                  | _        d S )N)org_num_embeddingsc                 *    t          |           S )Nr   )r   )r2   r   r-   r0   s    rE   <lambda>z&ArcticModel.__init__.<locals>.<lambda>  s     -l6   rF   z.layersr   r   rI   )r9   r:   model_config	hf_configr   r0   
vocab_sizer   r;   embed_tokensr'   num_hidden_layersstart_layer	end_layerlayers_attn_implementationr   r   normr&   make_empty_intermediate_tensors)rC   r   r2   r   r-   r0   rD   s      @@@rE   r:   zArcticModel.__init__|  s   )3"/"/ +2OV/DO
 
 
 9D$      %%%9
 9
 9
5$.$+ %+$?!F.F4GHHH	/Vv10
 0
,,,rF   	input_idsry   c                 ,    |                      |          S rH   )r   rC   r   s     rE   embed_input_idszArcticModel.embed_input_ids  s      +++rF   Nr   intermediate_tensorsinputs_embedsc                 J   t                      j        r||}n"|                     |          }n|J |d         }t          | j        | j        | j                  D ]} |||          }t                      j        st          d|i          S | 	                    |          }|S )NrI   )
r
   is_first_rankr   r   r   r   r   is_last_rankr   r   )rC   r   r   r   r   rI   layers          rE   rL   zArcticModel.forward  s     >>' 	B( - $ 4 4Y ? ?'3330AMDK)94>JJ 	< 	<E!E)];;MM~~* 	I&'GHHH		-00rF   rH   )rM   rN   rO   r	   rR   r:   re   r   r   r   rL   rS   rT   s   @rE   r   r   z  s        AC 
 
 
z 
3 
 
 
 
 
 
0, ,%, , , , , .2 < < 2D8	
 |d* 
+	+       rF   r   c                   &    e Zd Zdg diZdddedef fdZdej        d	ej        fd
Z		 	 ddej        dej        de
dz  dej        dz  d	ej        e
z  f
dZdej        d	ej        dz  fdZdeeeej        f                  d	ee         fdZ xZS )ArcticForCausalLMr   )q_projk_projv_projr,   r   r   r2   c          	         t                                                       |j        j        }|j        }|| _        t          |t          |d                    | _        |j	        | _	        t          | j	        |j        |t          |d                    | _        | j        j        r| j        j        j        | j        _        |j        | _        |j        | _        t'          |j	                  | _        | j        j        | _        d S )Nmodel)r   r2   lm_headr   )r9   r:   r   r   r0   r-   r   r(   r   r   r   r;   r   tie_word_embeddingsr   weightr^   r_   ra   r   logits_processorr   )rC   r   r2   r-   r0   rD   s        rE   r:   zArcticForCausalLM.__init__  s    )3"/ #L,I,I
 
 

 !+%O%	22	
 
 
 ;* 	A"&*"9"@DL!3#)#=  /0A B BJ6 	,,,rF   r   ry   c                 6    | j                             |          S rH   )r   r   r   s     rE   r   z!ArcticForCausalLM.embed_input_ids  s    z)))444rF   Nr   r   r   c                 6    |                      ||||          }|S rH   )r   )rC   r   r   r   r   rI   s         rE   rL   zArcticForCausalLM.forward  s)     

y"6
 
 rF   rI   c                 <    |                      | j        |          }|S rH   )r   r   )rC   rI   logitss      rE   compute_logitsz ArcticForCausalLM.compute_logits  s      &&t|]CCrF   weightsc                    g d}g }g }| j         j        }t          |          D ]}|                    d| dd| ddf           |                    d| dd| ddf           |dz  dk    rA|                    d| d	d| d
df           |                    d| d	d| ddf           t          | j         j                  D ]V}|                    dd| d|f           |                    dd| d|f           |                    dd| d|f           Wt          |                                           }t                      }	t          	                    d           |D ]\  }
}|D ]i\  }}}||
vr|

                    ||          }
|
                    d          r|
|vr;t          |
|           rL||
         }|j        } ||||            n|D ]O\  }}}||
vr|

                    ||          }
t          |
|           r2||
         }|j        } ||||            n|D ]Q\  }}}||
vr|

                    ||          }
t          |
|           r2||
         }|j        } |||||            nW|
                    d          r|
|vr3t          |
|           rE||
         }t          |dt                    } |||           |	                    |
           |	S )N))r   r   r   )r   r   r   )r   r   r   zlayers.z.residual_mlp.w13.weightz.residual_mlp.w1.weightr   z.residual_mlp.w3.weightr!   r4   z .block_sparse_moe.mlp.w13.weightz.block_sparse_moe.mlp.w1.weightz.block_sparse_moe.mlp.w3.weightrl   zexperts.z
.w1.weightrm   z
.w2.weightz
.w3.weightzIt will take ~10 minutes loading from the 16-bit weights. Alternatively, use the prequantized 8-bit weights of arctic and set load-format to `sharded_state` will accelerate loading.z.bias)r.   r]   )r-   r   rangeappendr^   dictnamed_parameterssetloggerinforeplacert   r%   r]   getattrr   add)rC   r   stacked_params_mappingmlp_params_mappingexpert_params_mapping
num_layersr   r.   params_dictloaded_paramsnamero   
param_namerp   shard_idrn   r]   s                    rE   load_weightszArcticForCausalLM.load_weights  su   "
 "
 "
 :<<>[2
:&& *	 *	E%%=e===<e<<<   %%=e===<e<<<   qyA~~"))I%IIIH%HHH   #))I%IIIH%HHH    "'t{'D!E!E 	 	I)00?)???K   *00 @9 @ @ @)L   *00?)???K   	 4002233"%%%N	
 	
 	

 $+ 1	$ 1	$D-5K /< /<1
Kd**||K<<==)) d+.E.E*466 #D) % 3e]H===9K !< !<5JX"$.. <<Z@@D.tT:: ! '-E$)$7M!M%AAAE=R < <9
K&d22$#||KDD24>> %$ +D 1(-(;%!=+    ==11 %d+6M6M$24>> %$ +D 1(/!?4I) ) &e];;;d####rF   )NN)rM   rN   rO   packed_modules_mappingr	   rR   r:   re   r   r   r   rL   r   r   tupler   r  rS   rT   s   @rE   r   r     sj       (*H*H*HIAC 
 
 
z 
3 
 
 
 
 
 
25 5%, 5 5 5 5 <@-1
 
<
 <
 2D8	

 |d*
 
+	+
 
 
 
| 
	   rHU33D-E$F r3s8 r r r r r r r rrF   r   )Fr   collections.abcr   	itertoolsr   re   r   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   r   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.fused_moer   r   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.model_executor.utilsr   vllm.platformsr   vllm.sequencer   &vllm.transformers_utils.configs.arcticr    
interfacesr"   r#   utilsr$   r%   r&   r'   r(   rM   r   Moduler*   rV   r   r   r   r    rF   rE   <module>r     s   - , $ $ $ $ $ $              * * * * * * = = = = = = / / / / / / / /            $ # # # # # < < < < < < J J J J J J J J 8 8 8 8 8 8            H G G G G G F F F F F F @ @ @ @ @ @        P O O O O O 6 6 6 6 6 6 + + + + + + - - - - - - ? ? ? ? ? ? 1 1 1 1 1 1 1 1              
X		, , , , ,	 , , ,^|# |# |# |# |#	 |# |# |#~J J J J Jbi J J JZE E E E E E E EP 0 0 0 0 0") 0 0 0fd d d d d	:} d d d d drF   