
    .`iX              	          d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<  G d dej=                  Z> G d dej=                  Z? G d dej=                  Z@	 d*d ZA eeA!           G d" d#ej=                              ZB G d$ d%ej=        e3e4e1e2          ZC G d& d' e/eC                    ZD G d( d) e.eC                    ZEdS )+z?Inference-only LLaMA model compatible with HuggingFace weights.    )Iterable)isliceN)nn)LlamaConfig)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)EncoderOnlyAttention)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors)AttentionType   )as_embedding_modelas_seq_cls_model)SupportsEagleSupportsEagle3SupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayerextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                   \     e Zd Z	 	 	 	 	 ddededededz  d	ed
edededdf fdZd Z xZ	S )LlamaMLPNF Thidden_sizeintermediate_size
hidden_actquant_configbiasprefixreduce_results
disable_tpreturnc	           
         t                                                       t          ||gdz  |||| d          | _        t	          ||||||| d          | _        |dk    rt          d| d          t                      | _        d S )	N   .gate_up_proj)
input_sizeoutput_sizesr1   r0   r4   r2   z
.down_proj)r9   output_sizer1   r0   r3   r4   r2   siluzUnsupported activation: z!. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)
selfr-   r.   r/   r0   r1   r2   r3   r4   	__class__s
            t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/llama.pyr>   zLlamaMLP.__init__Q   s     	6"+,q0%!+++
 
 
 +(#%)!(((
 
 
 X:XXX   !ll    c                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r?   rB   r@   )rC   x_s      rE   forwardzLlamaMLP.forwardt   sB      ##1KKNN~~a  1rF   )NFr,   TF)
__name__
__module____qualname__intstrr   boolr>   rK   __classcell__rD   s   @rE   r+   r+   P   s         37# !# !#!# !# 	!#
 )4/!# !# !# !# !# 
!# !# !# !# !# !#F      rF   r+   c                        e Zd Zddddddej        fdedededed	ed
edz  dedede	dz  de
de
ddf fdZdej        dej        dej        fdZded
edz  ddfdZ xZS )LlamaAttention    NFr,   configr-   	num_headsnum_kv_headsmax_position_embeddingsr0   r1   bias_o_projcache_configr2   	attn_typer5   c                    t                                                       t          |
          }|| _        t	                      }|| _        | j        |z  dk    sJ | j        |z  | _        || _        | j        |k    r| j        |z  dk    sJ n|| j        z  dk    sJ t          d| j        |z            | _	        t          |dd           }|p| j        | j        z  | _        | j        | j        z  | _        | j	        | j        z  | _        | j        dz  | _        || _        t!          || j        | j        | j        |||
 d          | _        t%          | j        | j        z  ||||
 d          | _        |                     ||	           d }t          |d
d           x}rUt+          |d          r||j        z
  }n|}|t/          |          k     sJ d| d|             ||         dk    }|r|j        }|t2          j        k    rt6          nt8          } || j        | j        | j        | j	        |	||||
 d	  	        | _        d S )Nr   r   head_dimg      	.qkv_proj)r-   	head_sizetotal_num_headstotal_num_kv_headsr1   r0   r2   z.o_proj)r9   r;   r1   r0   r2   r0   layer_typestarget_layer_countzeffective_layer_idx: z# is out of bounds for layer_types: sliding_attentionz.attn)rY   r\   r0   per_layer_sliding_windowr]   r2   )r=   r>   r%   r-   r   rb   rX   rc   maxrY   getattrr_   q_sizekv_sizescalingrZ   r   qkv_projr   o_proj_init_rotary_embhasattrrf   lensliding_windowr   ENCODER_ONLYr   r   attn)rC   rW   r-   rX   rY   rZ   r0   r1   r[   r\   r2   r]   	layer_idxtp_sizer_   rs   re   effective_layer_idx
is_slidingattn_clsrD   s                       rE   r>   zLlamaAttention.__init__|   s    	'//	&688(#g-2222-8"."g-- *W499999 T4499994#:g#EFF6:t44 LD$48L$Lnt}4(4=8}d*'>$)#m 0#6%'''
 
 
 (+dm;#%%%%
 
 
 	f<@@@!&->>>; 	7 v344 0 '0&2K&K## '0#&[)9)9999C(; C C5@C C :99
 %%89=PPJ 7!'!6 M666 !  	 HNML*%%%3###

 

 

			rF   	positionshidden_statesc                 "   |                      |          \  }}|                    | j        | j        | j        gd          \  }}}|                     |||          \  }}|                     |||          }|                     |          \  }	}|	S )N)dim)rn   splitrk   rl   
rotary_embru   ro   )
rC   r{   r|   qkvrJ   qkvattn_outputoutputs
             rE   rK   zLlamaAttention.forward   s    
 }--Q))T[$,E2)NN1ay!Q//1ii1a((KK,,	rF   c           	          d}|o|                                 dk    }|r|j        dk    rd}t          | j        | j        t          |dd           |          | _        d S )NTggufllamaFrope_parameters)max_positionr   is_neox_style)get_name
model_typer   r_   rZ   rj   r   )rC   rW   r0   r   is_ggufs        rE   rp   zLlamaAttention._init_rotary_emb   sv    
 D<#8#8#:#:f#D 	"v(G33!M"M5#F,=tDD'	
 
 
rF   )rL   rM   rN   r   DECODERr   rO   r   rQ   r	   rP   r>   torchTensorrK   rp   rR   rS   s   @rE   rU   rU   {   s[        (,26!+/&.`
 `
`
 `
 	`

 `
 "%`
 )4/`
 `
 `
 "D(`
 `
 `
 
`
 `
 `
 `
 `
 `
D
<
 |
 
	
 
 
 


 )4/
 
	
 
 
 
 
 
 
 
rF   rU   c                        e Zd Zddefdedededz  deej	                 ddf
 fdZ
d	ej        d
ej        dej        dz  deej        ej        f         fdZdededz  fdZ xZS )LlamaDecoderLayerr,   Nvllm_configr2   rW   attn_layer_typer5   c                    t                                                       |p|j        j        }|j        }|                     |          }|j        | _        t          |dd          }t          |dd          pt          |dd          }|}	t          |d          r|j	        }t          |dd          rt          j        }
nt          j        }
 ||| j        |j        t          |d	|j                  ||||	|| d
|
          | _        t          | j        |j        |j        |t          |dd          | d          | _        t'          |j        |j                  | _        t'          |j        |j                  | _        d S )NrZ   rV   attention_biasFr1   qkv_bias	is_causalTnum_key_value_headsz
.self_attn)rW   r-   rX   rY   rZ   r0   r1   r[   r\   r2   r]   mlp_biasz.mlp)r-   r.   r/   r0   r1   r2   eps)r=   r>   model_config	hf_configr\   get_quant_configr-   rj   rq   r   r   r   rt   num_attention_heads	self_attnr+   r.   r/   mlpr   rms_norm_epsinput_layernormpost_attention_layernorm)rC   r   r2   rW   r   r\   r0   rZ   r   r[   r]   rD   s              rE   r>   zLlamaDecoderLayer.__init__   s    	=;3="/,,[99!-")&2KT"R"R !)95AA 
WFEF
 F
 %6:&& 	-#_N 6;-- 	3%-II%2I((0 -v/I  %<%#%(((
 
 
 ($6(%U33???
 
 
  'v'9v?RSSS(/F$7)
 )
 )
%%%rF   r{   r|   residualc                     ||}|                      |          }n|                      ||          \  }}|                     ||          }|                     ||          \  }}|                     |          }||fS )N)r{   r|   )r   r   r   r   )rC   r{   r|   r   s       rE   rK   zLlamaDecoderLayer.forward;  s     $H 00??MM&*&:&:=(&S&S#M8-XX #'"?"?x"X"Xx//h&&rF   c                     |j         S )z?Get quantization config for this layer. Override in subclasses.rd   )rC   r   s     rE   r   z"LlamaDecoderLayer.get_quant_configN  s    ''rF   )rL   rM   rN   rU   r
   rP   r   typer   Moduler>   r   r   tuplerK   r   r   rR   rS   s   @rE   r   r      s        %)+9<
 <
<
 <
 d"	<

 bi<
 
<
 <
 <
 <
 <
 <
|'<' |' ,%	'
 
u|U\)	*' ' ' '&(J (;MPT;T ( ( ( ( ( ( ( (rF   r   c                     | Jt          j        |                                d         |                                 d         k               dS dS )zShape invariants for Llama model compilation, those are translated to
    runtime assertions for unbacked dynamic shapes and are compiled away for
    backedNr   )r   _checksize)	input_idsr{   intermediate_tensorsinputs_embedss       rE   llama_model_invariantsr   S  sK     Y^^%%a(INN,<,<Q,??@@@@@ rF   )shape_invariantsc                   F    e Zd Zdeddededeej                 f fdZ	de
j        de
j        fd	Z	 dde
j        d
z  de
j        ded
z  de
j        d
z  de
j        ez  ee
j        ee
j                 f         z  f
dZdeeee
j        f                  dee         fdZ xZS )
LlamaModelr,   r2   
layer_typer   r2   r   c                   t                                                       j        j        }j        }|| _        || _        |j        | _        t                      j        s|j	        r5t                      j
        r"t          | j        |j        |          | _        nt                      | _        t          |j        fd| d          \  | _        | _        | _        t                      j
        r!t)          |j        |j                  | _        nt                      | _        t/          t0          df                     | _        t5          ddg|j                  | _        d S )	Nrd   c                      |           S )N)r   r2    )r2   r   r   s    rE   <lambda>z%LlamaModel.__init__.<locals>.<lambda>  s    ::+fMMM rF   z.layers)r2   r   .r|   r   )r=   r>   r   r   r0   rW   
vocab_sizer   is_first_ranktie_word_embeddingsis_last_rankr   r-   embed_tokensr$   r(   num_hidden_layersstart_layer	end_layerlayersr   r   normr   rO   aux_hidden_state_layersr'   make_empty_intermediate_tensors)rC   r   r2   r   rW   r0   rD   s    ` `  rE   r>   zLlamaModel.__init__c  sd    	)3"/( +>>' 		1&		1+7>>+F		1 !7")! ! !D !/ 0 0D8C$MMMMM%%%9
 9
 9
5$.$+
 >>& 	) 28KLLLDII&((DI',S#X'8'8$/Vj)6+=0
 0
,,,rF   r   r5   c                 ,    |                      |          S rH   )r   rC   r   s     rE   embed_input_idszLlamaModel.embed_input_ids  s      +++rF   Nr{   r   r   c                     t                      j        r||}n|                     |          }d }n|J |d         }|d         }g }t          t	          | j        | j        | j                            D ]4\  }	}
|	| j        v r|	                    ||z               |
|||fi |\  }}5t                      j
        st          ||d          S |                     ||          \  }}t          |          dk    r||fS |S )Nr|   r   )r|   r   r   )r   r   r   	enumerater   r   r   r   r   appendr   r   r   rr   )rC   r   r{   r   r   extra_layer_kwargsr|   r   aux_hidden_statesidxlayerrJ   s               rE   rK   zLlamaModel.forward  sV    >>' 		8( - $ 4 4Y ? ?HH'3330AM+J7H#4; 0$.AA
 
 	 	JC d222!(()ABBB&+e=(' '6H' '#M88 ~~* 	&"/XFF    99]H==q !!A%% "333rF   weightsc                    g d}t          |                                           }t                      }|D ]\  }}d|v rd|v sd|v r| j        ~| j                            |          x}rb||         }t          |dt                    }	|                                dk    r|n|d         } |	||           |                    |           d|v sd|v rt          ||          }||D ]i\  }
}}||vr|
                    ||
          }|                    d	          r||vr;t          ||           rL||         }|j        }	 |	|||            nW|                    d	          r||vr;t          ||           rM||         }t          |dt                    }	 |	||           |                    |           |S )
N))r`   z.q_projr   )r`   z.k_projr   )r`   z.v_projr   )r8   z
.gate_projr   )r8   z.up_projr   zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedweight_loaderr   scale
zero_pointz.bias)dictnamed_parameterssetr0   get_cache_scalerj   r   r   addr   replaceendswithr&   r   )rC   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_nameweight_nameshard_ids                rE   load_weightszLlamaModel.load_weights  sR   "
 "
 "
 4002233"%%%#* 2	$ 2	$D-$,,&$..2IT2Q2Q  ,"/??EEE
 - $J/ '@U V V%2%6%6%8%8A%=%=MM=QRCS  e]333!!*---$,$"6"60{CC<5K 4 41
Kd**||K<<==)) d+.E.E*466 #D) % 3e]H=== ==)) d+.E.E*466 #D) '@U V Ve]333d####rF   rH   )rL   rM   rN   r   r
   rP   r   r   r   r>   r   r   r   r   r   listrK   r   r   r   rR   rS   s   @rE   r   r   ]  sU        &7)
 )
 )
  )
 	)

 O)
 )
 )
 )
 )
 )
V, ,%, , , , , .2& &<$&& <& 2D8	&
 |d*& 
+	+eEL$u|BT4T.U	U& & & &P>HU33D-E$F >3s8 > > > > > > > >rF   r   c                       e Zd Zg dddgdZdddZded	d
ededee	j
                 f fdZdeedf         ddfdZdeedf         fdZdefd
ededee	j
                 fdZdej        dej        fdZ	 	 ddej        dej        dedz  dej        dz  dej        ez  f
dZdej        dej        dz  fdZdeeeej        f                  dee         fdZ xZS ) LlamaForCausalLM)q_projk_projv_proj	gate_projup_proj)rn   r?   input_embeddingsoutput_embeddings)r   lm_headr,   r   r   r2   r   c          	      Z   t                                                       |j        j        }|j        }|| _        |                     |t          |d          |          | _        t                      j
        rt          |j        |j        |t          |d                    | _        |j        r)| j                            | j        j                  | _        t%          |dd          }t'          |j        |          | _        nt+                      | _        | j        j        | _        d S )Nmodelr   r2   r   r   )r0   r2   logit_scaleg      ?)r   )r=   r>   r   r   r0   rW   _init_modelr)   r   r   r   r   r   r-   r   r   tie_weightsr   rj   r   logits_processorr$   r   )rC   r   r2   r   rW   r0   r   rD   s          rE   r>   zLlamaForCausalLM.__init__  s+    	)3"/%%#00! & 
 

 >>& 	,)!")#FI66	  DL ) Q#|77
8OPP!&-==K$3!% % %D!! *++DL J6 	,,,rF   r   .r5   Nc                     || j         _        d S rH   )r   r   )rC   r   s     rE   set_aux_hidden_state_layersz,LlamaForCausalLM.set_aux_hidden_state_layers/  s    -3
***rF   c                 J    t          | j        j                  }d|dz  |dz
  fS )zOverride to return default layers for Llama

        Note: The GPU model runner will override this with layers from
        the speculative config if available, providing dynamic configuration.
        r7      )rr   r   r   )rC   
num_layerss     rE   "get_eagle3_aux_hidden_state_layersz3LlamaForCausalLM.get_eagle3_aux_hidden_state_layers2  s+     *++
:?JN33rF   c                 &    t          |||          S )Nr   )r   )rC   r   r2   r   s       rE   r   zLlamaForCausalLM._init_model;  s     k&ZXXXXrF   r   c                 6    | j                             |          S rH   )r   r   r   s     rE   r   z LlamaForCausalLM.embed_input_idsC  s    z)))444rF   r{   r   r   c                 6    |                      ||||          }|S rH   )r   )rC   r   r{   r   r   model_outputs         rE   rK   zLlamaForCausalLM.forwardF  s)     zzy"6
 
 rF   r|   c                 <    |                      | j        |          }|S rH   )r   r   )rC   r|   logitss      rE   compute_logitszLlamaForCausalLM.compute_logitsR  s      &&t|]CCrF   r   c                 l    t          | | j        j        rdgnd           }|                    |          S )Nzlm_head.)skip_prefixes)r#   rW   r   r   )rC   r   loaders      rE   r   zLlamaForCausalLM.load_weightsY  sC    "+/;+JTJ<<PT
 
 
 ""7+++rF   NN)rL   rM   rN   packed_modules_mappingembedding_modulesr   r
   rP   r   r   r   r>   r   rO   r   r  r   r   r   r   r   rK   r
  r   r   r   rR   rS   s   @rE   r   r     s7        322$i0  +&  &7%
 %
 %
  %
 	%

 O%
 %
 %
 %
 %
 %
N4%S/ 4d 4 4 4 44E#s(O 4 4 4 4 &7	Y YY Y O	Y Y Y Y5 5%, 5 5 5 5 <@-1
 
<
 <
 2D8	

 |d*
 
+	+
 
 
 
| 
	   ,HU33D-E$F ,3s8 , , , , , , , ,rF   r   c                       e Zd ZdS )+LlamaBidirectionalForSequenceClassificationNrL   rM   rN   r   rF   rE   r  r  a           	DrF   r  c                       e Zd ZdS )LlamaBidirectionalModelNr  r   rF   rE   r  r  g  r  rF   r  r  )F__doc__collections.abcr   	itertoolsr   r   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   r
   vllm.distributedr   r   %vllm.model_executor.layers.activationr   ;vllm.model_executor.layers.attention.encoder_only_attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   vllm.v1.attention.backendr   adaptersr   r   
interfacesr   r    r!   r"   utilsr#   r$   r%   r&   r'   r(   r)   r   r+   rU   r   r   r   r   r  r  r   rF   rE   <module>r-     sM  2 F E $ $ $ $ $ $              $ $ $ $ $ $ * * * * * * = = = = = = / / / / / / / / O O O O O O O O < < < < < <      9 8 8 8 8 8         
 H G G G G G F F F F F F @ @ @ @ @ @               . - - - - - 3 3 3 3 3 3 : : : : : : : :                            ( ( ( ( (ry ( ( (V~
 ~
 ~
 ~
 ~
RY ~
 ~
 ~
BT( T( T( T( T(	 T( T( T(p DHA A A A  ,  
U U U U U U U 
Upd, d, d, d, d,I|Zd, d, d,N	 	 	 	 	2B2BCS2T2T 	 	 		 	 	 	 	001ABB 	 	 	 	 	rF   