
    .`i7                        d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z.  G d dej/                  Z0 G d dej/                  Z1 G d dej/                  Z2e G d dej/                              Z3 G d dej/        e(e'          Z4dS ) z>Inference-only OLMo model compatible with HuggingFace weights.    )Iterable)isliceN)nn)
OlmoConfig)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc            	       z     e Zd ZdZ	 	 	 ddededz  dedz  def fdZd	e	j
        d
e	j
        de	j
        fdZ xZS )OlmoAttentionz
    This is the attention block where the output is computed as
    `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
    (plus another skip connection).
    N configcache_configquant_configprefixc           	         t                                                       || _        |j        | _        t	                      }|j        | _        | j        | j        z  dk    sJ | j        |z  dk    sJ | j        |z  | _        | j        | j        z  | _        |j	        | _	        |j
        | _
        t          | j        | j        | j        |j        || d          | _        t          | j        | j	        |j                  | _        | j        dz  | _        t%          | j        | j        | j        ||| d          | _        t)          | j        | j        |j        || d          | _        d S )	Nr   z	.qkv_projbiasr%   r&   )max_positionrope_parametersg      z.attn)scaler$   r%   r&   z.o_proj)super__init__r#   hidden_sizer   num_attention_headstotal_num_heads	num_headshead_dimmax_position_embeddingsclip_qkvr   attention_biasqkv_projr   r+   
rotary_embscalingr   attnr   o_proj)selfr#   r$   r%   r&    tensor_model_parallel_world_size	__class__s         s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/olmo.pyr.   zOlmoAttention.__init__G   s    	!-+O+Q+Q(%9$"66!;;;;#&FF!KKKK-1QQ(D,@@'-'E$ *M &%'''
 
 
 #M5"2
 
 

 }d*NM,%%###
 
 
	 (&%%%%
 
 
    	positionshidden_statesreturnc                 P   |                      |          \  }}| j        "|                    | j         | j                   |                    dd          \  }}}|                     |||          \  }}|                     |||          }|                     |          \  }	}|	S )N)minmax   )chunksdim)r7   r5   clamp_chunkr8   r:   r;   )
r<   rA   rB   qkv_qkvattn_outputoutputs
             r?   forwardzOlmoAttention.forward   s    
 }--Q=$JJDM>t}J===))1")--1ay!Q//1ii1a((KK,,	r@   NNr"   )__name__
__module____qualname____doc__r   r	   r   strr.   torchTensorrT   __classcell__r>   s   @r?   r!   r!   @   s          ,0266
 6
6
 "D(6
 )4/	6

 6
 6
 6
 6
 6
 6
p< | 
	       r@   r!   c                   `     e Zd ZdZ	 	 ddededz  def fdZdej	        d	ej	        fd
Z
 xZS )OlmoMLPz
    This is the MLP block where the output is computed as
    `MLP(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
    (plus another skip connection).
    Nr"   r#   r%   r&   c                 L   t                                                       || _        |j        | _        |j        | _        t          | j        | j        gdz  d|| d          | _        t                      | _        t          | j        | j        d|| d          | _
        d S )N   Fz.gate_up_projr(   z
.down_proj)r-   r.   r#   r/   intermediate_sizer   gate_up_projr   act_fnr   	down_proj)r<   r#   r%   r&   r>   s       r?   r.   zOlmoMLP.__init__   s     	!-!'!9 7#$q(%+++
 
 
 !ll +"%(((
 
 
r@   xrC   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)rd   re   rf   )r<   rg   gate_uprN   s       r?   rT   zOlmoMLP.forward   sF     &&q))
KK  ~~a  1r@   )Nr"   )rV   rW   rX   rY   r   r   rZ   r.   r[   r\   rT   r]   r^   s   @r?   r`   r`      s          37	
 

 )4/
 	
 
 
 
 
 
@< 
       r@   r`   c            
            e Zd ZdZ	 	 	 ddededz  dedz  def fdZd	e	j
        d
e	j
        dee	j
        ee	j
        e	j
        f         dz  f         fdZ xZS )OlmoDecoderLayerz
    This is a typical transformer block where the output is
    computed as `MLP(LN(x + Attention(LN(x))))`
    (plus another skip connection).
    Nr"   r#   r$   r%   r&   c                 6   t                                                       t          |||| d          | _        t	          ||| d          | _        t          j        |j        dd          | _	        t          j        |j        dd          | _
        d S )Nz
.self_attnr&   z.mlpFelementwise_affiner)   )r-   r.   r!   	self_attnr`   mlpr   	LayerNormr/   input_layernormpost_attention_layernorm)r<   r#   r$   r%   r&   r>   s        r?   r.   zOlmoDecoderLayer.__init__   s     	&L,&7L7L7L
 
 

 6<6HHH  "|5u 
  
  
 )+5u)
 )
 )
%%%r@   rA   rB   rC   c                     |}|                      |          }|                     ||          }||z   }|}|                     |          }|                     |          }||z   }|S ri   )rt   rq   ru   rr   )r<   rA   rB   residuals       r?   rT   zOlmoDecoderLayer.forward   st     !,,];;y-@@%0 !55mDD// =0r@   rU   )rV   rW   rX   rY   r   r	   r   rZ   r.   r[   r\   tuplerT   r]   r^   s   @r?   rl   rl      s          ,026
 

 "D(
 )4/	

 
 
 
 
 
 
0< | 
u|U5<#=>EE	F	       r@   rl   c                        e Zd Zdddedef fdZdej        dej        fdZ	 ddej        d
ej        de	d	z  dej        d	z  dej        e	z  f
dZ
deeeej        f                  dee         fdZ xZS )	OlmoModelr"   rn   vllm_configr&   c                   t                                                       |j        j        |j        |j        | _        t          j        j	                  | _
        t          j        fd| d          \  | _        | _        | _        t!          j        j	        dd          | _        t'          dgj	                  | _        d S )Nc                 *    t          |           S )Nrn   )rl   )r&   r$   r#   r%   s    r?   <lambda>z$OlmoModel.__init__.<locals>.<lambda>  s     +l6   r@   z.layersrn   Fro   rB   )r-   r.   model_config	hf_configr$   r%   r#   r   
vocab_sizer/   embed_tokensr   num_hidden_layersstart_layer	end_layerlayersr   rs   normr   make_empty_intermediate_tensors)r<   r{   r&   r$   r#   r%   r>   s      @@@r?   r.   zOlmoModel.__init__   s    )3"/"/2v1
 
 9D$      %%%9
 9
 9
5$.$+ L5u
 
 
	 0Wv10
 0
,,,r@   	input_idsrC   c                 ,    |                      |          S ri   )r   r<   r   s     r?   embed_input_idszOlmoModel.embed_input_ids  s      +++r@   NrA   intermediate_tensorsinputs_embedsc                 J   t                      j        r||}n"|                     |          }n|J |d         }t          | j        | j        | j                  D ]} |||          }t                      j        st          d|i          S | 	                    |          }|S )zN
        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
        NrB   )
r   is_first_rankr   r   r   r   r   is_last_rankr   r   )r<   r   rA   r   r   rB   layers          r?   rT   zOlmoModel.forward  s     >>' 	B( - $ 4 4Y ? ?'3330AM DK)94>JJ 	< 	<E!E)];;MM~~* 	I&'GHHH 		-00r@   weightsc                 ,   g d}t          |                     d                    }t                      }|D ]\  }}|D ]i\  }}}	||vr|                    ||          }|                    d          r||vr;t          ||           rL||         }
|
j        } ||
||	            nU|                    d          r||vrt          ||           r||         }
t          |
dt                    } ||
|           |	                    |           |S )N))r7   q_projrO   )r7   k_projrP   )r7   v_projrQ   )rd   	gate_projr   )rd   up_projr   F)remove_duplicatez.biasweight_loader)
dictnamed_parameterssetreplaceendswithr   r   getattrr   add)r<   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   s               r?   load_weightszOlmoModel.load_weights/  sl   "
 "
 "
 400%0HHII"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<==)) d+.E.E*466 #D) % 3e]H=== ==)) d+.E.E*466 #D) '@U V Ve]333d####r@   ri   )rV   rW   rX   r
   rZ   r.   r[   r\   r   r   rT   r   rx   r   r   r]   r^   s   @r?   rz   rz      s       AC 
 
 
z 
3 
 
 
 
 
 
4, ,%, , , , , .2 < < 2D8	
 |d* 
+	+   >#HU33D-E$F #3s8 # # # # # # # #r@   rz   c                   0    e Zd ZdZg dddgdZddded	ef fd
Zdej	        dej	        fdZ
	 	 ddej	        dej	        dedz  dej	        dz  dej	        ez  f
dZdej	        dej	        dz  fdZdeeeej	        f                  dee         fdZ xZS )OlmoForCausalLMz/
    Extremely barebones HF model wrapper.
    )r   r   r   r   r   )r7   rd   r"   rn   r{   r&   c          	         t                                                       |j        j        }|j        }|| _        t          |t          |d                    | _        |j	        r| j        j
        | _        n0t          |j        |j        |t          |d                    | _        t          |j                  | _        | j        j        | _        d S )Nmodel)r{   r&   lm_head)r%   r&   )r-   r.   r   r   r%   r#   rz   r   r   tie_word_embeddingsr   r   r   r   r/   r   logits_processorr   )r<   r{   r&   r#   r%   r>   s        r?   r.   zOlmoForCausalLM.__init__f  s    )3"/#L,I,I
 
 

 % 	:2DLL)!")#FI66	  DL !00A B BJ6 	,,,r@   r   rC   c                 6    | j                             |          S ri   )r   r   r   s     r?   r   zOlmoForCausalLM.embed_input_ids|  s    z)))444r@   NrA   r   r   c                 8    |                      ||||          }|S )N)r   rA   r   r   )r   )r<   r   rA   r   r   rB   s         r?   rT   zOlmoForCausalLM.forward  s0     

!5'	 # 
 
 r@   rB   c                 <    |                      | j        |          }|S ri   )r   r   )r<   rB   logitss      r?   compute_logitszOlmoForCausalLM.compute_logits  s      &&t|]CCr@   r   c                 l    t          | | j        j        rdgnd           }|                    |          S )Nzlm_head.weight)skip_prefixes)r   r#   r   r   )r<   r   loaders      r?   r   zOlmoForCausalLM.load_weights  sE    "&*k&EO!""4
 
 
 ""7+++r@   )NN)rV   rW   rX   rY   packed_modules_mappingr
   rZ   r.   r[   r\   r   r   rT   r   r   rx   r   r   r]   r^   s   @r?   r   r   U  s        

 
 
 

 
 BD 
 
 
z 
3 
 
 
 
 
 
,5 5%, 5 5 5 5 <@-1 < < 2D8	
 |d* 
+	+   | 
	   ,HU33D-E$F ,3s8 , , , , , , , ,r@   r   )5rY   collections.abcr   	itertoolsr   r[   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   r
   vllm.distributedr   r   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   r   utilsr   r   r   r   r   Moduler!   r`   rl   rz   r    r@   r?   <module>r      s  2 E D $ $ $ $ $ $              # # # # # # * * * * * * = = = = = = / / / / / / / / O O O O O O O O < < < < < <         
 H G G G G G F F F F F F @ @ @ @ @ @        P O O O O O - - - - - - 0 0 0 0 0 0 0 0             K K K K KBI K K K\. . . . .bi . . .b/ / / / /ry / / /d ` ` ` ` `	 ` ` `FG, G, G, G, G,bi\ G, G, G, G, G,r@   