
    .`i@                         d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z6 ddl7m8Z8  G d dej9                  Z: G d dej9                  Z; G d dej9                  Z<e G d  d!ej9                              Z= G d" d#ej9        e-e,          Z>dS )$z?Inference-only OLMo2 model compatible with HuggingFace weights.    )Iterable)partial)isliceN)nn)Olmo2Config)	Attention)support_torch_compile)
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size) tensor_model_parallel_all_gather)get_tensor_model_parallel_rank)split_tensor_along_last_dim)
SiluAndMul)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)SupportsLoRA
SupportsPP)AutoWeightsLoaderextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefix)IntermediateTensors)Olmo3Configc                        e Zd ZdZdddedef fdZdej        dej        d	e	ej        ej        f         fd
Z
dej        dej        d	ej        fdZ xZS )Olmo2Attentionz
    This is the attention block where the output is computed as
    `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
    (plus another skip connection).
     prefixvllm_configr(   c                :   t                                                       |j        j        | _        t          | j        t          t          f          sJ | j        j        }t                      | _
        | j        j        | _        || j        z  dk    sJ | j        | j
        z  dk    sJ | j        | j
        z  | _        | j        j        p| j        | _        | j        | j
        k    r| j        | j
        z  dk    sJ n| j
        | j        z  dk    sJ t!          d| j        | j
        z            | _        || j        z  | _        | j        | j        z  | _        | j        | j        z  | _        | j        j        | _        t-          || j        | j        | j        d|j        | d          | _        t3                      | _        t7          | j        | j        z  | j        j                  | _        t7          | j        j        | j        j                  | _        | j        dz  | _        tA          |          }d }tC          | j        dd           x}	 ||         d	k    r| j        j"        }tG          | j        | j        | j        | j        |j$        |j        || d
          | _%        || j        j&        }n| j        j&        d         }d|d}tO          | j        | j        |          | _(        tS          | j        | j        z  |d|j        | d          | _*        d S )Nr      Fz	.qkv_projbiasquant_configr(   epsg      layer_typessliding_attentionz.attn)num_kv_headscache_configr.   per_layer_sliding_windowr(   
rope_thetadefault)	rope_typer6   )max_positionrope_parametersz.o_proj)+super__init__model_config	hf_configconfig
isinstancer   r#   hidden_sizer   tp_sizenum_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxr3   head_dimq_sizekv_sizemax_position_embeddingsr   r.   qkv_projr   tp_rankr   rms_norm_epsk_normq_normscalingr   getattrsliding_windowr   r4   attnr:   r   
rotary_embr   o_proj)
selfr)   r(   rA   	layer_idxrT   r1   r:   r6   	__class__s
            t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/olmo2.pyr<   zOlmo2Attention.__init__L   s7   !.8$+['ABBBBBk-;==#{>T11Q6666#dl2a7777-=K+Ct/C 	 "dl22*T\9Q>>>>><$"99Q>>>>4#:dl#JKK#t';;nt}4(4=8'+{'J$ *M #$1'''
 
 
 677#dm3(
 
 
 dk54;;STTT}d*'//	"4;tDDDK%i04GGG![7NNML*$1$1%3###	
 	
 	
	 !"k9OO4\BJ,5ZPPO"M5+
 
 
 ( 4=0$1%%%
 
 
    qkreturnc                    | j         dk    rBt          |                                          }t          |                                          }|                     |          }|                     |          }| j         dk    rGt          t          | j                   } ||          | j                 } ||          | j                 }||fS )Nr+   )num_partitions)rB   r   
contiguousrQ   rP   r   r   rN   )rX   r]   r^   splitters       r[   _apply_qk_normzOlmo2Attention._apply_qk_norm   s     <!0@@A0@@AKKNNKKNN<!:4<XXXHDL)ADL)A!tr\   	positionshidden_statesc                 T   |                      |          \  }}|                    | j        | j        | j        gd          \  }}}|                     ||          \  }}|                     |||          \  }}|                     |||          }|                     |          \  }	}|	S )N)dim)rM   splitrJ   rK   rd   rV   rU   rW   )
rX   re   rf   qkv_r]   r^   vattn_outputoutputs
             r[   forwardzOlmo2Attention.forward   s    
 }--Q))T[$,E2)NN1a""1a((1y!Q//1ii1a((KK,,	r\   )__name__
__module____qualname____doc__r
   strr<   torchTensortuplerd   rp   __classcell__rZ   s   @r[   r%   r%   E   s          BD T
 T
 T
z T
3 T
 T
 T
 T
 T
 T
l"',	u|U\)	*   < | 
	       r\   r%   c                   V     e Zd ZdZdddedef fdZdej        dej        fd	Z	 xZ
S )
Olmo2MLPz
    This is the MLP block where the output is computed as
    `MLP(x)` in `LN(MLP(x + LN(Attention(x))))`
    (plus another skip connection).
    r&   r'   r)   r(   c                j   t                                                       |j        j        }t	          |t
          t          f          sJ |j        }|j        }t          ||gdz  d|j
        | d          | _        t                      | _        t          ||d|j
        | d          | _        d S )N   Fz.gate_up_projr,   z
.down_proj)r;   r<   r=   r>   r@   r   r#   rA   intermediate_sizer   r.   gate_up_projr   act_fnr   	down_proj)rX   r)   r(   r?   rA   r   rZ   s         r[   r<   zOlmo2MLP.__init__   s    )3&;"<=====("4 7!#$1+++
 
 
 !ll +$1(((
 
 
r\   xr_   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r   r   r   )rX   r   gate_uprl   s       r[   rp   zOlmo2MLP.forward   sF     &&q))
KK  ~~a  1r\   rq   rr   rs   rt   r
   ru   r<   rv   rw   rp   ry   rz   s   @r[   r|   r|      s          BD 
 
 
z 
3 
 
 
 
 
 
8< 
       r\   r|   c                   d     e Zd ZdZdddedef fdZdej        dej        d	ej        fd
Z	 xZ
S )Olmo2DecoderLayerz
    This is a typical transformer block where the output is
    computed as `MLP(LN(x + Attention(LN(x))))`
    (plus another skip connection).
    r&   r'   r)   r(   c                   t                                                       |j        j        }t	          |t
          t          f          sJ t          || d          | _        t          || d          | _
        t          |j        |j                  | _        t          |j        |j                  | _        d S )Nz
.self_attnr)   r(   z.mlpr/   )r;   r<   r=   r>   r@   r   r#   r%   	self_attnr|   mlpr   rA   rO   post_attention_layernormpost_feedforward_layernormrX   r)   r(   r?   rZ   s       r[   r<   zOlmo2DecoderLayer.__init__   s    )3&;"<====='#v,A,A,A
 
 

 vOOOLLL )0F$7)
 )
 )
% +2F$7+
 +
 +
'''r\   re   rf   r_   c                     |}|                      ||          }|                     |          }||z   }|}|                     |          }|                     |          }||z   }|S r   )r   r   r   r   )rX   re   rf   residuals       r[   rp   zOlmo2DecoderLayer.forward  st     !y-@@55mDD%0 !//77FF =0r\   r   rz   s   @r[   r   r      s          BD 
 
 
z 
3 
 
 
 
 
 
*< | 
	       r\   r   c                        e Zd Zdddedef fdZdej        dej        fdZ	 ddej        d
ej        de	d	z  dej        d	z  dej        e	z  f
dZ
deeeej        f                  dee         fdZ xZS )
Olmo2Modelr&   r'   r)   r(   c                   t                                                       j        j        | _        t          | j        t          t          f          sJ t          | j        j	        | j        j
        | d          | _        t          | j        j        fd| d          \  | _        | _        | _        t#          | j        j
        | j        j                  | _        t)          dg| j        j
                  | _        d S )Nz.embed_tokensr'   c                 &    t          |           S )Nr   )r   )r(   r)   s    r[   <lambda>z%Olmo2Model.__init__.<locals>.<lambda>(  s    ,VTTT r\   z.layersr/   rf   )r;   r<   r=   r>   r?   r@   r   r#   r   
vocab_sizerA   embed_tokensr    num_hidden_layersstart_layer	end_layerlayersr   rO   normr   make_empty_intermediate_tensors)rX   r)   r(   rZ   s    ` r[   r<   zOlmo2Model.__init__  s   !.8$+['ABBBBB2K"K#+++
 
 

 9DK)TTTT%%%9
 9
 9
5$.$+
 K#(
 
 
	 0Wt{60
 0
,,,r\   	input_idsr_   c                 ,    |                      |          S r   )r   rX   r   s     r[   embed_input_idszOlmo2Model.embed_input_ids3  s      +++r\   Nre   intermediate_tensorsinputs_embedsc                    t                      j        r||}n>|                     |          }n(|J |d         }t          |t          j                  sJ t          | j        | j        | j	                  D ]} |||          }t                      j
        st          d|i          S |                     |          }|S )zN
        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
        Nrf   )r   is_first_rankr   r@   rv   rw   r   r   r   r   is_last_rankr"   r   )rX   r   re   r   r   rf   layers          r[   rp   zOlmo2Model.forward6  s     >>' 	;( - !% 1 1) < < (3330AMmU\::::: DK)94>JJ 	< 	<E!E)];;MM~~* 	I&'GHHH 		-00r\   weightsc                 
   g d}t          |                     d                    }t                      }|D ]\  }}t          ||           r|D ]X\  }}}	||vr|                    ||          }|                    d          r||vr;||         }
|
j        } ||
||	            nD|                    d          r||vr||         }
t          |
dt                    } ||
|           |	                    |           |S )N))rM   q_projr]   )rM   k_projr^   )rM   v_projrm   )r   	gate_projr   )r   up_projr+   F)remove_duplicatez.biasweight_loader)
dictnamed_parameterssetr   replaceendswithr   rS   r   add)rX   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   s               r[   load_weightszOlmo2Model.load_weightsZ  sW   "
 "
 "
 400%0HHII"%%%#* 	$ 	$D-&tT22 5K 4 41
Kd**||K<<==)) d+.E.E#D) % 3e]H=== ==)) d+.E.E#D) '@U V Ve]333d####r\   r   )rq   rr   rs   r
   ru   r<   rv   rw   r   r"   rp   r   rx   r   r   ry   rz   s   @r[   r   r     s       AC 
 
 
z 
3 
 
 
 
 
 
., ,%, , , , , .2" "<" <" 2D8	"
 |d*" 
+	+" " " "H"HU33D-E$F "3s8 " " " " " " " "r\   r   c                        e Zd ZdZg dddgdZddded	ef fd
Zdej	        dej	        fdZ
	 	 ddej	        dej	        dedz  dej	        dz  dej	        ez  f
dZdej	        dej	        dz  fdZdeeeej	        f                  fdZ xZS )Olmo2ForCausalLMz/
    Extremely barebones HF model wrapper.
    )r   r   r   r   r   )rM   r   r&   r'   r)   r(   c          	         t                                                       |j        j        }t	          |t
          t          f          sJ || _        t          |t          |d                    | _
        |j        r| j
        j        | _        n5t          |j        |j        |j        t          |d                    | _        t%          |j                  | _        | j
        j        | _        d S )Nmodelr   lm_head)r.   r(   )r;   r<   r=   r>   r@   r   r#   r?   r   r!   r   tie_word_embeddingsr   r   r   r   rA   r.   r   logits_processorr   r   s       r[   r<   zOlmo2ForCausalLM.__init__  s    )3&;"<=====#L,I,I
 
 

 % 	:2DLL)!"(5#FI66	  DL !00A B BJ6 	,,,r\   r   r_   c                 6    | j                             |          S r   )r   r   r   s     r[   r   z Olmo2ForCausalLM.embed_input_ids  s    z)))444r\   Nre   r   r   c                 8    |                      ||||          }|S )N)r   re   r   r   )r   )rX   r   re   r   r   rf   s         r[   rp   zOlmo2ForCausalLM.forward  s0     

!5'	 # 
 
 r\   rf   c                 <    |                      | j        |          }|S r   )r   r   )rX   rf   logitss      r[   compute_logitszOlmo2ForCausalLM.compute_logits  s      &&t|]CCr\   r   c                 l    t          | | j        j        rdgnd           }|                    |          S )Nzlm_head.weight)skip_prefixes)r   r?   r   r   )rX   r   loaders      r[   r   zOlmo2ForCausalLM.load_weights  sE    "&*k&EO!""4
 
 
 ""7+++r\   )NN)rq   rr   rs   rt   packed_modules_mappingr
   ru   r<   rv   rw   r   r"   rp   r   r   rx   r   ry   rz   s   @r[   r   r     sz        

 
 
 

 
 BD 
 
 
z 
3 
 
 
 
 
 
,5 5%, 5 5 5 5 <@-1 < < 2D8	
 |d* 
+	+   | 
	   ,HU33D-E$F , , , , , , , ,r\   r   )?rt   collections.abcr   	functoolsr   	itertoolsr   rv   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr	   vllm.configr
   vllm.distributedr   r   !vllm.distributed.communication_opr   vllm.distributed.parallel_stater   vllm.distributed.utilsr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   %vllm.model_executor.models.interfacesr   r    vllm.model_executor.models.utilsr   r   r   r   r    r!   vllm.sequencer"   vllm.transformers_utils.configsr#   Moduler%   r|   r   r   r    r\   r[   <module>r      sR  2 F E $ $ $ $ $ $                    $ $ $ $ $ $ * * * * * * = = = = = = " " " " " " O O O O O O O O N N N N N N J J J J J J > > > > > > < < < < < < 8 8 8 8 8 8         
 H G G G G G @ @ @ @ @ @        P O O O O O J J J J J J J J                . - - - - - 7 7 7 7 7 7v v v v vRY v v vr* * * * *ry * * *Z, , , , ,	 , , ,^ a a a a a a a aHG, G, G, G, G,ry*l G, G, G, G, G,r\   