
     `i                        d dl mZmZ d dlZd dlmZ d dlmc mZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZmZmZ d
dlmZ  ej        e          Z G d dej                  Z  G d de          Z!ddZ" G d de          Z# G d de          Z$ G d de          Z% G d de          Z& G d de          Z'g dZ(dS )    )CallableOptionalN   )Cache)ALL_ATTENTION_FUNCTIONS)logging)deprecate_kwarg   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forwardrotate_half   )
OlmoConfigc                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )OlmoLayerNormz/LayerNorm but with no learnable weight or bias.hidden_sizereturnNc                 X    t                                                       |f| _        d S )N)super__init__normalized_shape)selfr   	__class__s     y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/olmo/modular_olmo.pyr   zOlmoLayerNorm.__init__   s)    !,    hidden_statesc                     |j         }t          j        |                    t          j                  | j        d d d                              |          S )N)dtypegh㈵>)eps)r#   F
layer_normtotorchfloat32r   )r   r!   
orig_dtypes      r   forwardzOlmoLayerNorm.forward"   sS    "(
|M,,5=,AA4CXZ^`djnooorr
 
 	
r    )
__name__
__module____qualname____doc__intr   r(   Tensorr+   __classcell__r   s   @r   r   r      sw        99/C /D / / / / / /
U\ 
el 
 
 
 
 
 
 
 
r    r   c                        e Zd Z fdZ xZS )OlmoMLPc                 .   t                                          |           t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        d S )NF)bias)	r   r   nnLinearr   intermediate_size	gate_projup_proj	down_projr   configr   s     r   r   zOlmoMLP.__init__*   s|       4#3T5KRWXXXy!143IPUVVV4#94;KRWXXXr    )r,   r-   r.   r   r2   r3   s   @r   r5   r5   )   sA        Y Y Y Y Y Y Y Y Yr    r5   c                 &   | j         |j         }}|                    |          }|                    |          }| |z  t          |           |z  z   }||z  t          |          |z  z   }	|                    |          |	                    |          fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r#   	unsqueezer   r'   )
qkcossinposition_idsunsqueeze_dimq_typek_typeq_embedk_embeds
             r   apply_rotary_pos_embrL   1   s    ( WagFF
--
&
&C
--
&
&C3w;q>>C/0G3w;q>>C/0G::fwzz&1111r    c                       e Zd Z eddd          	 	 ddej        deej        ej        f         deej                 dee         d	eej	                 d
eej        eej                 f         fd            Z
dS )OlmoAttentionpast_key_valuepast_key_valuesz4.58)new_nameversionNr!   position_embeddingsattention_maskcache_positionr   c                 p   |j         d d         }g |d| j        R }|                     |          }	|                     |          }
|                     |          }| j        j        |	                    | j        j         | j        j                   |
                    | j        j         | j        j                   |                    | j        j         | j        j                   |	                    |          	                    dd          }	|
                    |          	                    dd          }
|                    |          	                    dd          }|\  }}t          |	|
||          \  }	}
|&|||d}|                    |
|| j        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        sdn| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )	N)minmaxr   r
   )rE   rD   rU   eagerg        )dropoutscaling)shapehead_dimq_projk_projv_projr?   clip_qkvclamp_view	transposerL   update	layer_idxr   _attn_implementationr   trainingattention_dropoutr\   reshape
contiguouso_proj)r   r!   rS   rT   rP   rU   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrD   rE   cache_kwargsattention_interfaceattn_outputattn_weightss                     r   r+   zOlmoAttention.forwardN   s_    $)#2#.88b8$-88{{=11[[//
{{=11;+T[%9$9t{?STTT4;#7"7T[=QRRRT[%9$9t{?STTT#((66@@AFF__\22<<QBB
#((66@@AFF&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7	%
  $}HCC$2HL	%
 	%
 	%
 	%
!\ *k);;;;;;FFHHkk+..L((r    )NN)r,   r-   r.   r	   r(   r1   tupler   r   
LongTensorr+    r    r   rN   rN   M   s        _%0A6RRR ,0592) 2)|2) #5<#=>2) !.	2)
 "%2) !!122) 
u|Xel33	42) 2) 2) SR2) 2) 2)r    rN   c                   (     e Zd Zdedef fdZ xZS )OlmoDecoderLayerr?   rg   c                     t                                          ||           t          |j                  | _        t          |j                  | _        t          ||          | _        d S )N)r?   rg   )r   r   r   r   input_layernormpost_attention_layernormrN   	self_attn)r   r?   rg   r   s      r   r   zOlmoDecoderLayer.__init__   s[    +++,V-?@@(5f6H(I(I%&f	JJJr    )r,   r-   r.   r   r0   r   r2   r3   s   @r   r|   r|      sW        Kz Kc K K K K K K K K K Kr    r|   c                       e Zd Zd ZdS )OlmoRotaryEmbeddingc                    | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	||	fcd d d            S # 1 swxY w Y   d S )
Nr   rW   r   mpscpuF)device_typeenabledr
   )dim)inv_freqfloatexpandr]   r'   device
isinstancetypestrr(   autocastre   catrD   attention_scalingrE   )
r   xrF   inv_freq_expandedposition_ids_expandedr   freqsembrD   rE   s
             r   r+   zOlmoRotaryEmbedding.forward   s    M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	 	&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C8	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   BE//E36E3N)r,   r-   r.   r+   rz   r    r   r   r      s#        
 
 
 
 
r    r   c                   $     e Zd Zdef fdZ xZS )	OlmoModelr?   c                     t                                                     t          j        fdt	          j                  D                       | _        t          j                  | _	        d S )Nc                 0    g | ]}t          |          S rz   )r|   ).0rg   r?   s     r   
<listcomp>z&OlmoModel.__init__.<locals>.<listcomp>   s$    bbbYfi00bbbr    )
r   r   r8   
ModuleListrangenum_hidden_layerslayersr   r   normr>   s    `r   r   zOlmoModel.__init__   si       mbbbb%H`BaBabbb
 
 "&"455			r    )r,   r-   r.   r   r   r2   r3   s   @r   r   r      sD        6z 6 6 6 6 6 6 6 6 6 6r    r   c                       e Zd ZdS )OlmoForCausalLMN)r,   r-   r.   rz   r    r   r   r      s        Dr    r   )r   r   OlmoPreTrainedModel)Nr   ))typingr   r   r(   torch.nnr8   torch.nn.functional
functionalr%   cache_utilsr   modeling_utilsr   utilsr   utils.deprecationr	   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_olmor   
get_loggerr,   loggerModuler   r5   rL   rN   r|   r   r   r   __all__rz   r    r   <module>r      s   % % % % % % % %                             5 5 5 5 5 5       0 0 0 0 0 0	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 + * * * * * 
	H	%	%
 
 
 
 
BI 
 
 
Y Y Y Y Yh Y Y Y2 2 2 284) 4) 4) 4) 4)N 4) 4) 4)nK K K K K( K K K    .   6 6 6 6 6
 6 6 6	 	 	 	 	& 	 	 	  r    