
     `i|+                        d Z ddlmZmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ  ej         e!          Z"dZ#dZ$ G d dej%                  Z&d$dZ' G d dej%                  Z( G d de          Z) G d de          Z* G d de          Z+ G d d e          Z, G d! d"e          Z-g d#Z.dS )%zPyTorch Phi-3 model.    )CallableOptionalN)nn   )ACT2FN)Cache)GenerationMixin)FlashAttentionKwargs)ALL_ATTENTION_FUNCTIONS)Unpack)logging)deprecate_kwarg   )MistralDecoderLayerMistralForCausalLM MistralForSequenceClassificationMistralForTokenClassificationMistralPreTrainedModeleager_attention_forwardrotate_half   )
Phi3Configz microsoft/Phi-3-mini-4k-instructr   c                   B     e Zd Z fdZdej        dej        fdZ xZS )Phi3MLPc                 "   t                                                       || _        t          j        |j        d|j        z  d          | _        t          j        |j        |j        d          | _        t          |j
                 | _        d S )Nr   Fbias)super__init__configr   Linearhidden_sizeintermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fn)selfr    	__class__s     y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/phi3/modular_phi3.pyr   zPhi3MLP.__init__2   sz    If&8!f>V:V]bccc6#;V=OV[\\\#F$56    hidden_statesreturnc                     |                      |          }|                    dd          \  }}||                     |          z  }|                     |          S )Nr   dim)r$   chunkr'   r%   )r(   r,   	up_statesgates       r*   forwardzPhi3MLP.forward:   sX    %%m44	#//!/44i 2 24 8 88	~~i(((r+   )__name__
__module____qualname__r   torchFloatTensorr5   __classcell__r)   s   @r*   r   r   1   s`        7 7 7 7 7)U%6 )5;L ) ) ) ) ) ) ) )r+   r   c                    |                     |          }|                     |          }|j        d         }| dd|f         | d|df         }}|dd|f         |d|df         }
}	t          j        ||z  t	          |          |z  z   |gd          }t          j        |	|z  t	          |	          |z  z   |
gd          }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    r/   .Nr0   )	unsqueezeshaper9   catr   )qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r*   apply_rotary_pos_embrN   C   s    ( --
&
&C
--
&
&C2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6Ei%#++e*<*<s*BCVLRTUUUGi%#++e*<*<s*BCVLRTUUUGGr+   c                   R    e Zd ZdZddedee         f fdZ eddd	          	 	 dd
e	j
        dee	j
        e	j
        f         dee	j
                 dee         dee	j                 dee         dee	j
        ee	j
                 eee	j
                          f         fd            Z xZS )Phi3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNr    	layer_idxc                    t                                                       || _        || _        t	          |d|j        |j        z            | _        |j        |j        z  | _	        |j        | _        | j        dz  | _
        |j        | _        d| _        |j        | j        z  d|j        | j        z  z  z   }t          j        |j        | j        z  |j        d          | _        t          j        |j        |d          | _        d S )Nhead_dimg      Tr   Fr   )r   r   r    rQ   getattrr"   num_attention_headsrS   num_key_value_headsnum_key_value_groupsscalingattention_dropout	is_causalr   r!   o_projqkv_proj)r(   r    rQ   op_sizer)   s       r*   r   zPhi3Attention.__init__f   s    "
F4F&Jd4dee$*$>&B\$\!#)#= }d*!'!9,t}<qFD^aeanDn?ooi :T] JFL^ejkkk	&"4gEJJJr+   past_key_valuepast_key_values4.58new_nameversionr,   position_embeddingsattention_maskcache_positionkwargsr-   c           
         |j         d d         }g |d| j        R }|                     |          }	| j        j        | j        z  }
|	dd |
f         }|	d|
|
| j        | j        z  z   f         }|	d|
| j        | j        z  z   d f         }|                    |                              dd          }|                    |                              dd          }|                    |                              dd          }|\  }}t          ||||          \  }}|&|||d}|	                    ||| j
        |          \  }}t          }| j        j        dk    rt          | j        j                 } || ||||f| j        sdn| j        | j        t#          | j        dd           d	|\  }} |j        g |dR                                  }|                     |          }||fS )
Nr/   .r   r   )rD   rC   rf   eagerg        sliding_window)dropoutrX   rj   )r?   rS   r\   r    rU   rV   view	transposerN   updaterQ   r   _attn_implementationr   trainingrY   rX   rT   reshape
contiguousr[   )r(   r,   rd   re   r_   rf   rg   input_shapehidden_shapeqkv	query_posquery_states
key_statesvalue_statesrC   rD   cache_kwargsattention_interfaceattn_outputattn_weightss                       r*   r5   zPhi3Attention.forwardu   s>    $)#2#.88b8$-88mmM**K3dmC	3

?+i)d6NQUQ^6^*^^^_
3	D,Dt},T T V VVW#((66@@AFF__\22<<QBB
#((66@@AFF&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7
%
  $}HCC$2HL"4;0@$GG
%
 
%
 
%
 
%
!\ *k);;;;;;FFHHkk+..L((r+   )N)NN)r6   r7   r8   __doc__r   r   intr   r   r9   Tensortupler   
LongTensorr   r
   r5   r;   r<   s   @r*   rP   rP   c   s3       GGK Kz Khsm K K K K K K _%0A6RRR ,0590) 0)|0) #5<#=>0) !.	0)
 "%0) !!120) -.0) 
u|Xel3XeEL>Q5RR	S0) 0) 0) SR0) 0) 0) 0) 0)r+   rP   c                   t    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 dd
ej        de	ej                 de	ej
                 de	e         de	e         de	ej
                 de	eej        ej        f                  dee         deej        e	eej        ej        f                  f         fd            Z xZS )Phi3DecoderLayerr    rQ   c                 &   t                                          ||           || _        t          ||          | _        t          |          | _        t          j        |j	                  | _
        t          j        |j	                  | _        d S )N)r    rQ   )r   r   r    rP   	self_attnr   mlpr   Dropoutresid_pdropresid_attn_dropoutresid_mlp_dropout)r(   r    rQ   r)   s      r*   r   zPhi3DecoderLayer.__init__   su    +++&f	JJJ6??"$*V-?"@"@!#F,>!?!?r+   r^   r_   r`   ra   NFr,   re   rE   	use_cacherf   rd   rg   r-   c                    |}	|                      |          } | j        d|||||||d|\  }}
|	|                     |          z   }|}	|                     |          }|                     |          }|	|                     |          z   }|S )N)r,   re   rE   r_   r   rf   rd    )input_layernormr   r   post_attention_layernormr   r   )r(   r,   re   rE   r_   r   rf   rd   rg   residualself_attn_weightss              r*   r5   zPhi3DecoderLayer.forward   s     !,,];;+94> 	,
')%+) 3	,
 	,
 	,
 	,
(( !4#:#:=#I#II 55mDD// 4#9#9-#H#HHr+   )NNNFNN)r6   r7   r8   r   r   r   r   r9   r   r   r   r   boolr   r   r
   r:   r5   r;   r<   s   @r*   r   r      sU       @z @c @ @ @ @ @ @ _%0A6RRR 2637+/$)59KO | !. u/0	
 "% D> !!12 &eEL%,,F&GH -. 
u (51BEDU1U+V"WW	X   SR    r+   r   c                       e Zd ZdZdS )Phi3PreTrainedModelz0.0.5N)r6   r7   r8   _versionr   r+   r*   r   r      s        HHHr+   r   c                   $    e Zd Z	 	 	 	 	 	 	 ddZdS )Phi3ForCausalLMNTc	                     |rD| j         j        r8|j        d         | j         j        dz   k    r|d         }
|
| j         j        k    rd }t	          j        | f||||||||d|	}|S )Nr   r   )	input_idsr_   re   inputs_embedsrf   rE   r   logits_to_keep)r    rope_scalingr?    original_max_position_embeddingsr	   prepare_inputs_for_generation)r(   r   r_   re   r   rf   rE   r   r   rg   past_lengthmodel_inputss               r*   r   z-Phi3ForCausalLM.prepare_inputs_for_generation   s    $ 	'(	' "dk&RUV&VVV(+KdkJJJ"&&D
+)')%)
 
 
 
 r+   )NNNNNTN)r6   r7   r8   r   r   r+   r*   r   r      s?         & & & & & &r+   r   c                       e Zd ZdS )Phi3ForSequenceClassificationNr6   r7   r8   r   r+   r*   r   r             Dr+   r   c                       e Zd ZdS )Phi3ForTokenClassificationNr   r   r+   r*   r   r     r   r+   r   )r   	Phi3Modelr   r   r   )Nr   )/r~   typingr   r   r9   r   activationsr   cache_utilsr   
generationr	   modeling_flash_attention_utilsr
   modeling_utilsr   processing_utilsr   utilsr   utils.deprecationr   mistral.modeling_mistralr   r   r   r   r   r   r   configuration_phi3r   
get_loggerr6   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCModuler   rN   rP   r   r   r   r   r   __all__r   r+   r*   <module>r      s      % % % % % % % %        ! ! ! ! ! !             ) ) ) ) ) ) B B B B B B 5 5 5 5 5 5 & & & & & &       0 0 0 0 0 0                  + * * * * * 
	H	%	%8 ) ) ) ) )bi ) ) )$   @C) C) C) C) C)BI C) C) C)L( ( ( ( (* ( ( (V    0   ' ' ' ' '( ' ' 'T	 	 	 	 	$D 	 	 		 	 	 	 	!> 	 	 	  r+   