
     `iǼ                        d Z ddlmZmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+  e&            rddl,m-Z- ddl.m/Z/  e'j0        e1          Z2 G d dej3                  Z4	 d4dej5        dej6        dej6        dej6        deej6                 de7de7fd Z8 G d! d"ej5                  Z9 G d# d$e          Z:e$ G d% d&e                      Z; G d' d(e;          Z<e$ G d) d*e;                      Z= G d+ d,e;e          Z> e$d-.           G d/ d0e;                      Z?e$ G d1 d2e;                      Z@g d3ZAdS )5zPyTorch OPT model.    )CallableOptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPastQuestionAnsweringModelOutput SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging)deprecate_kwarg   )	OPTConfig)	BlockMask)make_flex_block_causal_maskc                   j     e Zd ZdZdedef fdZ	 	 ddej        ded	eej                 f fd
Z	 xZ
S )OPTLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 j    d| _         t                                          || j         z   |           d S N   )offsetsuper__init__)selfr%   r&   	__class__s      x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/opt/modeling_opt.pyr,   z&OPTLearnedPositionalEmbedding.__init__8   s3     $+5}EEEEE    r   Nattention_maskpast_key_values_lengthposition_idsc                     |>t          j        |d          }||z  dz
                                  }|dd|df         }t                                          || j        z             S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr+   forwardr*   )r-   r1   r2   r3   r.   s       r/   r:   z%OPTLearnedPositionalEmbedding.forward>   sq      <A>>>L(>9A=CCEEL'+A+B+B(BCLww|dk9:::r0   )r   N)__name__
__module____qualname____doc__intr,   r7   
LongTensorr   r:   __classcell__r.   s   @r/   r$   r$   3   s         Fs F3 F F F F F F '(37	; ;(; !$; u/0	; ; ; ; ; ; ; ; ; ;r0   r$           modulequerykeyvaluer1   scalingdropoutc                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )N)r6   dtypeptrainingr   r)   )r7   matmul	transposer   
functionalsoftmaxfloat32torM   rI   rP   
contiguous)
rD   rE   rF   rG   r1   rH   rI   kwargsattn_weightsattn_outputs
             r/   eager_attention_forwardr[   P   s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r0   c                   *    e Zd ZdZ	 ddedee         f fdZ eddd	          	 	 	 	 	 dde	j
        dee         dee	j
                 dee	j
                 dedee	j
                 dee	j
        ee	j
                 ee         f         fd            Z xZS )OPTAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                 4   t                                                       || _        |j        | _        |j        | _        |j        | _        |j	        | _	        || _
        |(t                              d| j        j         d           | j        | j        z  | _        d| _        | j        | j        z  | j        k    r t#          d| j         d| j         d          | j        dz  | _        t'          j        | j        | j        | j	                  | _        t'          j        | j        | j        | j	                  | _        t'          j        | j        | j        | j	                  | _        t'          j        | j        | j        | j	                  | _        d S )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩bias)r+   r,   r^   hidden_size	embed_dimnum_attention_heads	num_headsattention_dropoutrI   enable_biasr_   loggerwarning_oncer.   r;   head_dim	is_causal
ValueErrorrH   r   Lineark_projv_projq_projout_proj)r-   r^   r_   rX   r.   s       r/   r,   zOPTAttention.__init__j   s    	+3/!-",!8 , , ,   $.8MDN*t~==8dn 8 8%)^8 8 8   }d*iTEUVVViTEUVVViTEUVVV	$.$.tGWXXXr0   past_key_valuepast_key_values4.58new_nameversionFhidden_statesr1   layer_head_maskoutput_attentionscache_positionreturnc                 v   |                                 \  }}	}
|                     |          | j        z  }|                    |d| j        | j                                      dd          }|                     |          }|                     |          }|                    |d| j        | j                                      dd          }|                    |d| j        | j                                      dd          }|"|	                    ||| j
        d|i          \  }}t          }| j        j        dk    rt          | j        j                 } || ||||f| j        sdn| j        dd	|\  }}|                    ||	d                                          }|                     |          }|sd}||fS )
z#Input shape: Batch x Time x ChannelrK   r   r)   Nr|   eagerrC         ?)rI   rH   )sizerq   rH   viewrf   rk   rR   ro   rp   updater_   r[   r^   _attn_implementationr   rP   rI   reshaperW   rr   )r-   ry   rt   r1   rz   r{   r|   rX   bsztgt_len_query_states
key_statesvalue_statesattention_interfacerZ   rY   s                    r/   r:   zOPTAttention.forward   s    (,,..Wa {{=11DL@#((b$.$-PPZZ[\^_``[[//
{{=11__S"dndmLLVVWXZ[\\
#((b$.$-PPZZ[\^_``&'6'='=L$.;K^:\( ($J )@;+w66"9$+:Z"[$7$7	%
  $}>CC$,	%
 	%
 	%
 	%
!\ "))#w;;FFHHmmK00  	 LL((r0   N)NNNFN)r;   r<   r=   r>   r    r   r?   r,   r   r7   Tensorr   booltupler:   rA   rB   s   @r/   r]   r]   g   s0       GG
 $(!Y !Y!Y C=!Y !Y !Y !Y !Y !YF _%0A6RRR ,01526"'156) 6)|6) "%6) !.	6)
 "%,/6)  6) !.6) 
u|Xel3Xe_D	E6) 6) 6) SR6) 6) 6) 6) 6)r0   r]   c                   z    e Zd Zddedee         f fdZ eddd          	 	 	 	 	 	 	 dd
ej	        deej	                 deej	                 dee
         dee         dee         deej                 deej	                 dee         deej        eeej        ej        f                  f         fd            Z xZS )OPTDecoderLayerNr^   r_   c                 *   t                                                       |j        | _        t	          ||          | _        |j        | _        |j        | _        t          |j	                 | _
        t          j        | j        |j                  | _        t          j        | j        |j        |j                  | _        t          j        |j        | j        |j                  | _        t          j        | j        |j                  | _        d S )N)r^   r_   elementwise_affinera   )r+   r,   rc   rd   r]   	self_attndo_layer_norm_beforerI   r   activation_functionactivation_fnr   	LayerNormlayer_norm_elementwise_affineself_attn_layer_normrn   ffn_dimrh   fc1fc2final_layer_norm)r-   r^   r_   r.   s      r/   r,   zOPTDecoderLayer.__init__   s    +%VyIII$*$?!~#F$>?$&LNv/S%
 %
 %
! 9T^V^&BTUUU9V^T^&BTUUU "T^PVPt u u ur0   rs   rt   ru   rv   Fry   r1   rz   r{   	use_cacher3   r|   rX   r}   c	                    |}
| j         r|                     |          } | j        d|||||||d|	\  }}t          j                            || j        | j                  }|
|z   }| j         s|                     |          }|j        }|                    d|	                    d                    }|}
| j         r| 
                    |          }|                     |          }|                     |          }|                     |          }t          j                            || j        | j                  }|
|z                       |          }| j         s| 
                    |          }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence..
        )ry   rt   r3   r1   rz   r{   r|   rN   rK    )r   r   r   r   rS   rI   rP   shaper   r   r   r   r   r   r   )r-   ry   r1   rz   rt   r{   r   r3   r|   rX   residualself_attn_weightshidden_states_shapeoutputss                 r/   r:   zOPTDecoderLayer.forward   s   > ! $ 	E 55mDDM ,:4> 	,
'+%)+/)	,
 	,
 	,
 	,
(( --mt|VZVc-dd =0 ( 	E 55mDDM ,1%--b-2D2DR2H2HII  $ 	A 11-@@M//**=99//--mt|VZVc-dd!M1778KLL ( 	A 11-@@M " 	,)++Gr0   r   )NNNFFNN)r;   r<   r=   r    r   r?   r,   r   r7   r   r   r   r@   r   r   r   FloatTensorr:   rA   rB   s   @r/   r   r      sv       v vy vXc] v v v v v v" _%0A6RRR 2626+/,1$)3715P P|P !.P "%,/	P
 "%P $D>P D>P u/0P !.P -.P 
u (51BEDU1U+V"WW	XP P P SRP P P P Pr0   r   c                   B    e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZdZd ZdS )OPTPreTrainedModelr^   modelTr   c                 v   | j         j        }t          |t          j                  rJ|j        j                            d|           |j         |j        j        	                                 d S d S t          |t          j
                  rU|j        j                            d|           |j        +|j        j        |j                 	                                 d S d S t          |t          j                  r?|j        j                            d           |j        j        	                                 d S d S )NrC   )meanstdr   )r^   init_std
isinstancer   rn   weightdatanormal_rb   zero_	Embeddingpadding_idxr   fill_)r-   rD   r   s      r/   _init_weightsz OPTPreTrainedModel._init_weights:  s'   k"fbi(( 
	%M&&CS&999{& &&((((( '&-- 	%M&&CS&999!-"6#56<<>>>>> .--- 	%M$$S)))K""$$$$$	% 	%r0   N)r;   r<   r=   r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   r   r0   r/   r   r   -  sc         &*#*+"&N!% % % % %r0   r   c                       e Zd ZdZdef fdZ	 ddeej        df         dej        dej        d	e	d
e
f
dZedej        dededej        dej        defd            Ze	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 d	ee	         deej                 dee
         d
ee
         dee
         dee
         deej                 deej                 dee         deeef         fd            Z xZS )
OPTDecoderz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    r^   c                 h   t                                                     j        | _        j        | _        j        | _        j        | _        j        | _        t          j
        j        j        | j                  | _        t          j        j                  | _        j        j        k    r't          j        j        j        d          | _        nd | _        j        j        k    r't          j        j        j        d          | _        nd | _        j        r-j        s&t          j        j        j                  | _        nd | _        t          j        fdt3          j                  D                       | _        d| _        |                                  d S )NFra   r   c                 2    g | ]}t          |           S ))r_   )r   ).0ir^   s     r/   
<listcomp>z'OPTDecoder.__init__.<locals>.<listcomp>p  s&    $s$s$sa_Vq%I%I%I$s$s$sr0   )r+   r,   rI   	layerdroppad_token_idr   max_position_embeddingsmax_target_positions
vocab_sizer   r   word_embed_proj_dimembed_tokensr$   rc   embed_positionsrn   project_out
project_inr   _remove_final_layer_normr   r   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointing	post_initr-   r^   r.   s    `r/   r,   zOPTDecoder.__init__Q  s      ~)!.$*$B! +L):F<VX\Xhii<V=[]c]opp%);;;!y);V=W^cdddD#D%);;; i(BFDV]bcccDOO"DO
 & 	)v/N 	)$&L"v7[% % %D!! %)D!m$s$s$s$sSXY_YqSrSr$s$s$stt&+#r0   Fr1   r!   input_tensorr|   rt   r{   c           	      $   | j         j        dk    r||dk                                    r|S d S | j         j        dk    r+t          |t          j                  rt          |          }|S ||                                nd}||j        nd}| j         j        dk    r#|s!|st          j
        |||| j                  rd S |j        }|j        d         }	|r|                                }
n/t          |t          j                  r|j        d	         n||	z   dz   }
|                     ||	|
|||j        d         
          }| j         j        dk    r@|>|j        j        dv r0|s.t	          j        |          j        }t          j        ||          }|S )Nflash_attention_2rC   flex_attentionr   Fsdpa)inputs_embedsr2   is_trainingr   rK   )sequence_lengthtarget_lengthrM   r|   
batch_size)cudaxpunpu)r^   r   anyr   r7   r   r"   get_seq_lengthis_compileabler   _ignore_causal_mask_sdparP   rM   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positiondevicetypefinfomin_unmask_unattended)r-   r1   r   r|   rt   r{   past_seen_tokensusing_compilable_cacherM   r   r   causal_mask	min_dtypes                r/   _update_causal_maskzOPTDecoder._update_causal_maskw  s    ;+/BBB)~/D.I.I.K.K)%%4;+/???.%,77 M!<^!L!L!!
 @O?Z?99;;;`aCRC^!?!?di ;+v55>T5]n5%>*'7 M	    t"&,Q/! 	+??AAMM nel;;<$R((%7!;  PP+')#)!, Q 
 
 K,66*%*.DDD% E E**.I0CKQZ[[Kr0   r   r   rM   r   c                    | |                                  dk    r| }nMt          j        |          j        }t          j        ||f|||j                  }|dk    rt          j        |d          }|t          j        ||j                  |                    dd          k    z  }|ddddddf         	                    |ddd          }| |
                                }| j        d         }	|ddddddd|	f         | ddddddf                             |j                  z   }
|
dk    }
|ddddddd|	f                             |
|          |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuerM   r   r   )diagonalr   rK   r   )r6   r7   r   r   fullr   triuaranger   expandcloner   rV   masked_fill)r1   r   r   rM   r|   r   rX   r   r   mask_lengthpadding_masks              r/   r   z@OPTDecoder._prepare_4d_causal_attention_mask_with_cache_position  s   > %.*<*<*>*>!*C*C(KKE**.I* -0Ye\j\q  K !###jqAAA5<n>STTTWeWmWmnprsWtWtttK%dD!!!QQQ&67>>z1bRTUUK))//11,226*111aaaL[L+@ANSTSTSTVZ\`bcbcbcScDdDgDg&E E    ,q05@AAAqqq,;,AV5W5c5c )6 6AAAqqq!!!\k\12 r0   N	input_ids	head_maskr   r   output_hidden_statesreturn_dictr3   rX   r}   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|	|	n| j         j        }	|du |duz  rt          d          | j        r%| j        r|rt          	                    d           d}|!|
                    d|j        d                   }||                     |          }|r|t          | j                   }||                                nd}|*t          j        |||j        d         z   |j        	          }|7||j        d         z   }t          j        |j        d         ||j        	          }|                     |||||          }|
>t          j        |d
          }
|
|z  dz
                                  }
|
dd|df         }
|                     |||
          }| j        |                     |          }||                    |j                  z   }|rdnd}|rdnd}t3          |gdg          D ]z\  }}|s|                                d         t7          | j                  k    rCt          d| dt7          | j                   d|                                d          d          {t;          | j                  D ]e\  }}|r||fz  }| j        r t          j        g           }|| j        k     r4 ||f||
|||         nd||||d|}|d         }|r||d         fz  }f| j         |                      |          }| j!        | !                    |          }|r||fz  }tE          ||||          S )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
                config.n_positions - 1]`. for padding use -1.

                [What are position IDs?](../glossary#position-ids)
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
                the complete sequence length.
        Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FrK   )r^   r   r   r   r5   )r3   r   r  zThe `z` should be specified for z layers, but it is for .)r1   r3   rz   rt   r{   r   r|   last_hidden_statert   ry   
attentions)#r^   r{   r  r   use_return_dictrm   r   rP   ri   rj   r   r   r   r   r   r7   r   r   onesr   r8   r9   r   r   rV   zipr   lenr   	enumeraterandr   r   r   r   )r-   r  r1   r  rt   r   r   r{   r  r	  r3   r|   rX   r   
seq_lengthr   
pos_embedsry   all_hidden_statesall_self_attns	attn_mask	mask_nameidxdecoder_layerdropout_probabilitylayer_outputss                             r/   r:   zOPTDecoder.forward  sr   N 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]-t";< 	[YZZZ& 	4= 	Y 	j   I !r9?2+>??I  --i88M 	?0*$+>>>O?N?Z?99;;;`a!"\ "2]5H5K"KTaTh  N !)M,?,BBJ"Z(;A(>
S`SghhhN..M>?L]
 

  <A>>>L(>9A=CCEEL'+;+<+<(<=L)).:JYe)ff
?& OOM::M%
m6J(K(KK #7@BBD0:d %(k]$C$C 	 	 Iy$>>##A&3t{+;+;<<$3	 3 3SEUEU 3 3%NN,,Q/3 3 3  
 #,DK"8"8 	6 	6C# 6!m%55!} &+jnn#&77)M
*)3<3H3d /"3#-
 
 
 
M *!,M  6=#3"55 , 11-@@M' ,,];;M   	2-!11&+++%	
 
 
 	
r0   )FNNNNNNNNNNN)r;   r<   r=   r>   r    r,   r   r7   r   r   r   r   staticmethodr?   rM   r   r   r   r@   r   r   r   r   r   r:   rA   rB   s   @r/   r   r   I  sH        #y # # # # # #X #(B BelK78B lB 	B
 B  B B B BH 444 4 {	4
 4 4 4 4 \4l  1515,0+/59$(,0/3&*3715t
 t
E,-t
 !.t
 EL)	t

 "%t
   12t
 D>t
 $D>t
 'tnt
 d^t
 u/0t
 !.t
 -.t
 
u--	.t
 t
 t
 t
 t
 t
 t
 t
r0   r   c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	e
j                 de	e
j                 de	e
j                 d	e	e         d
e	e
j                 de	e         de	e         de	e         de	e         de	e
j                 de	e
j                 dee         deeef         fd                        Z xZS )OPTModelr^   c                     t                                          |           t          |          | _        |                                  d S r   )r+   r,   r   decoderr   r   s     r/   r,   zOPTModel.__init__  s@       !&))r0   c                     | j         j        S r   r$  r   r-   s    r/   get_input_embeddingszOPTModel.get_input_embeddings  s    |((r0   c                     || j         _        d S r   r&  r-   rG   s     r/   set_input_embeddingszOPTModel.set_input_embeddings  s    $)!!!r0   Nr  r1   r  rt   r   r   r{   r  r	  r3   r|   rX   r}   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|	|	n| j         j        }	 | j        d|||
||||||d|d|}t          |j        |j        |j	        |j
                  S )NTr  r1   r3   r  rt   r   r   r{   r  r	  r|   r  r   )r^   r{   r  r   r  r$  r   r  rt   ry   r  )r-   r  r1   r  rt   r   r   r{   r  r	  r3   r|   rX   decoder_outputss                 r/   r:   zOPTModel.forward  s    " 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B] '$, 
)%+'/!5)
 
 
 
 '-?+;)7&1	
 
 
 	
r0   r  )r;   r<   r=   r    r,   r(  r+  r   r   r   r7   r@   r   r   r   r   r   r   r   r   r   r:   rA   rB   s   @r/   r"  r"    s       y      ) ) )* * *  1515,0+/59$(,0/3&*3715+
 +
E,-+
 !.+
 EL)	+

 "%+
   12+
 D>+
 $D>+
 'tn+
 d^+
 u/0+
 !.+
 -.+
 
u--	.+
 +
 +
 ^ +
 +
 +
 +
 +
r0   r"  c            !           e Zd ZdgZ fdZd Zd Zd Zd Ze	e
	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 dee         deej                 deej                 dee         dee         dee         dee         deej                 deej                 dee         deeef         fd                        Z xZS )OPTForCausalLMzlm_head.weightc                     t                                          |           t          |          | _        t	          j        |j        |j        d          | _        | 	                                 d S NFra   )
r+   r,   r"  r   r   rn   r   r   lm_headr   r   s     r/   r,   zOPTForCausalLM.__init__  sc       f%%
 y!;V=NUZ[[[ 	r0   c                 $    | j         j        j        S r   r   r$  r   r'  s    r/   r(  z#OPTForCausalLM.get_input_embeddings      z!..r0   c                 (    || j         j        _        d S r   r5  r*  s     r/   r+  z#OPTForCausalLM.set_input_embeddings      */
'''r0   c                     || j         _        d S r   r   r$  )r-   r$  s     r/   set_decoderzOPTForCausalLM.set_decoder  s    $
r0   c                     | j         j        S r   r:  r'  s    r/   get_decoderzOPTForCausalLM.get_decoder  s    z!!r0   Nr  r1   r  rt   r   labelsr   r{   r  r	  r3   r|   rX   r}   c                    ||n| j         j        }|	|	n| j         j        }	|
|
n| j         j        }
 | j        j        d|||||||||	d|d|}|                     |d                                                   }d}|5|                    |j	                  } | j
        ||fd| j         j        i|}t          |||j        |j        |j                  S )an  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
        ```NTr-  r   r   losslogitsrt   ry   r  r   )r^   r{   r  r  r   r$  r3  rW   rV   r   loss_functionr   r   rt   ry   r  )r-   r  r1   r  rt   r   r>  r   r{   r  r	  r3   r|   rX   r   rB  rA  s                    r/   r:   zOPTForCausalLM.forward  sB   R 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] %$*$ 
)%+'/!5)
 
 
 
 gaj))4466YYv}--F%4%   ;1 	 D &#3!/)
 
 
 	
r0   NNNNNNNNNNNN)r;   r<   r=   _tied_weights_keysr,   r(  r+  r;  r=  r   r   r   r7   r@   r   r   r   r   r   r   r   r   r   r:   rA   rB   s   @r/   r0  r0    s       *+    / / /0 0 0% % %" " "  1515,0+/59-1$(,0/3&*3715P
 P
E,-P
 !.P
 EL)	P

 "%P
   12P
 )*P
 D>P
 $D>P
 'tnP
 d^P
 u/0P
 !.P
 +,P
 
u,,	-P
 P
 P
 ^ P
 P
 P
 P
 P
r0   r0  a  
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                   `    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej	                 dee
         deej	                 d	eej                 d
ee         dee         dee         dee         deej                 deeef         fd            Zd Zd Z xZS )OPTForSequenceClassificationr^   c                     t                                          |           |j        | _        t          |          | _        t          j        |j        | j        d          | _        | 	                                 d S r2  )
r+   r,   
num_labelsr"  r   r   rn   r   scorer   r   s     r/   r,   z%OPTForSequenceClassification.__init__f  sj        +f%%
Yv94?QVWWW
 	r0   Nr  r1   r  rt   r   r>  r   r{   r  r	  r3   r}   c                    |
|
n| j         j        }
|                     |||||||||	|

  
        }|d         }|                     |          }||j        dd         \  }}n|j        dd         \  }}| j         j        |dk    rt          d          | j         j        d}n|}|| j         j        k                        |j        t          j
                  }t          j        |j        d         |j        t          j
                  }||z                      d          }n)d}t                              | j        j         d	           |t          j        ||j        
          |f         }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        dk    rWt-                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt1                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt5                      } |||          }|
s|f|dd         z   }||f|z   n|S t7          |||j        |j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N	rt   r1   r3   r  r   r   r{   r  r	  r   r)   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rK   )r   rM   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr@  )r^   r  r   rK  r   r   rm   rV   r   r7   int32r   argmaxri   rj   r.   r;   problem_typerJ  rM   r9   r?   r	   squeezer   r   r   r   rt   ry   r  )r-   r  r1   r  rt   r   r>  r   r{   r  r	  r3   transformer_outputsry   rB  r   r   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsrA  loss_fctoutputs                           r/   r:   z$OPTForSequenceClassification.forwardo  sL   * &1%<kk$+B]"jj+)%'/!5# ) 
 
 ,A.M** *3/"1"*='J*7*=bqb*A'J;#+
a\]]];#+!#"%)AAEEfmUZU`aaL!L)<V]Z_ZefffM"/,">!F!Fr!J!J!#>* Z Z Z  
 u|Jv}MMMOaab{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8M$9$9$;$;V^^=M=MNNDD#8M6::DD)-JJJ+--x 2 22t G GUWYY)-III,..xv66 	F#%(;ABB(??F)-)9TGf$$vE/ /?-;*5
 
 
 	
r0   c                 $    | j         j        j        S r   r5  r'  s    r/   r(  z1OPTForSequenceClassification.get_input_embeddings  r6  r0   c                 (    || j         j        _        d S r   r5  r*  s     r/   r+  z1OPTForSequenceClassification.set_input_embeddings  r8  r0   r  )r;   r<   r=   r    r,   r   r   r7   r@   r   r   r   r   r   r   r:   r(  r+  rA   rB   s   @r/   rH  rH  W  s       y        156:15+/59-1$(,0/3&*37\
 \
E,-\
 !!23\
 E-.	\

 "%\
   12\
 )*\
 D>\
 $D>\
 'tn\
 d^\
 u/0\
 
u66	7\
 \
 \
 ^\
|/ / /0 0 0 0 0 0 0r0   rH  c                   |    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej	                 dee
         deej	                 d	eej                 d
eej                 dee         dee         dee         dee         deej                 deeef         fd            Zd Zd Z xZS )OPTForQuestionAnsweringr^   c                     t                                          |           t          |          | _        t	          j        |j        d          | _        |                                  d S r(   )	r+   r,   r"  r   r   rn   r   
qa_outputsr   r   s     r/   r,   z OPTForQuestionAnswering.__init__  sX       f%%
)F$>BB 	r0   Nr  r1   r  rt   r   start_positionsend_positionsr   r{   r  r	  r3   r}   c                    ||n| j         j        }|                     ||||||||	|
|
  
        }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          
                    |j                  }|	                    d|          
                    |j                  }t          |          } |||          } |||          }||z   dz  }|s||f|dd         z   }||f|z   n|S t          ||||j        |j        	          S )
a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
        >>> import torch

        >>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

        >>> inputs = tokenizer(question, text, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> answer_offset = len(tokenizer(question)[0])

        >>> predict_answer_tokens = inputs.input_ids[
        ...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
        ... ]
        >>> predicted = tokenizer.decode(predict_answer_tokens)
        >>> predicted
        ' a nice puppet'
        ```NrM  r   r   rK   r5   )ignore_indexr)   )rA  start_logits
end_logitsry   r  )r^   r  r   ra  splitrT  rW   r  r   clamprV   r   r   r   ry   r  )r-   r  r1   r  rt   r   rb  rc  r   r{   r  r	  r3   rU  ry   rB  rf  rg  
total_lossignored_indexrZ  
start_lossend_lossr[  s                           r/   r:   zOPTForQuestionAnswering.forward  s;   ` &1%<kk$+B]"jj+)%'/!5# ) 
 
 ,A.//#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEHHWWO)//=AADDV]SSM']CCCH!,@@Jx
M::H$x/14J 	R"J/2Eabb2IIF/9/EZMF**6Q+%!-;*5
 
 
 	
r0   c                 $    | j         j        j        S r   r5  r'  s    r/   r(  z,OPTForQuestionAnswering.get_input_embeddingsA  r6  r0   c                 (    || j         j        _        d S r   r5  r*  s     r/   r+  z,OPTForQuestionAnswering.set_input_embeddingsD  r8  r0   rD  )r;   r<   r=   r    r,   r   r   r7   r@   r   r   r   r   r   r   r:   r(  r+  rA   rB   s   @r/   r_  r_    s       y        156:15+/596:48$(,0/3&*37_
 _
E,-_
 !!23_
 E-.	_

 "%_
   12_
 "%"23_
   01_
 D>_
 $D>_
 'tn_
 d^_
 u/0_
 
u22	3_
 _
 _
 ^_
B/ / /0 0 0 0 0 0 0r0   r_  )r0  r"  r   rH  r_  )rC   )Br>   typingr   r   r   r7   r   torch.nnr   r   r	   activationsr   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr   configuration_optr    !torch.nn.attention.flex_attentionr!   integrations.flex_attentionr"   
get_loggerr;   ri   r   r$   Moduler   floatr[   r]   r   r   r   r"  r0  rH  r_  __all__r   r0   r/   <module>r     sd     , , , , , , , , , ,        A A A A A A A A A A ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) > > > > > > B B B B B B 9 9 9 9 9 9            G F F F F F F F & & & & & & p p p p p p p p p p p p p p 0 0 0 0 0 0 ( ( ( ( ( (  !! K;;;;;;JJJJJJ 
	H	%	%; ; ; ; ;BL ; ; ;H % %I%<% 
% <	%
 U\*% % % % % %.]) ]) ]) ]) ])29 ]) ]) ])@c c c c c0 c c cL % % % % % % % %6_
 _
 _
 _
 _
# _
 _
 _
D :
 :
 :
 :
 :
! :
 :
 :
zk
 k
 k
 k
 k
' k
 k
 k
\   m0 m0 m0 m0 m0#5 m0 m0 m0` o0 o0 o0 o0 o00 o0 o0 o0d  r0   