
     `i                     n   d dl Z d dlmZmZmZ d dlZd dlmZ d dlmZm	Z	m
Z
 ddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+  e&            rddl,m-Z-m.Z.  e'j/        e0          Z1 G d dej2                  Z3 G d dej2                  Z4	 	 	 d3dej5        dej6        dej6        dej6        deej6                 dee7         de7deej6                 fd Z8 G d! d"ej5                  Z9 G d# d$e          Z:e% G d% d&e                       Z;e% G d' d(e;                      Z< e%d)*           G d+ d,e;e                      Z=e% G d- d.e;                      Z> e%d/*           G d0 d1e;                      Z?g d2Z@dS )4    N)CallableOptionalUnion)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPastTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_torch_flex_attn_availablelogging)deprecate_kwarg   )BioGptConfig)	BlockMaskmake_flex_block_causal_maskc                   j     e Zd ZdZdedef fdZ	 	 ddej        ded	eej                 f fd
Z	 xZ
S ) BioGptLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 j    d| _         t                                          || j         z   |           d S )N   )offsetsuper__init__)selfr$   r%   	__class__s      ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/biogpt/modeling_biogpt.pyr*   z)BioGptLearnedPositionalEmbedding.__init__<   s3     $+5}EEEEE    r   Nattention_maskpast_key_values_lengthposition_idsc                     |>t          j        |d          }||z  dz
                                  }|dd|df         }t                                          || j        z             S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr)   forwardr(   )r+   r/   r0   r1   r,   s       r-   r8   z(BioGptLearnedPositionalEmbedding.forwardB   sq      <A>>>L(>9A=CCEEL'+A+B+B(BCLww|dk9:::r.   )r   N)__name__
__module____qualname____doc__intr*   r5   
LongTensorr   r8   __classcell__r,   s   @r-   r#   r#   7   s         Fs F3 F F F F F F '(37	; ;(; !$; u/0	; ; ; ; ; ; ; ; ; ;r.   r#   c            
       \     e Zd ZdZd
dedededee         f fdZdej	        f fd	Z
 xZS )BioGptScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?r$   r%   padding_idxembed_scalec                 \    t                                          |||           || _        d S N)r)   r*   rE   )r+   r$   r%   rD   rE   r,   s        r-   r*   z"BioGptScaledWordEmbedding.__init__X   s-    DDD&r.   	input_idsc                 V    t                                          |          | j        z  S rG   )r)   r8   rE   )r+   rH   r,   s     r-   r8   z!BioGptScaledWordEmbedding.forward\   s!    wwy))D,<<<r.   )rC   )r9   r:   r;   r<   r=   r   floatr*   r5   Tensorr8   r?   r@   s   @r-   rB   rB   S   s         ' 's '3 'S '_ghm_n ' ' ' ' ' '= = = = = = = = = = =r.   rB           modulequerykeyvaluer/   scalingdropout	head_maskc                    ||                     d          dz  }t          j        ||                    dd                    |z  }	||	|z   }	t          j                            |	d          }	||	|                    dddd          z  }	t          j                            |	|| j	                  }	t          j        |	|          }
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: Optional[float] = None,
    dropout: float = 0.0,
    head_mask: Optional[torch.Tensor] = None,
    **kwargs,
):
    """Reference (pure-PyTorch) scaled dot-product attention.

    `query`/`key`/`value` are (bsz, num_heads, seq, head_dim); the output is
    transposed back to (bsz, seq, num_heads, head_dim) so callers can reshape
    to (bsz, seq, embed_dim). Returns `(attn_output, attn_weights)`.
    """
    if scaling is None:
        # Default to 1/sqrt(head_dim).
        scaling = query.size(-1) ** -0.5

    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling

    if attention_mask is not None:
        # Additive mask: large negatives suppress disallowed positions.
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)

    if head_mask is not None:
        # Per-head multiplicative mask, applied after the softmax.
        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
class BioGptAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[BioGptConfig] = None,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # When `key_value_states` is given, this layer runs as cross-attention.
        is_cross_attention = key_value_states is not None

        bsz, tgt_len = hidden_states.shape[:-1]
        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len
        q_input_shape = (bsz, tgt_len, -1, self.head_dim)
        kv_input_shape = (bsz, src_len, -1, self.head_dim)

        # (bsz, num_heads, tgt_len, head_dim)
        query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2)

        is_updated = False
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # After the first pass the cross-attention K/V are static.
                    curr_past_key_value = past_key_values.cross_attention_cache
                else:
                    curr_past_key_value = past_key_values.self_attention_cache
            else:
                curr_past_key_value = past_key_values

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # Reuse the cached cross-attention keys/values.
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(*kv_input_shape).transpose(1, 2)
            value_states = value_states.view(*kv_input_shape).transpose(1, 2)

            if past_key_values is not None:
                # Cross-attention entries are not rolled along `cache_position`.
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            output_attentions=output_attentions,
            head_mask=layer_head_mask,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
         dee         dee         deej                 deej	                 dee         deej        eeej        ej        f                  f         fd            Z xZS )BioGptDecoderLayerNrm   rn   c           	         t                                                       |j        | _        t	          | j        |j        |j        dd||          | _        |j        | _	        t          |j                 | _        |j        | _        t          j        | j                  | _        t          j        | j        |j                  | _        t          j        |j        | j                  | _        t          j        | j                  | _        d S )NT)rh   ri   rR   rj   rl   rm   rn   )r)   r*   hidden_sizerh   rg   num_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrR   r
   
hidden_actactivation_fnactivation_dropoutr]   	LayerNormself_attn_layer_normru   intermediate_sizefc1fc2final_layer_norm)r+   rm   rn   r,   s      r-   r*   zBioGptDecoderLayer.__init__   s    +(n07
 
 
 1#F$56"(";$&L$@$@!9T^V-EFF9V5t~FF "T^ < <r.   rz   r{   r|   r}   FTr   r/   r   r   	use_cacher1   r   rb   r   c	                 B   |}
|                      |          } | j        d|||||||d|	\  }}t          j                            || j        | j                  }|
|z   }|}
|                     |          }|                     |          }|                     |          }t          j                            || j	        | j                  }| 
                    |          }t          j                            || j        | j                  }|
|z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        )r   r{   r/   r   r   r1   r   rW    )r   r   r]   r^   rR   rY   r   r   r   r   r   )r+   r   r/   r   r{   r   r   r1   r   rb   residualself_attn_weightsoutputss                r-   r8   zBioGptDecoderLayer.forward  sO   > !11-@@ ,:4> 	,
'+)+/%)	,
 	,
 	,
 	,
(( --mt|VZVc-dd =0 !--m<<//**=99--mt?Vaean-oo//--mt|VZVc-dd =0 " 	,)++Gr.   rG   )NNNFTNN)r9   r:   r;   r   r   r=   r*   r   r5   rK   r   r   r>   r   r   r   FloatTensorr8   r?   r@   s   @r-   r   r      sZ       = =| = = = = = = =. _%0A6RRR 2626+/,1$(3715? ?|? !.? "%,/	?
 "%? $D>? D>? u/0? !.? +,? 
u (51BEDU1U+V"WW	X? ? ? SR? ? ? ? ?r.   r   c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
@auto_docstring
class BioGptPreTrainedModel(PreTrainedModel):
    config: BioGptConfig
    base_model_prefix = "biogpt"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True

    def _update_causal_mask(
        self,
        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
    ):
        # Flex attention consumes a BlockMask instead of a dense 4D mask.
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            elif attention_mask is None:
                attention_mask = make_flex_block_causal_mask(
                    torch.ones((input_tensor.shape[0], input_tensor.shape[1]), device=input_tensor.device)
                )
            return attention_mask

        # Flash attention takes the 2D padding mask directly (or no mask at all
        # when nothing is padded).
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # SDPA can rely on its built-in `is_causal` path in many cases, in which
        # case building a dense mask is wasted work.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Fully-masked rows produce NaNs with SDPA's memory-efficient kernels;
            # unmask them so those rows attend to everything instead.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # Caller already supplied a 4D mask — trust it as-is.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            # Mask out cache slots beyond each query's absolute position.
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask
@auto_docstring
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.config = config
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        # Word embeddings are optionally scaled by sqrt(d_model).
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = BioGptScaledWordEmbedding(
            config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList([BioGptDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            pass
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.shape
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
            )
            use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if use_cache and isinstance(past_key_values, tuple):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `DynamicCache` instead, e.g. "
                "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)

        batch_size, seq_length = inputs_embeds.size()[:-1]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None:
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_key_values)

        # Position ids are recovered from the padding mask when not given.
        if position_ids is None:
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            # cut positions if `past_key_values_length` is > 0
            position_ids = position_ids[:, past_key_values_length:]

        positions = self.embed_positions(attention_mask, past_key_values_length, position_ids=position_ids)

        hidden_states = inputs_embeds + positions
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            # LayerDrop: randomly skip whole layers during training.
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                position_ids=position_ids,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                        e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j                 d	ee	j                 d
ee         dee	j
                 dee         dee	j
                 dee         dee         dee         dee	j                 dee         deeef         fd            Z xZS )BioGptForCausalLMzoutput_projection.weightc                     t                                          |           t          |          | _        t	          j        |j        |j        d          | _        | 	                                 d S NFrp   )
r)   r*   r   r   r]   ru   r   r   output_projectionr  r+   rm   r,   s     r-   r*   zBioGptForCausalLM.__init__  sb       !&))!#6+=v?PW\!]!]!] 	r.   c                     | j         S rG   r'  r+   s    r-   get_output_embeddingsz'BioGptForCausalLM.get_output_embeddings  s    %%r.   c                     || _         d S rG   r*  )r+   new_embeddingss     r-   set_output_embeddingsz'BioGptForCausalLM.set_output_embeddings  s    !/r.   NrH   r/   rS   r   r{   labelsr   r1   r   r  r	  r   rb   r   c                 R   ||n| j         j        } | j        |f|||||||	|
||d
|}|d         }|                     |          }d}| | j        ||fd| j         j        i|}|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j	        |j
                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)
r/   rS   r   r{   r   r1   r   r  r	  r   r   r   r   )losslogitsr{   r   r  r  )rm   r  r   r'  loss_functionr   r   r{   r   r  r  )r+   rH   r/   rS   r   r{   r0  r   r1   r   r  r	  r   rb   r   sequence_outputprediction_scoreslm_lossoutputs                      r-   r8   zBioGptForCausalLM.forward  s-   . &1%<kk$+B]$+
)'+%/!5#)
 
 
 
 "!* 22?CC(d(!   ;1 	 G  	L')GABBK7F,3,?WJ''VK0$#3!/)$5
 
 
 	
r.   )NNNNNNNNNNNN)r9   r:   r;   _tied_weights_keysr*   r,  r/  r   r   r5   r>   r   r   r   rK   r   r   r   r   r   r8   r?   r@   s   @r-   r$  r$    s        55    & & &0 0 0  156:1559+/-1$(37,0/3&*15>
 >
E,->
 !!23>
 E-.	>

   12>
 "%>
 )*>
 D>>
 u/0>
 $D>>
 'tn>
 d^>
 !.>
 +,>
 
u77	8>
 >
 >
 ^>
 >
 >
 >
 >
r.   r$  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 dee	         deej                 d	eej                 d
ee
         deej                 dee
         dee
         dee
         deej                 deeef         fd            Z xZS )BioGptForTokenClassificationc                 x   t                                          |           |j        | _        t          |          | _        t          |d          r|j        |j        }n|j        }t          j	        |          | _
        t          j        |j        |j                  | _        |                                  d S )Nclassifier_dropout)r)   r*   
num_labelsr   r   hasattrr=  r   r]   DropoutrR   ru   r   
classifierr  )r+   rm   r=  r,   s      r-   r*   z%BioGptForTokenClassification.__init__  s        +!&))6/00 	<V5N5Z!'!:!'!;z"455)F$68IJJr.   NrH   token_type_idsr/   rS   r{   r   r0  r   r1   r   r  r	  r   r   c                    ||n| j         j        }|                     |||||||	|
|||          }|d         }|                     |          }|                     |          }d}|t                      }||                    d          dk    }|                    d| j                  }t          j	        ||                    d          t          j
        |j                                      |                    } |||          }n8 ||                    d| j                  |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N
r{   r/   rS   r   r   r1   r   r  r	  r   r   rU   r   r'   )r2  r3  r   r  )rm   r  r   rR   rA  r   r`   r>  r5   wheretensorignore_indextype_asr   r   r  )r+   rH   rB  r/   rS   r{   r   r0  r   r1   r   r  r	  r   transformer_outputsr   r3  r2  loss_fctactive_lossactive_logitsactive_labelsr8  s                          r-   r8   z$BioGptForTokenClassification.forward  s   . &1%<kk$+B]"kk+)'%/!5#) * 
 
 ,A.]33//'))H),11"55: &B @ @ %R%,x?T2U2U2]2]^d2e2e! !  x}==xB @ @&++b//RR 	FY!4QRR!88F)-)9TGf$$vE$-;*5	
 
 
 	
r.   )NNNNNNNNNNNNN)r9   r:   r;   r*   r   r   r5   r>   r   r   r   rK   r   r   r   r8   r?   r@   s   @r-   r;  r;    s             15596:15+/59-1$(37,0/3&*15A
 A
E,-A
 !!12A
 !!23	A

 E-.A
 "%A
   12A
 )*A
 D>A
 u/0A
 $D>A
 'tnA
 d^A
 !.A
 
u++	,A
 A
 A
 ^A
 A
 A
 A
 A
r.   r;  a  
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                        e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej	                 dee
         d	eej	                 d
eej                 dee         deej                 dee         dee         dee         deej                 deeej        f         deeef         fd            Zd Zd Z xZS )BioGptForSequenceClassificationrm   c                     t                                          |           |j        | _        t          |          | _        t          j        |j        | j        d          | _        | 	                                 d S r&  )
r)   r*   r>  r   r   r]   ru   r   scorer  r(  s     r-   r*   z(BioGptForSequenceClassification.__init__R  si        +!&))Yv14?OOO
 	r.   Nr   rH   r/   rS   r{   r   r0  r   r1   r   r  r	  r   logits_to_keepr   c                    ||n| j         j        }|                     ||||||||	|
||          }|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }||j        dd         \  }}n|j        dd         \  }}| j         j        d}ny|Nt          j
        || j         j                                      d          dz
                      |j                  }n)d}t                              | j        j         d           |t          j        ||j                  |f         }d}|Z| j         j        f| j        dk    rd	| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j        k    rd
| j         _        nd| j         _        | j         j        d	k    rWt/                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        d
k    rGt3                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt7                      } |||          }|s|f|dd         z   }||f|z   n|S t9          |||j        |j        |j                  S )rD  NrE  r   r'   rU   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classification)r2  r3  r{   r   r  ) rm   r  r   r   r=   slicerR  r   r   r5   nesumr   r   rs   rt   r,   r9   r   problem_typer>  r   r7   r   squeezer   r`   r   r   r{   r   r  )r+   rH   r/   rS   r{   r   r0  r   r1   r   r  r	  r   rS  rJ  r   slice_indicesr3  r   r   pooled_logitsr2  rK  r8  s                           r-   r8   z'BioGptForSequenceClassification.forward[  s?   . &1%<kk$+B]"kk+)'%/!5#) * 
 
 ,A.8B>SV8W8Wk~ot444]kM!!!]AAA*=>?? *3/"1"*='J*7*=bqb*A'J;#+ OO$#(8It{7O#P#P#T#TUW#X#X[\#\"`"`agan"o"o"$##~. ^ ^ ^  
 u|Jv}MMM^_{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8M$9$9$;$;V^^=M=MNNDD#8M6::DD)-JJJ+--x 2 22t G GUWYY)-III,..xv66 	F#%(;ABB(??F)-)9TGf$$vE/ /?-;*5
 
 
 	
r.   c                     | j         j        S rG   r   r   r+  s    r-   get_input_embeddingsz4BioGptForSequenceClassification.get_input_embeddings  s    {''r.   c                     || j         _        d S rG   r`  )r+   rP   s     r-   set_input_embeddingsz4BioGptForSequenceClassification.set_input_embeddings  s    #(   r.   )NNNNNNNNNNNNr   )r9   r:   r;   r   r*   r   r   r5   r>   r   r   r   rK   r   r=   r   r   r8   ra  rc  r?   r@   s   @r-   rP  rP  C  s       |        156:15+/59-1$(37,0/3&*1534\
 \
E,-\
 !!23\
 E-.	\

 "%\
   12\
 )*\
 D>\
 u/0\
 $D>\
 'tn\
 d^\
 !.\
 c5</0\
 
u66	7\
 \
 \
 ^\
|( ( () ) ) ) ) ) )r.   rP  )r$  r;  rP  r   r   )NrL   N)Ar   typingr   r   r   r5   torch.nnr]   r   r   r   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   configuration_biogptr   integrations.flex_attentionr    r!   
get_loggerr9   rs   	Embeddingr#   rB   ModulerK   rJ   re   rg   r   r   r   r$  r;  rP  __all__r   r.   r-   <module>rw     s  ,  , , , , , , , , , ,        A A A A A A A A A A ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) > > > > > > B B B B B B 9 9 9 9 9 9            G F F F F F F F & & & & & & ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 0 0 0 0 0 0 . . . . . .  !! VUUUUUUUU 
	H	%	%; ; ; ; ;r| ; ; ;8
= 
= 
= 
= 
= 
= 
= 
=&  $(,% %I%<% 
% <	%
 U\*% e_% % %% % % %<}) }) }) }) })bi }) }) })@X X X X X3 X X Xv M M M M MO M M M` g
 g
 g
 g
 g
' g
 g
 g
T   
Q
 Q
 Q
 Q
 Q
- Q
 Q
 
Q
h Q
 Q
 Q
 Q
 Q
#8 Q
 Q
 Q
h   m) m) m) m) m)&; m) m) m)`  r.   