"""PyTorch BioGPT model."""

import math
from typing import Optional, Union

import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, is_torch_flex_attn_available, logger
from ...utils.deprecation import deprecate_kwarg
from ..bart.modeling_bart import BartAttention, BartDecoderLayer, BartScaledWordEmbedding
from ..opt.modeling_opt import OPTLearnedPositionalEmbedding
from .configuration_biogpt import BioGptConfig


if is_torch_flex_attn_available():
    from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask


class BioGptLearnedPositionalEmbedding(OPTLearnedPositionalEmbedding):
    def forward(
        self,
        attention_mask: torch.LongTensor,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        return super().forward(attention_mask, past_key_values_length, position_ids)


class BioGptScaledWordEmbedding(BartScaledWordEmbedding):
    pass


class BioGptAttention(BartAttention):
    pass


class BioGptDecoderLayer(BartDecoderLayer):
    def __init__(self, config: BioGptConfig, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        self.embed_dim = config.hidden_size
        self.self_attn = BioGptAttention(
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            is_decoder=True,
            is_causal=True,
            config=config,
            layer_idx=layer_idx,
        )
        self.dropout = config.hidden_dropout_prob
        self.activation_fn = ACT2FN[config.hidden_act]

        self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim)

        del self.encoder_attn
        del self.encoder_attn_layer_norm

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        """
        residual = hidden_states

        hidden_states = self.self_attn_layer_norm(hidden_states)

        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class BioGptPreTrainedModel(PreTrainedModel):
    config: BioGptConfig
    base_model_prefix = "biogpt"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True

    def _update_causal_mask(
        self,
        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
    ):
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            elif attention_mask is None:
                attention_mask = make_flex_block_causal_mask(
                    torch.ones(
                        size=(input_tensor.shape[0], input_tensor.shape[1]),
                        device=input_tensor.device,
                    )
                )
            return attention_mask

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When possible, rely on SDPA's `is_causal` argument instead of an explicit mask, in order to
        # dispatch on the memory-efficient paths. This is not compatible with compilable (static) caches.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, we generate a causal 4D mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows
            # when using left padding. This is required by F.scaled_dot_product_attention memory-efficient
            # attention path. Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@auto_docstring
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.config = config
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = BioGptScaledWordEmbedding(
            config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList([BioGptDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # initialize `past_key_values`
        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if use_cache and isinstance(past_key_values, tuple):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `DynamicCache` instead, e.g. "
                "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)

        batch_size, seq_length = inputs_embeds.size()[:-1]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None:
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_key_values)

        # embed positions
        if position_ids is None:
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            # cut positions if `past_key_values_length` is > 0
            position_ids = position_ids[:, past_key_values_length:]

        positions = self.embed_positions(attention_mask, past_key_values_length, position_ids=position_ids)

        hidden_states = inputs_embeds + positions

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                position_ids=position_ids,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.biogpt = BioGptModel(config)
        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = outputs[0]
        prediction_scores = self.output_projection(sequence_output)

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(
                prediction_scores,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@auto_docstring
class BioGptForTokenClassification(BioGptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.biogpt = BioGptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        else:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep the active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.biogpt = BioGptModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.score(hidden_states[:, slice_indices, :])

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None:
            sequence_length = -1
        else:
            if input_ids is not None:
                sequence_length = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
            else:
                sequence_length = -1
                logger.warning_once(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_length]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.biogpt.embed_tokens

    def set_input_embeddings(self, value):
        self.biogpt.embed_tokens = value


__all__ = [
    "BioGptForCausalLM",
    "BioGptForTokenClassification",
    "BioGptForSequenceClassification",
    "BioGptModel",
    "BioGptPreTrainedModel",
]