
"""PyTorch BERT model."""

import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    _prepare_4d_attention_mask_for_sdpa,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_bert import BertConfig


logger = logging.get_logger(__name__)

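# A minimal conversion sketch for `load_tf_weights_in_bert` below (illustrative only: the
# "bert_config.json" and "bert_model.ckpt" paths are placeholders for a locally downloaded
# original TensorFlow BERT checkpoint):
#
#     from transformers import BertConfig, BertForPreTraining
#
#     config = BertConfig.from_json_file("bert_config.json")
#     model = BertForPreTraining(config)
#     load_tf_weights_in_bert(model, config, "bert_model.ckpt")
#     model.save_pretrained("./bert-converted")
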
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from the TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)
    return model


class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with the TensorFlow model variable name and
        # be able to load any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # Setting the token_type_ids to the registered buffer (all zeros) helps users trace the
        # model without passing token_type_ids explicitly
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class BertSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder
        self.layer_idx = layer_idx

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = self.query(hidden_states)
        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
            1, 2
        )

        is_updated = False
        # If this is instantiated as a cross-attention module, the keys and values come from an encoder.
        is_cross_attention = encoder_hidden_states is not None
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    curr_past_key_value = past_key_values.cross_attention_cache
                else:
                    curr_past_key_value = past_key_values.self_attention_cache
            else:
                curr_past_key_value = past_key_values

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse k, v, cross attentions
            key_layer = curr_past_key_value.layers[self.layer_idx].keys
            value_layer = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_layer = self.key(current_states)
            key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
                1, 2
            )
            value_layer = self.value(current_states)
            value_layer = value_layer.view(
                batch_size, -1, self.num_attention_heads, self.attention_head_size
            ).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value_layer to the cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_layer, value_layer = curr_past_key_value.update(
                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
                )
                # set a flag that the current layer's cross-attention cache is already filled
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if past_key_values is not None:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        return context_layer, attention_probs


class BertSdpaSelfAttention(BertSelfAttention):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx)
        self.dropout_prob = config.attention_probs_dropout_prob

    # Adapted from BertSelfAttention
    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
            logger.warning_once(
                "BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not "
                "support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. "
                "Falling back to the manual attention implementation, but specifying the manual implementation "
                "will be required from Transformers version v5.0.0 onwards. This warning can be removed using "
                'the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                past_key_values,
                output_attentions,
                cache_position,
            )

        bsz, tgt_len, _ = hidden_states.size()

        query_layer = (
            self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        )

        is_updated = False
        # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the
        # attention mask needs to be such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None
        current_states = encoder_hidden_states if is_cross_attention else hidden_states

        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    curr_past_key_value = past_key_values.cross_attention_cache
                else:
                    curr_past_key_value = past_key_values.self_attention_cache
            else:
                curr_past_key_value = past_key_values

        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse k, v, cross attentions
            key_layer = curr_past_key_value.layers[self.layer_idx].keys
            value_layer = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_layer = (
                self.key(current_states)
                .view(bsz, -1, self.num_attention_heads, self.attention_head_size)
                .transpose(1, 2)
            )
            value_layer = (
                self.value(current_states)
                .view(bsz, -1, self.num_attention_heads, self.attention_head_size)
                .transpose(1, 2)
            )

            if past_key_values is not None:
                # save all key/value_layer to the cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_layer, value_layer = curr_past_key_value.update(
                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
                )
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        # The tgt_len > 1 check is necessary to match AttentionMaskConverter.to_causal_4d, which does not
        # create a causal mask when tgt_len == 1.
        is_causal = self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_layer,
            key_layer,
            value_layer,
            attn_mask=attention_mask,
            dropout_p=self.dropout_prob if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size)

        return attn_output, None


class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


BERT_SELF_ATTENTION_CLASSES = {
    "eager": BertSelfAttention,
    "sdpa": BertSdpaSelfAttention,
}


class BertAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        self.self = BERT_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type, layer_idx=layer_idx
        )
        self.output = BertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class BertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BertAttention(config, layer_idx=layer_idx)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = BertAttention(config, position_embedding_type="absolute", layer_idx=layer_idx)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention "
                    "layers by setting `config.add_cross_attention=True`"
                )
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask=encoder_attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs
        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(attention_output, intermediate_output)
        return layer_output


class BertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BertLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if use_cache and self.config.is_decoder and past_key_values is None:
            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))

        if past_key_values is not None and self.config.is_decoder and isinstance(past_key_values, tuple):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    past_key_values,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class BertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


class BertPreTrainingHeads(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score


@auto_docstring
class BertPreTrainedModel(PreTrainedModel):
    config: BertConfig
    load_tf_weights = load_tf_weights_in_bert
    base_model_prefix = "bert"
    supports_gradient_checkpointing = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, BertLMPredictionHead):
            module.bias.data.zero_()


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`BertForPreTraining`].
    """
)
class BertForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
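    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.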
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    seq_relationship_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


@auto_docstring(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """
)
class BertModel(BertPreTrainedModel):
    _no_split_modules = ["BertEmbeddings", "BertLayer"]

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
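            When disabled, the model's `pooler_output` is `None` and only token-level hidden states are returned.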
        """
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)

        self.pooler = BertPooler(config) if add_pooling_layer else None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
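        For example, `{1: [0, 2]}` prunes heads 0 and 2 of the attention module in layer 1.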
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = (
                past_key_values[0][0].shape[2]
                if not isinstance(past_key_values, Cache)
                else past_key_values.get_seq_length()
            )

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        use_sdpa_attention_masks = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        # Expand the attention mask
        if use_sdpa_attention_masks and attention_mask.dim() == 2:
            # Expand the attention mask for SDPA: [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
            if self.config.is_decoder:
                extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                    attention_mask,
                    input_shape,
                    embedding_output,
                    past_key_values_length,
                )
            else:
                extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    attention_mask, embedding_output.dtype, tgt_len=seq_length
                )
        else:
            # We can provide a self-attention mask of dimensions
            # [batch_size, from_seq_length, to_seq_length] ourselves,
            # in which case we just need to make it broadcastable to all heads.
            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

            if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
                encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
                )
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare the head mask if needed; 1.0 in head_mask indicates we keep the head.
        # head_mask has shape [num_heads] or [num_hidden_layers x num_heads] and is converted to
        # shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    """
)
class BertForPreTraining(BertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BertForPreTrainingOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
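
        The two logits tensors have shapes `(batch_size, sequence_length, config.vocab_size)` and `(batch_size, 2)`
        respectively; for the input above that is `[1, 8, 30522]` and `[1, 2]` with the standard
        `bert-base-uncased` vocabulary.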
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return BertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Bert Model with a `language modeling` head on top for CLM fine-tuning.
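    Note that generation requires the configuration to have `is_decoder=True` (a warning is emitted in `__init__`
    otherwise), and cross-attention additionally requires `add_cross_attention=True`.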
    """
)
class BertLMHeadModel(BertPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`")

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **loss_kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
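
        Example (an illustrative sketch; assumes the public `google-bert/bert-base-uncased` checkpoint):

        ```python
        >>> from transformers import AutoTokenizer, BertLMHeadModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertLMHeadModel.from_pretrained("google-bert/bert-base-uncased", is_decoder=True)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs, labels=inputs["input_ids"])

        >>> loss = outputs.loss
        >>> logits = outputs.logits  # (batch_size, sequence_length, vocab_size)
        ```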
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs)

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@auto_docstring
class BertForMaskedLM(BertPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
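
        Example (an illustrative mask-filling sketch; assumes the public `google-bert/bert-base-uncased` checkpoint):

        ```python
        >>> from transformers import AutoTokenizer, BertForMaskedLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForMaskedLM.from_pretrained("google-bert/bert-base-uncased")

        >>> inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve the index of [MASK] and pick the highest-scoring token
        >>> mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_id = logits[0, mask_index].argmax(dim=-1)
        >>> tokenizer.decode(predicted_id)
        'paris'
        ```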
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        # add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        dummy_token = torch.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}

    @classmethod
    def can_generate(cls) -> bool:
        """
        Legacy correction: BertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
        `prepare_inputs_for_generation` method.
        Fr.   )r  s    r1   can_generatezBertForMaskedLM.can_generate1  s	     ur3   )NNNNNNNNNNNNr  )r   r   r   r  r}   r  r  r!   r   rZ   r   r   r   r   r   r   r  classmethodr  r   r   s   @r1   r  r    s       46VW    , , ,8 8 8  -11515/3,0048<9=)-,0/3&*7
 7
EL)7
 !.7
 !.	7

 u|,7
 EL)7
  -7
  (57
 !) 67
 &7
 $D>7
 'tn7
 d^7
 
uU\"N2	37
 7
 7
 ^7
rJ J J J  T    [    r3   r  zT
@auto_docstring(
    custom_intro="""
    Bert Model with a `next sentence prediction (classification)` head on top.
    """
)
class BertForNextSentencePrediction(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], NextSentencePredictorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        >>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        """
        if "next_sentence_label" in kwargs:
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
                " `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        seq_relationship_scores = self.cls(pooled_output)

        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))

        if not return_dict:
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # infer the problem type once from num_labels and the label dtype
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
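# A minimal usage sketch for the single-label path. It assumes the standard public checkpoint
# and `num_labels=2` for illustration; the classifier layer is freshly initialized, so real use
# would start from a fine-tuned checkpoint (e.g. one trained on GLUE).
#
#     import torch
#     from transformers import AutoTokenizer, BertForSequenceClassification
#
#     tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
#     model = BertForSequenceClassification.from_pretrained(
#         "google-bert/bert-base-uncased", num_labels=2
#     )
#     inputs = tokenizer("This movie was great!", return_tensors="pt")
#     labels = torch.tensor([1])  # long dtype + num_labels > 1 selects single_label_classification
#     outputs = model(**inputs, labels=labels)
#     print(outputs.loss, outputs.logits.shape)  # scalar cross-entropy loss, torch.Size([1, 2])
#
# With `num_labels=1` the same call computes an MSE regression loss, and float multi-hot labels
# with `num_labels > 1` select the BCE-with-logits multi-label path.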
r3   r  c                   \    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee         dee         dee         de	e
ej                 ef         fd            Z xZS )BertForMultipleChoicec                 4   t                                          |           t          |          | _        |j        |j        n|j        }t          j        |          | _        t          j	        |j
        d          | _        |                                  d S )Nr$   )r|   r}   r  r  r  r   r   r   r   r   r   r<   r  r  s      r1   r}   zBertForMultipleChoice.__init__  s       f%%	)/)B)NF%%TZTn 	 z"455)F$6:: 	r3   Nr   r   ry   rv   r   r   r  r   r\  r]  r   c                    |
|
n| j         j        }
||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }|                    d|          }d}|t                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr$   rw   r   r  r=   r  )r^   r  rW   r   r   r  r   r<   r   r   r   rb  )r   r   r   ry   rv   r   r   r  r   r\  r]  num_choicesr/  rs  r  reshaped_logitsr  r  r#  s                      r1   r   zBertForMultipleChoice.forward  s+   X &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqM[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 ))))%'/!5#  

 

  
]33// ++b+66'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
r3   r  )r   r   r   r}   r!   r   rZ   r   r   r   r   r   r   r   r   s   @r1   r!  r!    sL             -11515/3,004)-,0/3&*X
 X
EL)X
 !.X
 !.	X

 u|,X
 EL)X
  -X
 &X
 $D>X
 'tnX
 d^X
 
uU\"$==	>X
 X
 X
 ^X
 X
 X
 X
 X
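# A minimal usage sketch showing the `(batch_size, num_choices, sequence_length)` input layout
# this head expects: tokenize the prompt against each choice, then add the batch dimension.
# The checkpoint name is the standard public one; the choice-scoring layer is untrained here.
#
#     import torch
#     from transformers import AutoTokenizer, BertForMultipleChoice
#
#     tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
#     model = BertForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
#
#     prompt = "France has a bread law with strict rules on what is allowed in a traditional baguette."
#     choice0 = "It is eaten with a fork and a knife."
#     choice1 = "It is eaten while held in the hand."
#     encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
#     inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}  # (2, seq_len) -> (1, 2, seq_len)
#     outputs = model(**inputs, labels=torch.tensor([0]))
#     print(outputs.logits.shape)  # torch.Size([1, 2]): one score per choice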
r3   r!  c                   \    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee         dee         dee         de	e
ej                 ef         fd            Z xZS )BertForTokenClassificationc                 Z   t                                          |           |j        | _        t          |d          | _        |j        |j        n|j        }t          j        |          | _	        t          j
        |j        |j                  | _        |                                  d S NFr  )r|   r}   r  r  r  r  r   r   r   r   r   r   r<   r  r  s      r1   r}   z#BertForTokenClassification.__init__j  s        +f>>>	)/)B)NF%%TZTn 	 z"455)F$68IJJ 	r3   Nr   r   ry   rv   r   r   r  r   r\  r]  r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j	        |j
                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   rw   r=   r  )r^   r  r  r   r<   r   r   r  r   r   rb  )r   r   r   ry   rv   r   r   r  r   r\  r]  r/  r  r  r  r  r#  s                    r1   r   z"BertForTokenClassification.forwardx  s   $ &1%<kk$+B]))))%'/!5#  

 

 "!*,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r3   r  )r   r   r   r}   r!   r   rZ   r   r   r   r   r   r   r   r   s   @r1   r'  r'  h  s8             -11515/3,004)-,0/3&*2
 2
EL)2
 !.2
 !.	2

 u|,2
 EL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
uU\"$99	:2
 2
 2
 ^2
 2
 2
 2
 2
r3   r'  c                   x    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee         dee         dee         de	e
ej                 ef         fd            Z xZS )BertForQuestionAnsweringc                     t                                          |           |j        | _        t          |d          | _        t          j        |j        |j                  | _        | 	                                 d S r)  )
r|   r}   r  r  r  r   r   r   
qa_outputsr  r   s     r1   r}   z!BertForQuestionAnswering.__init__  sj        +f>>>	)F$68IJJ 	r3   Nr   r   ry   rv   r   r   start_positionsend_positionsr   r\  r]  r   c                    ||n| j         j        }|                     |||||||	|
|	  	        }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d }||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|dd          z   }||f|z   n|S t          ||||j        |j                  S )	Nr  r   r$   rw   r   )ignore_indexr=   )r  start_logits
end_logitsr   rb  )r^   r  r  r.  rN   r  r   rT   r   clampr   r   r   rb  )r   r   r   ry   rv   r   r   r/  r0  r   r\  r]  r/  r  r  r3  r4  r  ignored_indexr  
start_lossend_lossr#  s                          r1   r   z BertForQuestionAnswering.forward  s    &1%<kk$+B]))))%'/!5#  

 

 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r3   r  )r   r   r   r}   r!   r   rZ   r   r   r   r   r   r   r   r   s   @r1   r,  r,    sL             -11515/3,0042604,0/3&*>
 >
EL)>
 !.>
 !.	>

 u|,>
 EL)>
  ->
 "%,/>
  ->
 $D>>
 'tn>
 d^>
 
uU\"$@@	A>
 >
 >
 ^>
 >
 >
 >
 >
r3   r,  )r  r!  r
  r  r,  r  r'  r>  r  r  r  rm   )Ur   r   rE   r  dataclassesr   typingr   r   rZ   r   torch.nnr   r   r	   activationsr   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr    r!   r"   utils.deprecationr#   configuration_bertr%   
get_loggerr   rC   rm   Modulero   r   r   r  r!  r  r1  r:  r>  rQ  rm  ru  rz  r  r  r  r  r  r  r  r  r  r
  r  r!  r'  r,  __all__r.   r3   r1   <module>rJ     s0       				  ! ! ! ! ! ! " " " " " " " "        A A A A A A A A A A ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) w w w w w w w w 9 9 9 9 9 9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 . - - - - - l l l l l l l l l l 9 9 9 9 9 9 9 9 9 9 0 0 0 0 0 0 * * * * * * 
	H	%	%F F FR= = = = =RY = = =@B. B. B. B. B.	 B. B. B.Je! e! e! e! e!- e! e! e!P    RY    !  3 3 3 3 3BI 3 3 3l    ry          C C C C C* C C CLW
 W
 W
 W
 W
") W
 W
 W
t           ")   "    29   .! ! ! ! !bi ! ! !& & & & &bi & & &	9 	9 	9 	9 	929 	9 	9 	9 % % % % %/ % % %4   
: : : : :{ : :  :& 	  x
 x
 x
 x
 x
# x
 x
 x
v   `
 `
 `
 `
 `
, `
 `
 `
F   
W
 W
 W
 W
 W
)? W
 W
 
W
t i i i i i) i i iX   
\
 \
 \
 \
 \
$7 \
 \
 
\
~   V
 V
 V
 V
 V
$7 V
 V
 V
r g
 g
 g
 g
 g
/ g
 g
 g
T B
 B
 B
 B
 B
!4 B
 B
 B
J J
 J
 J
 J
 J
2 J
 J
 J
Z  r3   