
"""PyTorch ALBERT model."""

import math
import os
from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_albert import AlbertConfig


logger = logging.get_logger(__name__)


def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from the TF checkpoint
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        print(name)

    for name, array in zip(names, arrays):
        original_name = name

        # If saved from the TF HUB module
        name = name.replace("module/", "")

        # Renaming and simplifying
        name = name.replace("ffn_1", "ffn")
        name = name.replace("bert/", "albert/")
        name = name.replace("attention_1", "attention")
        name = name.replace("transform/", "")
        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
        name = name.replace("LayerNorm", "attention/LayerNorm")
        name = name.replace("transformer/", "")

        # The feed forward layer had an 'intermediate' step which has been abstracted away
        name = name.replace("intermediate/dense/", "")
        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")

        # ALBERT attention was split between self and output which have been abstracted away
        name = name.replace("/output/", "/")
        name = name.replace("/self/", "/")

        # The pooler is a linear layer
        name = name.replace("pooler/dense", "pooler")

        # The classifier was simplified to predictions from cls/predictions
        name = name.replace("cls/predictions", "predictions")
        name = name.replace("predictions/attention", "predictions")

        # Naming was changed to be more explicit
        name = name.replace("embeddings/attention", "embeddings")
        name = name.replace("inner_group_", "albert_layers/")
        name = name.replace("group_", "albert_layer_groups/")

        # Classifier
        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
            name = "classifier/" + name

        # No ALBERT model currently handles the next sentence prediction task
        if "seq_relationship" in name:
            name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
            name = name.replace("weights", "weight")

        name = name.split("/")

        # Ignore the gradients applied by the LAMB/ADAM optimizers.
        if (
            "adam_m" in name
            or "adam_v" in name
            or "AdamWeightDecayOptimizer" in name
            or "AdamWeightDecayOptimizer_1" in name
            or "global_step" in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue

        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        if m_name[-13:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)

        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print(f"Initialize PyTorch weight {name} from {original_name}")
        pointer.data = torch.from_numpy(array)

    return model
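
# Editorial usage sketch (not part of the upstream module): converting a TF ALBERT checkpoint with the
# helper above. The three paths are placeholders you must supply yourself; this mirrors the intent of the
# library's standalone conversion script, but treat it as an illustrative sketch rather than the canonical tool.
def _example_convert_tf_checkpoint(tf_checkpoint_path: str, albert_config_file: str, pytorch_dump_path: str) -> None:
    config = AlbertConfig.from_json_file(albert_config_file)
    # AlbertForPreTraining is defined further down in this file; it is resolved when the function is called.
    model = AlbertForPreTraining(config)
    # Copy every TF variable into the matching PyTorch parameter, then save a plain state dict.
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)
    torch.save(model.state_dict(), pytorch_dump_path)
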
U''U,c                        e Zd ZdZdef fdZ	 	 	 	 	 ddeej                 deej                 deej                 d	eej	                 d
e
dej        fdZ xZS )AlbertEmbeddingszQ
    Construct the embeddings from word, position and token_type embeddings.
    r^   c                    t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        |                     dt%          j        |j                                      d          d           t+          |dd          | _        |                     d	t%          j        | j                                        t$          j        
          d           d S )N)padding_idxepsposition_ids)r   F)
persistentposition_embedding_typeabsolutetoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsr%   layer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrZ   arangeexpandrS   ry   zerosrv   sizelongselfr^   	__class__s     rn   r   zAlbertEmbeddings.__init__   sM   !|F,=v?Tbhbuvvv#%<0NPVPe#f#f %'\&2H&J_%`%`" f&;AVWWWz&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 (/v7PR\']']$ek$*;*@*@*B*B%*UUUbg 	 	
 	
 	
 	
 	
    Nr   	input_idsr{   rv   inputs_embedspast_key_values_lengthreturnc                    ||                                 }n|                                 d d         }|d         }|| j        d d |||z   f         }|mt          | d          r2| j        d d d |f         }|                    |d         |          }	|	}n+t          j        |t
          j        | j        j                  }|| 	                    |          }| 
                    |          }
||
z   }| j        dk    r|                     |          }||z  }|                     |          }|                     |          }|S )Nrw   r   r{   r   r}   devicerz   )r   rv   hasattrr{   r   rZ   r   r   r   r   r   ry   r   r%   r   )r   r   r{   rv   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   r)   r   s                rn   forwardzAlbertEmbeddings.forward   sm     #..**KK',,..ss3K ^
,QQQ0FVlIl0l-lmL
 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M $ : :> J J"%::
':55"&":":<"H"H--J^^J//
\\*--
r   )NNNNr   )__name__
__module____qualname____doc__r   r   r   rZ   
LongTensorFloatTensorrU   Tensorr   __classcell__r   s   @rn   rq   rq      s         
| 
 
 
 
 
 
. 15593759&'' 'E,-' !!12' u/0	'
   12' !$' 
' ' ' ' ' ' ' 'r   rq   c                        e Zd Zdef fdZdee         ddfdZ	 	 	 ddej	        d	e
ej                 d
e
ej                 dedeeej	                 eej	        ej	        f         f         f
dZ xZS )AlbertAttentionr^   c                    t                                                       |j        |j        z  dk    r/t	          |d          st          d|j         d|j                   |j        | _        |j        | _        |j        |j        z  | _        | j        | j        z  | _        t          j	        |j        | j                  | _
        t          j	        |j        | j                  | _        t          j	        |j        | j                  | _        t          j        |j                  | _        t          j        |j                  | _        t          j	        |j        |j                  | _        t          j        |j        |j                  | _        t+                      | _        t/          |dd          | _        | j        dk    s| j        d	k    r8|j        | _        t          j        d
|j        z  dz
  | j                  | _        d S d S )Nr   r   zThe hidden size (z6) is not a multiple of the number of attention heads (rt   ry   rz   relative_keyrelative_key_queryr<   r   )r~   r   hidden_sizenum_attention_headsr   rX   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattention_dropoutr   output_dropoutdenser%   r   setpruned_headsrS   ry   r   r   distance_embeddingr   s     rn   r   zAlbertAttention.__init__   s    ::a??PVXhHiHi?7F$6 7 7 47 7  
 $*#= !-#)#59S#S !58PPYv143EFF
9V/1CDDYv143EFF
!#F,O!P!P j)CDDYv163EFF
f&8f>STTTEE'.v7PR\']']$'>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD### >r=qr   headsr   Nc                    t          |          dk    rd S t          || j        | j        | j                  \  }}t          | j        |          | _        t          | j        |          | _        t          | j        |          | _        t          | j	        |d          | _	        | j        t          |          z
  | _        | j        | j        z  | _
        | j                            |          | _        d S )Nr   r   dim)rO   r   r   r   r   r   r   r   r   r   r   union)r   r   indexs      rn   prune_headszAlbertAttention.prune_heads  s    u::??F74+T-EtGX
 
u
 (
E::
%dh66'
E::
'
EqAAA
 $(#;c%jj#H !58PP -33E::r   Fhidden_statesattention_mask	head_maskoutput_attentionsc                    |j         \  }}}|                     |          }|                     |          }	|                     |          }
|                    |d| j        | j                                      dd          }|	                    |d| j        | j                                      dd          }	|
                    |d| j        | j                                      dd          }
t          j	        ||	                    dd                    }|t          j        | j                  z  }|||z   }| j        dk    s| j        dk    r4|                                d         }t          j        |t          j        |j                                      dd          }t          j        |t          j        |j                                      dd          }||z
  }|                     || j        z   dz
            }|                    |j                  }| j        dk    rt          j        d	||          }||z   }n?| j        dk    r4t          j        d	||          }t          j        d
|	|          }||z   |z   }t,          j                            |d          }|                     |          }|||z  }t          j	        ||
          }|                    dd                              d          }|                     |          }|                     |          }|                     ||z             }|r||fn|fS )Nrw   r   r<   r   r   r   r|   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   )rW   r   r   r   viewr   r   rV   rZ   matmulmathsqrtry   r   r   r   r   r   r   tor}   einsumr   
functionalsoftmaxr   flattenr   r   r%   )r   r   r   r   r   
batch_sizer   _query_layer	key_layervalue_layerattention_scoresposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layerprojected_context_layerprojected_context_layer_dropoutlayernormed_context_layers                           rn   r   zAlbertAttention.forward#  sR    %2$7!
Jjj//HH]++	jj//!&&z2t7OQUQijjttq
 
 NN:r43KTMeffppqrtuvv	!&&z2t7OQUQijjttq
 

 !<Y5H5HR5P5PQQ+di8P.Q.QQ%/.@'>99T=Y]q=q=q&++--a0J"\*EJ}OcdddiijlnoppN"\*EJ}OcdddiijkmoppN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s  -//0@b/II 00AA  -	9O_kBB%//155==a@@"&**]";";*.*=*=>U*V*V'$(NN=Cb3b$c$c!?Pr)?;;WpVrrr   NNF)r   r   r   r   r   listrU   r   rZ   r   r   r   boolr   tupler   r   r   s   @rn   r   r      s       u| u u u u u u8;c ;t ; ; ; ;* 7;15"'<s <s|<s !!23<s E-.	<s
  <s 
uU\"E%,*D$EE	F<s <s <s <s <s <s <s <sr   r   c                        e Zd Z fdZ	 	 	 d
dej        deej                 deej                 dede	e
ej                 e
ej        ej        f         f         f
 fd	Z xZS )AlbertSdpaAttentionc                 b    t                                          |           |j        | _        d S N)r~   r   r   dropout_probr   s     rn   r   zAlbertSdpaAttention.__init__c  s,       "?r   NFr   r   r   r   r   c                    | j         dk    s|r>t                              d           t                                          |||          S |                                \  }}}|                     |                              |d| j        | j	                  
                    dd          }|                     |                              |d| j        | j	                  
                    dd          }	|                     |                              |d| j        | j	                  
                    dd          }
t          j        j                            ||	|
|| j        r| j        ndd	          }|
                    dd          }|                    ||| j                  }|                     |          }|                     |          }|                     ||z             }|fS )
Nrz   a  AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` . Falling back to the eager attention implementation, but specifying the eager implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)r   rw   r   r<           F)r   r   r   	attn_mask	dropout_p	is_causal)ry   rB   warningr~   r   r   r   r   r   r   rV   r   r   rZ   r   r   scaled_dot_product_attentiontrainingr   reshaper   r   r   r%   )r   r   r   r   r   r   seq_lenr   r   r   r   attention_outputr   r   r   r   s                  rn   r   zAlbertSdpaAttention.forwardg  s    ':559J5NNH   77??=.Te?fff!.!3!3!5!5
GQJJ}%%T*b$":D<TUUYq!__ 	 HH]##T*b$":D<TUUYq!__ 	 JJ}%%T*b$":D<TUUYq!__ 	 !8.KK$+/=Ad''c L 
 
 ,55a;;+33JI[\\"&**-=">">*.*=*=>U*V*V'$(NN=Cb3b$c$c!)++r   r   )r   r   r   r   rZ   r   r   r   r   r   r   r   r   r   s   @rn   r   r   b  s        @ @ @ @ @ 7;15"'1, 1,|1, !!231, E-.	1,
  1, 
uU\"E%,*D$EE	F1, 1, 1, 1, 1, 1, 1, 1, 1, 1,r   r   )eagersdpac                        e Zd Zdef fdZ	 	 	 	 ddej        deej                 deej                 de	d	e	d
e
ej        ej        f         fdZdej        d
ej        fdZ xZS )AlbertLayerr^   c                    t                                                       || _        |j        | _        d| _        t          j        |j        |j                  | _	        t          |j                 |          | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        t"          |j                 | _        t          j        |j                  | _        d S )Nr   rt   )r~   r   r^   chunk_size_feed_forwardseq_len_dimr   r%   r   r   r$   ALBERT_ATTENTION_CLASSES_attn_implementationr"   r   intermediate_sizer    
ffn_outputr   
hidden_act
activationr   r   r   r   s     rn   r   zAlbertLayer.__init__  s    '-'E$%'\&2D&J_%`%`%`"1&2MNvVV9V/1IJJ)F$<f>PQQ !23z&"<==r   NFr   r   r   r   output_hidden_statesr   c                     |                      ||||          }t          | j        | j        | j        |d                   }|                     ||d         z             }|f|dd          z   S )Nr   r   )r"   r   ff_chunkr  r  r$   )r   r   r   r   r   r
  r   r  s           rn   r   zAlbertLayer.forward  s~      >>-Teff.M(Q	
 

 22:@PQR@S3STT"2122"666r   r   c                     |                      |          }|                     |          }|                     |          }|S r   )r    r	  r  )r   r   r  s      rn   r  zAlbertLayer.ff_chunk  s<    XX.//
__Z00
__Z00
r   NNFF)r   r   r   r   r   rZ   r   r   r   r   r   r   r  r   r   s   @rn   r   r     s        >| > > > > > >  7;15"'%*7 7|7 !!237 E-.	7
  7 #7 
u|U\)	*7 7 7 7( %,        r   r   c                        e Zd Zdef fdZ	 	 	 	 ddej        deej                 deej                 de	d	e	d
e
eej        e
ej                 f         df         fdZ xZS )AlbertLayerGroupr^   c                     t                                                       t          j        fdt	          j                  D                       | _        d S )Nc                 .    g | ]}t                    S  )r   .0r   r^   s     rn   
<listcomp>z-AlbertLayerGroup.__init__.<locals>.<listcomp>  s!    +g+g+gAK,?,?+g+g+gr   )r~   r   r   
ModuleListrangeinner_group_numalbert_layersr   s    `rn   r   zAlbertLayerGroup.__init__  sR    ]+g+g+g+gvOeIfIf+g+g+ghhr   NFr   r   r   r   r
  r   .c                     d}d}t          | j                  D ]7\  }}	 |	||||         |          }
|
d         }|r||
d         fz   }|r||fz   }8|f}|r||fz   }|r||fz   }|S )Nr  r   r   )	enumerater  )r   r   r   r   r   r
  layer_hidden_stateslayer_attentionslayer_indexalbert_layerlayer_outputoutputss               rn   r   zAlbertLayerGroup.forward  s     !)243E)F)F 	M 	M%K'<~yQ\G]_pqqL(OM  I#3|A6H#H # M&9]<L&L# " 	7!4 66G 	4!1 33Gr   r  )r   r   r   r   r   rZ   r   r   r   r   r   r   r   r   r   s   @rn   r  r    s        i| i i i i i i 7;15"'%* | !!23 E-.	
   # 
uU\5#667<	=       r   r  c                        e Zd Zdef fdZ	 	 	 	 	 ddej        deej                 deej                 d	e	d
e	de	de
eef         fdZ xZS )AlbertTransformerr^   c                    t                                                       | _        t          j        j        j                  | _        t          j        fdt          j
                  D                       | _        d S )Nc                 .    g | ]}t                    S r  )r  r  s     rn   r  z.AlbertTransformer.__init__.<locals>.<listcomp>  s"    1t1t1tq2B62J2J1t1t1tr   )r~   r   r^   r   r   r   r   embedding_hidden_mapping_inr  r  num_hidden_groupsalbert_layer_groupsr   s    `rn   r   zAlbertTransformer.__init__  su    +-9V5JFL^+_+_(#%=1t1t1t1tTYZ`ZrTsTs1t1t1t#u#u   r   NFTr   r   r   r   r
  return_dictr   c           	      4   |                      |          }|r|fnd }|rdnd }|d g| j        j        z  n|}t          | j        j                  D ]}	t	          | j        j        | j        j        z            }
t	          |	| j        j        | j        j        z  z            } | j        |         |||||
z  |dz   |
z           ||          }|d         }|r||d         z   }|r||fz   }|st          d |||fD                       S t          |||          S )Nr  r   r   rw   c              3      K   | ]}||V  	d S r   r  )r  vs     rn   	<genexpr>z,AlbertTransformer.forward.<locals>.<genexpr>  s(      hhqZ[ZgZgZgZgZghhr   )last_hidden_stater   
attentions)	r'  r^   num_hidden_layersr  rU   r(  r)  r   r   )r   r   r   r   r   r
  r*  all_hidden_statesall_attentionsilayers_per_group	group_idxlayer_group_outputs                rn   r   zAlbertTransformer.forward  s    88GG0DN],,$0:d>G>OTFT[:::U^	t{455 	I 	IA"4;#@4;C`#`aa A!>A^!^_``I!D!9)!D)&66)a-K[9[[\!$" " /q1M  I!/2DR2H!H# I$58H$H! 	ihh]4E~$Vhhhhhh+;LYg
 
 
 	
r   )NNFFT)r   r   r   r   r   rZ   r   r   r   r   r   r   r   r   r   r   s   @rn   r$  r$    s        v| v v v v v v 7;15"'%* *
 *
|*
 !!23*
 E-.	*

  *
 #*
 *
 
%	&*
 *
 *
 *
 *
 *
 *
 *
r   r$  c                   ,    e Zd ZU eed<   eZdZdZd Z	dS )AlbertPreTrainedModelr^   albertTc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS t          |t                    r |j        j        	                                 dS dS )zInitialize the weights.r   )meanstdN      ?)
isinstancer   r   r0   r\   normal_r^   initializer_ranger9   zero_r   rs   r%   fill_AlbertMLMHead)r   modules     rn   _init_weightsz#AlbertPreTrainedModel._init_weights)  sX   fbi(( 	% M&&CT[5R&SSS{& &&((((( '&-- 	%M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	%K""$$$M$$S))))).. 	%K""$$$$$	% 	%r   N)
r   r   r   r   __annotations__ro   load_tf_weightsbase_model_prefix_supports_sdparF  r  r   rn   r9  r9  "  sA         /O N% % % % %r   r9  z2
@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`AlbertForPreTraining`].
    """
)
class AlbertForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    sop_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
@auto_docstring
class AlbertModel(AlbertPreTrainedModel):
    config: AlbertConfig
    base_model_prefix = "albert"

    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        if add_pooling_layer:
            self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
            self.pooler_activation = nn.Tanh()
        else:
            self.pooler = None
            self.pooler_activation = None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has
        a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT
        model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.

        These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
        while [2,3] correspond to the two inner groups of the second hidden layer.

        Any layer with in index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more
        information about head pruning
        """
        for layer, heads in heads_to_prune.items():
            group_idx = int(layer / self.config.inner_group_num)
            inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
            self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPooling, tuple]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        use_sdpa_attention_mask = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        if use_sdpa_attention_mask:
            extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                attention_mask, embedding_output.dtype, tgt_len=seq_length
            )
        else:
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]

        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    """
)
class AlbertForPreTraining(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.predictions = AlbertMLMHead(config)
        self.sop_classifier = AlbertSOPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        sentence_order_label: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[AlbertForPreTrainingOutput, tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
            sequence B), `1` indicates switched order (sequence B, then sequence A).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, AlbertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]

        prediction_scores = self.predictions(sequence_output)
        sop_scores = self.sop_classifier(pooled_output)

        total_loss = None
        if labels is not None and sentence_order_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
            total_loss = masked_lm_loss + sentence_order_loss

        if not return_dict:
            output = (prediction_scores, sop_scores) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return AlbertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            sop_logits=sop_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AlbertMLMHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]
        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.decoder(hidden_states)

        prediction_scores = hidden_states

        return prediction_scores

    def _tie_weights(self) -> None:
        # For accelerate compatibility and to not break backward compatibility
        if self.decoder.bias.device.type == "meta":
            self.decoder.bias = self.bias
        else:
            self.bias = self.decoder.bias


class AlbertSOPHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        dropout_pooled_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_pooled_output)
        return logits


@auto_docstring
class AlbertForMaskedLM(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.predictions = AlbertMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings
        self.predictions.bias = new_embeddings.bias

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlbertForMaskedLM

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_outputs = outputs[0]

        prediction_scores = self.predictions(sequence_outputs)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   L    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 deej                 deej	                 d	eej	                 d
eej                 dee
         dee
         dee
         deeef         fd            Z xZS )AlbertForSequenceClassificationr^   c                 N   t                                          |           |j        | _        || _        t	          |          | _        t          j        |j                  | _	        t          j
        |j        | j        j                  | _        |                                  d S r   )r~   r   r  r^   rR  r:  r   r   r  r   r   r   r;   rY  r   s     rn   r   z(AlbertForSequenceClassification.__init__  s        +!&))z&"@AA)F$68NOO 	r   Nr   r   r{   rv   r   r   r  r   r
  r*  r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j	        k    s|j        t          j
        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|
s|f|dd         z   }||f|z   n|S t!          |||j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   
regressionsingle_label_classificationmulti_label_classificationrw   r<   r  )r^   ri  r:  r   r;   problem_typer  r}   rZ   r   rU   r	   squeezer   r   r   r   r   r0  )r   r   r   r{   rv   r   r   r  r   r
  r*  r"  ru  r  rN  r  r  s                    rn   r   z'AlbertForSequenceClassification.forward  s   ( &1%<kk$+B]++))%'/!5#  

 

  
]33//{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
r   r  )r   r   r   r   r   r   r   rZ   r   r   r   r   r   r   r   r   r   s   @rn   r  r    sX       
| 
 
 
 
 
 
  156:59371559-1,0/3&*F
 F
E,-F
 !!23F
 !!12	F

 u/0F
 E-.F
   12F
 )*F
 $D>F
 'tnF
 d^F
 
'.	/F
 F
 F
 ^F
 F
 F
 F
 F
r   r  c                   L    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 deej                 deej	                 d	eej	                 d
eej                 dee
         dee
         dee
         deeef         fd            Z xZS )AlbertForTokenClassificationr^   c                 d   t                                          |           |j        | _        t          |d          | _        |j        |j        n|j        }t          j        |          | _	        t          j
        |j        | j        j                  | _        |                                  d S r  )r~   r   r  rR  r:  r  r   r   r   r   r   r   r^   r;   rY  )r   r^   r  r   s      rn   r   z%AlbertForTokenClassification.__init__@  s        +!&EBBB -9 **+ 	 
 z"9::)F$68NOO 	r   Nr   r   r{   rv   r   r   r  r   r
  r*  r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j	        |j
                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   rw   r<   r  )r^   ri  r:  r   r;   r   r   r  r   r   r0  )r   r   r   r{   rv   r   r   r  r   r
  r*  r"  rt  r  rN  r  r  s                    rn   r   z$AlbertForTokenClassification.forwardP  s   $ &1%<kk$+B]++))%'/!5#  

 

 "!*,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r   r  )r   r   r   r   r   r   r   rZ   r   r   r   r   r   r   r   r   r   s   @rn   r  r  >  sD       |         156:59371559-1,0/3&*2
 2
E,-2
 !!232
 !!12	2

 u/02
 E-.2
   122
 )*2
 $D>2
 'tn2
 d^2
 
$e+	,2
 2
 2
 ^2
 2
 2
 2
 2
r   r  c                   h    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 deej                 deej	                 d	eej	                 d
eej                 deej                 dee
         dee
         dee
         deeef         fd            Z xZS )AlbertForQuestionAnsweringr^   c                     t                                          |           |j        | _        t          |d          | _        t          j        |j        |j                  | _        | 	                                 d S r  )
r~   r   r  rR  r:  r   r   r   
qa_outputsrY  r   s     rn   r   z#AlbertForQuestionAnswering.__init__  sj        +!&EBBB)F$68IJJ 	r   Nr   r   r{   rv   r   r   start_positionsend_positionsr   r
  r*  r   c                    ||n| j         j        }|                     |||||||	|
|	  	        }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d }||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|dd          z   }||f|z   n|S t          ||||j        |j                  S )	Nr  r   r   rw   r   )ignore_indexr<   )rN  start_logits
end_logitsr   r0  )r^   ri  r:  r  rP   r  
contiguousrO   r   clampr   r   r   r0  )r   r   r   r{   rv   r   r   r  r  r   r
  r*  r"  rt  r  r  r  r  ignored_indexr  
start_lossend_lossr  s                          rn   r   z"AlbertForQuestionAnswering.forward  s    &1%<kk$+B]++))%'/!5#  

 

 "!*#??#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r   r  r   r   r   r   r   r   r   rZ   r   r   r   r   rM  r   r   r   r   s   @rn   r  r    sY       |        156:593715596:48,0/3&*>
 >
E,->
 !!23>
 !!12	>

 u/0>
 E-.>
   12>
 "%"23>
   01>
 $D>>
 'tn>
 d^>
 
)50	1>
 >
 >
 ^>
 >
 >
 >
 >
r   r  c                   L    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 deej                 deej	                 d	eej	                 d
eej                 dee
         dee
         dee
         deeef         fd            Z xZS )AlbertForMultipleChoicer^   c                    t                                          |           t          |          | _        t	          j        |j                  | _        t	          j        |j	        d          | _
        |                                  d S )Nr   )r~   r   rR  r:  r   r   r  r   r   r   r;   rY  r   s     rn   r   z AlbertForMultipleChoice.__init__  sl       !&))z&"@AA)F$6:: 	r   Nr   r   r{   rv   r   r   r  r   r
  r*  r   c                    |
|
n| j         j        }
||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }|                    d|          }d}|t                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        Nr   rw   r   r  r<   r  )r^   ri  rW   r   r   r:  r   r;   r   r   r   r0  )r   r   r   r{   rv   r   r   r  r   r
  r*  num_choicesr"  ru  r  reshaped_logitsrN  r  r  s                      rn   r   zAlbertForMultipleChoice.forward  s+   X &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqM[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	
 ++))%'/!5#  

 

  
]33#}== ++b+66'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
r   r  r  r   s   @rn   r  r    sX       |        156:59371559-1,0/3&*W
 W
E,-W
 !!23W
 !!12	W

 u/0W
 E-.W
   12W
 )*W
 $D>W
 'tnW
 d^W
 
)50	1W
 W
 W
 ^W
 W
 W
 W
 W
r   r  )	ro   r9  rR  rx  r  r  r  r  r  )>r   r   rD   dataclassesr   typingr   r   rZ   r   torch.nnr   r   r	   activationsr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_albertr   
get_loggerr   rB   ro   Modulerq   r   r   r  r   r  r$  r9  rM  rR  rx  rD  r|  r  r  r  r  r  __all__r  r   rn   <module>r     s      				 ! ! ! ! ! ! " " " " " " " "        A A A A A A A A A A ! ! ! ! ! ! K K K K K K                  . - - - - -         
 : 9 9 9 9 9 9 9 9 9 . . . . . . 
	H	%	%{ { {|A A A A Ary A A AHks ks ks ks ksbi ks ks ks\6, 6, 6, 6, 6,/ 6, 6, 6,t   & & & & &") & & &R         ry      F2
 2
 2
 2
 2
	 2
 2
 2
j % % % % %O % % %2   
: : : : : : :  :& G
 G
 G
 G
 G
' G
 G
 G
T   c
 c
 c
 c
 c
0 c
 c
 c
L* * * * *BI * * *<
 
 
 
 
BI 
 
 
 f
 f
 f
 f
 f
- f
 f
 f
R   T
 T
 T
 T
 T
&; T
 T
 T
n D
 D
 D
 D
 D
#8 D
 D
 D
N J
 J
 J
 J
 J
!6 J
 J
 J
Z c
 c
 c
 c
 c
3 c
 c
 c
L
 
 
r   