
     `i&                       d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+  e'j,        e-          Z. G d dej/                  Z0 G d dej/                  Z1 G d dej/                  Z2 G d dej/                  Z3 G d dej/                  Z4 G d dej/                  Z5 G d dej/                  Z6 G d  d!e          Z7 G d" d#ej/                  Z8 G d$ d%ej/                  Z9e& G d& d'e                       Z: e&d()           G d* d+e:                      Z; e&d,)           G d- d.e:e                      Z<e& G d/ d0e:                      Z= G d1 d2ej/                  Z> e&d3)           G d4 d5e:                      Z?e& G d6 d7e:                      Z@e& G d8 d9e:                      ZA G d: d;ej/                  ZBe& G d< d=e:                      ZCd@d>ZDg d?ZEdS )AzPyTorch X-MOD model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging)deprecate_kwarg   )
XmodConfigc                   2     e Zd ZdZ fdZ	 ddZd Z xZS )XmodEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                 l   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        t#          |dd          | _        |                     dt)          j        |j                                      d          d           |                     d	t)          j        | j                                        t(          j        
          d           |j        | _        t          j        |j        |j        | j                  | _	        d S )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r    F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr(   register_buffertorcharangeexpandzerosr*   sizelongr%   selfconfig	__class__s     z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/xmod/modeling_xmod.pyr1   zXmodEmbeddings.__init__7   s}   !|F,=v?Q_e_rsss#%<0NPVPb#c#c %'\&2H&J\%]%]" f&8f>STTTz&"<=='.v7PR\']']$EL)GHHOOPWXXej 	 	
 	
 	
 	ek$*;*@*@*B*B%*UUUbg 	 	
 	
 	

 ".#%<*F,>DL\$
 $
 $
       Nr   c                    |.|t          || j        |          }n|                     |          }||                                }n|                                d d         }|d         }|mt	          | d          r2| j        d d d |f         }|                    |d         |          }	|	}n+t          j        |t          j	        | j
        j                  }||                     |          }|                     |          }
||
z   }| j        dk    r|                     |          }||z  }|                     |          }|                     |          }|S )Nr+   r    r-   r   r/   devicer)   )"create_position_ids_from_input_idsr%   &create_position_ids_from_inputs_embedsrF   hasattrr-   rD   rB   rE   rG   r*   rP   r6   r:   r(   r8   r;   r?   )rI   	input_idsr-   r*   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr:   
embeddingsr8   s                rL   forwardzXmodEmbeddings.forwardP   s{    $A)TM]_uvv#JJ=YY #..**KK',,..ss3K ^

 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M $ : :> J J"%::
':55"&":":<"H"H--J^^J//
\\*--
rM   c                    |                                 dd         }|d         }t          j        | j        dz   || j        z   dz   t          j        |j                  }|                    d                              |          S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr+   r    rO   r   )rF   rB   rC   r%   rG   rP   	unsqueezerD   )rI   rU   rW   sequence_lengthr*   s        rL   rR   z5XmodEmbeddings.create_position_ids_from_inputs_embedsx   s     $((**3B3/%a.|q /D4D"Dq"HPUPZcpcw
 
 
 %%a((//<<<rM   )NNNNr   )__name__
__module____qualname____doc__r1   r\   rR   __classcell__rK   s   @rL   r#   r#   1   sm         

 
 
 
 
4 rs& & & &P= = = = = = =rM   r#   c                       e Zd Zd fd	Z eddd          	 	 	 	 	 	 ddej        d	eej                 d
eej                 deej                 dee	         dee
         deej                 deej                 fd            Z xZS )XmodSelfAttentionNc                 R   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j        |j                  | _        |pt#          |dd          | _        | j        dk    s| j        d	k    r6|j        | _        t          j        d
|j        z  dz
  | j                  | _        |j        | _        || _        d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r(   r)   relative_keyrelative_key_query   r    )r0   r1   r4   num_attention_headsrS   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer=   attention_probs_dropout_probr?   r@   r(   r7   r2   distance_embedding
is_decoder	layer_idxrI   rJ   r(   rz   rK   s       rL   r1   zXmodSelfAttention.__init__   s    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
z&"EFF'> (
'-zC
 C
$ '>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD# +"rM   past_key_valuepast_key_values4.58new_nameversionFhidden_statesattention_mask	head_maskencoder_hidden_statesoutput_attentionscache_positionreturnc                    |j         \  }}	}
|                     |          }|                    |d| j        | j                                      dd          }d}|d u}|Ht          |t                    r1|j        	                    | j
                  }|r|j        }n
|j        }n|}|r|n|}|r3|1|r/|j        | j
                 j        }|j        | j
                 j        }n|                     |          }|                    |d| j        | j                                      dd          }|                     |          }|                    |d| j        | j                                      dd          }|N|s|nd }|                    ||| j
        d|i          \  }}|r$t          |t                    rd|j        | j
        <   t'          j        ||                    dd                    }| j        dk    s| j        d	k    rt|j         d         |j         d         }}|>t'          j        |dz
  t&          j        |j        
                              dd          }n:t'          j        |t&          j        |j        
                              dd          }t'          j        |t&          j        |j        
                              dd          }||z
  }|                     || j        z   dz
            }|                    |j                  }| j        dk    rt'          j        d||          }||z   }n?| j        d	k    r4t'          j        d||          }t'          j        d||          }||z   |z   }|t?          j         | j                  z  }|||z   }tB          j"        #                    |d          }| $                    |          }|||z  }t'          j        ||          }|%                    dddd          &                                }|'                                d d         | j(        fz   }|                    |          }||fS )Nr+   r    rm   Fr   Trk   rl   rO   r.   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r	   ))shapert   viewrn   rq   	transpose
isinstancer   
is_updatedgetrz   cross_attention_cacheself_attention_cachelayerskeysvaluesru   rv   updaterB   matmulr(   tensorrG   rP   rC   rx   r7   tor/   einsummathsqrtr   
functionalsoftmaxr?   permute
contiguousrF   rr   )rI   r   r   r   r   r}   r   r   
batch_sizerX   _query_layerr   is_cross_attentioncurr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                  rL   r\   zXmodSelfAttention.forward   s    %2$7!
Jjj//!&&z2t7OQUQijjttq
 
 
2$>&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#2DW..- 	F/"=*"=+24>BGI-4T^DKKK00I!z2t7OQUQijjtt1 I **^44K%**B 8$:R i1oo  *7I!St)<)C)C{DN=M~<^* *&	; & F*_FY*Z*Z FAEO.t~> !<Y5H5HR5P5PQQ'>99T=Y]q=q=q'2'8';Y_Q=O*L*!&j1nEJWdWk!l!l!l!q!q" " "'l%*UbUi!j!j!j!o!oprtu!v!v"\*EJ}OcdddiijkmoppN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s +di8P.Q.QQ%/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCCo--rM   NNNNNNFN)r`   ra   rb   r1   r   rB   Tensorr   FloatTensorr   booltupler\   rd   re   s   @rL   rg   rg      s       # # # # # #6 _%0A6RRR 7;15=A+/,115e. e.|e. !!23e. E-.	e.
  ((9:e. "%e. $D>e. !.e. 
u|	e. e. e. SRe. e. e. e. e.rM   rg   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )XmodSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S Nr&   )r0   r1   r   rs   r4   denser;   r<   r=   r>   r?   rH   s     rL   r1   zXmodSelfOutput.__init__  sf    Yv163EFF
f&8f>STTTz&"<==rM   r   input_tensorr   c                 d    |                      |          }|                     |          }||z   }|S N)r   r?   )rI   r   r   s      rL   r\   zXmodSelfOutput.forward  s4    

=11]33%4rM   r`   ra   rb   r1   rB   r   r\   rd   re   s   @rL   r   r     si        > > > > >U\  RWR^        rM   r   c                       e Zd Zd fd	Zd Z eddd          	 	 	 	 	 	 dd	ej        d
eej	                 deej	                 deej	                 dee
         dee         deej                 deej                 fd            Z xZS )XmodAttentionNc                     t                                                       t          |||          | _        t	          |          | _        t                      | _        |j        | _        d S )Nr(   rz   )	r0   r1   rg   rI   r   outputsetpruned_headspre_normr{   s       rL   r1   zXmodAttention.__init__   sY    %fF]irsss	$V,,EErM   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r    r   )lenr   rI   rn   rq   r   r   rt   ru   rv   r   r   rr   union)rI   headsindexs      rL   prune_headszXmodAttention.prune_heads(  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::rM   r|   r}   r~   r   Fr   r   r   r   r   r   r   c           	         |}| j         r| j                            |          }|                     |||||||          }	|                     |	d         |          }
| j         s| j                            |
          }
|
f|	dd          z   }|S )Nr   r    )r   r   r;   rI   )rI   r   r   r   r   r}   r   r   residualself_outputsattention_outputoutputss               rL   r\   zXmodAttention.forward:  s     != 	A K11-@@Myy!
 
  ;;|AAA} 	G#{445EFF#%QRR(88rM   r   r   )r`   ra   rb   r1   r   r   rB   r   r   r   r   r   r   r\   rd   re   s   @rL   r   r     s       ( ( ( ( ( (; ; ;$ _%0A6RRR 7;15=A+/,115 | !!23 E-.	
  ((9: "% $D> !. 
u|	   SR    rM   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )XmodIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r0   r1   r   rs   r4   intermediate_sizer   r   
hidden_actstrr
   intermediate_act_fnrH   s     rL   r1   zXmodIntermediate.__init__Z  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$rM   r   r   c                 Z    |                      |          }|                     |          }|S r   )r   r   rI   r   s     rL   r\   zXmodIntermediate.forwardb  s,    

=1100??rM   r   re   s   @rL   r   r   Y  s^        9 9 9 9 9U\ el        rM   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )XmodAdapterc                 ~   t                                                       |j        |j        z  | _        t          j        |j        | j                  | _        t          j        | j        |j                  | _        t          |j
        t                    rt          |j
                 | _        d S |j
        | _        d S r   )r0   r1   r4   adapter_reduction_factorbottleneck_sizer   rs   dense1dense2r   r   r   r
   adapter_act_fnrH   s     rL   r1   zXmodAdapter.__init__i  s    %1V5TTi 2D4HIIi 4f6HIIf'-- 	4"():";D"("3DrM   r   r   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   r   s     rL   r\   zXmodAdapter.forwards  s=    M22++M::M22rM   r   re   s   @rL   r   r   h  s^        4 4 4 4 4U\ el        rM   r   c                        e Zd Z fdZdej        dej        dej        dej        fdZdej        dej        fdZ xZS )
XmodOutputc                 <   t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        |j	        | _	        t          j
        |j                  | _        |j        r&t          j        |j        |j                  | _        nd | _        |j        | _        t          j        i           | _        |j        D ]&}t%          |          | j        t'          |          <   'd S r   )r0   r1   r   rs   r   r4   r   r;   r<   ln_before_adapterr=   r>   r?   adapter_layer_normadapter_reuse_layer_norm
ModuleDictadapter_modules	languagesr   r   )rI   rJ   languagerK   s      rL   r1   zXmodOutput.__init__{  s    Yv79KLL
f&8f>STTT!'!9z&"<==$ 	+&(l63E6K`&a&a&aD##&*D#(.(G%!}R00( 	F 	FH2=f2E2ED X//	F 	FrM   r   r   lang_idsr   c                     |                      |          }|                     |          }||z   }|                     ||          }|S r   )r   r?   lang_adapter)rI   r   r   r   s       rL   r\   zXmodOutput.forward  sI    

=11]33%4))(MBBrM   c                    t          j        |d          \  }}| j        s|}| j        |                     |          }n| j        r|                     |          }| j        r|}t          j        ||                                d          }g }t          t          ||                    D ]|\  }\  }}	t          | j                                                  t          |                                                   }
|                     | j        |
         |	                     }t          j        |d          }|                     |          }||z  }|S )NT)return_countsr   )rB   unique_consecutiver   r   r   r;   splittolist	enumerateziplistr   r   rp   itemappendcatr?   )rI   r   r   lang_lengthsr   split_hidden_stateslang_wise_outputsilang_idsplit_hidden_statelangs              rL   r   zXmodOutput.lang_adapter  s\   !&!9(RV!W!W!W,% 	%$H". 33MBBMM* 	: NN=99M! 	%$H#k-9L9L9N9NPQRR09#hH[:\:\0]0] 	U 	U,A,+,113344S5H5HID$$%?T%9$%?@R%S%STTTT	"3Q77]33!rM   )	r`   ra   rb   r1   rB   r   r\   r   rd   re   s   @rL   r   r   z  s        F F F F FU\  Y^Ye jojv    U\ %,        rM   r   c                   >    e Zd Zd fd	Z eddd          	 	 	 	 	 	 	 ddej        d	ej        d
eej                 deej                 deej                 deej                 dee	         dee
         deej                 deej                 fd            Zd Z xZS )	XmodLayerNc                    t                                                       |j        | _        d| _        t	          ||          | _        |j        | _        |j        | _        | j        r0| j        st          |  d          t	          |d|          | _	        t          |          | _        t          |          | _        |j        | _        d S )Nr    rz   z> should be used as a decoder model if cross attention is addedr)   r   )r0   r1   chunk_size_feed_forwardseq_len_dimr   	attentionry   add_cross_attentionro   crossattentionr   intermediater   r   r   )rI   rJ   rz   rK   s      rL   r1   zXmodLayer.__init__  s    '-'E$&vCCC +#)#= # 	q? j D!h!h!hiii"/PZfo"p"p"pD,V44 ((rM   r|   r}   r~   r   Fr   r   r   r   r   encoder_attention_maskr   r   r   c
           	         |                      ||||||	          }
|
d         }|
dd          }| j        rV|Tt          | d          st          d|  d          |                     |||||||	          }|d         }||dd          z   }|}| j        r| j                            |          }t          | j	        | j
        | j        |          }|                     |||          }| j        s| j                            |          }|f|z   S )N)r   r   r   r}   r   r   r    r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   r   r   r}   r   r   )r  ry   rS   ro   r  r   r   r;   r   feed_forward_chunkr  r  )rI   r   r   r   r   r   r  r}   r   r   self_attention_outputsr   r   cross_attention_outputsr   intermediate_outputlayer_outputs                    rL   r\   zXmodLayer.forward  s|    "&)/+) "0 "
 "
 2!4(,? 	<4@4!122  Dd D D D  
 '+&9&9 5#&; /"3- ': ' '#  7q9 7 ;;G#= 	G#{445EFF7#(	
 
 {{#6(KK} 	?;00>>L((rM   c                 ,    |                      |          S r   )r  )rI   r   s     rL   r  zXmodLayer.feed_forward_chunk  s      !1222rM   r   )NNNNNFN)r`   ra   rb   r1   r   rB   r   r   r   r   r   r   r\   r  rd   re   s   @rL   r  r    s:       ( ( ( ( ( ( _%0A6RRR
 7;15=A>B+/,1156) 6)|6) ,6) !!23	6)
 E-.6)  ((9:6) !)): ;6) "%6) $D>6) !.6) 
u|	6) 6) 6) SR6)p3 3 3 3 3 3 3rM   r  c                   T    e Zd Z fdZ	 	 	 	 	 	 	 	 	 	 ddej        dej        deej                 deej                 d	eej                 d
eej                 dee         dee	         dee	         dee	         dee	         deej                 de
eej                 ef         fdZ xZS )XmodEncoderc                 D   t                                                       | _        t          j        fdt          j                  D                       | _        j        | _	        | j	        r%t          j
        j        j                  | _
        d| _        d S )Nc                 2    g | ]}t          |           S )r  )r  ).0r  rJ   s     rL   
<listcomp>z(XmodEncoder.__init__.<locals>.<listcomp>  s&    #l#l#lqIf$B$B$B#l#l#lrM   r&   F)r0   r1   rJ   r   
ModuleListrangenum_hidden_layerslayerr   is_pre_normr;   r4   r<   gradient_checkpointingrH   s    `rL   r1   zXmodEncoder.__init__  s    ]#l#l#l#lERXRjLkLk#l#l#lmm
!? 	Y\&*<&BWXXXDN&+###rM   NFTr   r   r   r   r   r  r}   	use_cacher   output_hidden_statesreturn_dictr   r   c                 "   | j         r%| j        r|rt                              d           d}|r8|6t	          t          | j                  t          | j                            }|rCt          |t                    r.t                              d           t	          j	        |          }|
rdnd }|	rdnd }|	r| j        j
        rdnd }t          | j                  D ]Z\  }}|
r||fz   }|||         nd } |||||||||	|	  	        }|d         }|	r$||d         fz   }| j        j
        r||d         fz   }[| j        r|                     |          }|
r||fz   }|st          d	 |||||fD                       S t          |||||
          S )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rJ   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`. r   r    rm   c              3      K   | ]}||V  	d S r   r-  )r!  vs     rL   	<genexpr>z&XmodEncoder.forward.<locals>.<genexpr>F  s4       
 
 =  !===
 
rM   )last_hidden_stater}   r   
attentionscross_attentions)r(  trainingloggerwarning_oncer   r   rJ   r   r   from_legacy_cacher  r   r&  r'  r;   r   )rI   r   r   r   r   r   r  r}   r)  r   r*  r+  r   all_hidden_statesall_self_attentionsall_cross_attentionsr  layer_modulelayer_head_masklayer_outputss                       rL   r\   zXmodEncoder.forward  sb    & 	"4= 	" "##p   "	 	v01,dk2R2R2RT`hlhsTtTtTtuuO 	UOU;; 	U\  
 2COTTO"6@BBD$5?bb4%6d4;;Zdrr`d(44 	V 	VOA|# I$58H$H!.7.CillO(L%&!
 
M *!,M  V&9]1=M<O&O#;2 V+?=QRCSBU+U( 	: NN=99M 	E 1]4D D 	 
 
 "#%'(
 
 
 
 
 
 9+++*1
 
 
 	
rM   )
NNNNNNFFTN)r`   ra   rb   r1   rB   r   r   r   r   r   r   r   r   r\   rd   re   s   @rL   r  r    sX       , , , , , 7;15=A>B+/$(,1/4&*15T
 T
|T
 ,T
 !!23	T

 E-.T
  ((9:T
 !)): ;T
 "%T
 D>T
 $D>T
 'tnT
 d^T
 !.T
 
uU\"$MM	NT
 T
 T
 T
 T
 T
 T
 T
rM   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )
XmodPoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r   )r0   r1   r   rs   r4   r   Tanh
activationrH   s     rL   r1   zXmodPooler.__init__\  sC    Yv163EFF
'))rM   r   r   c                 r    |d d df         }|                      |          }|                     |          }|S Nr   )r   rB  )rI   r   first_token_tensorpooled_outputs       rL   r\   zXmodPooler.forwarda  s@     +111a40

#56666rM   r   re   s   @rL   r?  r?  [  s^        $ $ $ $ $
U\ el        rM   r?  c                   :    e Zd ZU eed<   dZdZd ZdefdZ	d Z
dS )	XmodPreTrainedModelrJ   robertaTc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS t          |t                    r |j        j        	                                 dS dS )zInitialize the weightsg        )meanstdNg      ?)r   r   rs   weightdatanormal_rJ   initializer_rangebiaszero_r2   r%   r;   fill_
XmodLMHead)rI   modules     rL   _init_weightsz!XmodPreTrainedModel._init_weightsq  sX   fbi(( 	% M&&CT[5R&SSS{& &&((((( '&-- 	%M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	%K""$$$M$$S)))))
++ 	%K""$$$$$	% 	%rM   r   c           	          || j         j        vr.t          |  d| dt          | j         j                             || j         _        dS )z
        Set the default language code for the model. This is used when the language is not specified in the input.

        Args:
            language (`str`): The language code, such as `"en_XX"` or `"de_DE"`.
        z does not have an adapter for z. Supported languages: N)rJ   r   ro   r   default_language)rI   r   s     rL   set_default_languagez(XmodPreTrainedModel.set_default_language  sa     4;000uuxuuX\]a]h]rXsXsuu   (0$$$rM   c                    t                               d           | j        j                                        D ]	}d|_        
t                               d           | j        j        j        D ]^}|j        j	        (|j        j	                                        D ]	}d|_        
|j        j
                                        D ]	}d|_        
_dS )z
        Freeze the embeddings and language adapters of the model. Usually, this is applied before the model is
        fine-tuned on a downstream task.
        zFreezing embeddingsFzFreezing adaptersN)r5  inforI  r[   
parametersrequires_gradencoderr&  r   r   r   )rI   	parameterr&  s      rL   'freeze_embeddings_and_language_adaptersz;XmodPreTrainedModel.freeze_embeddings_and_language_adapters  s    
 	)***0;;== 	, 	,I&+I##'(((\)/ 	0 	0E|.:!&!@!K!K!M!M 4 4I.3I++"\9DDFF 0 0	*/	''0		0 	0rM   N)r`   ra   rb   r!   __annotations__base_model_prefixsupports_gradient_checkpointingrV  r   rY  r`  r-  rM   rL   rH  rH  j  sg         !&*#% % %$0S 0 0 0 00 0 0 0 0rM   rH  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    )custom_introc            $           e Zd Zd fd	Zd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee	j
                 dee	j
                 dee	j
                 dee	j
                 dee	j
                 dee	j
                 dee         dee         dee         dee         dee         dee	j
                 deee	j
                 ef         f d            Z xZS )	XmodModelTc                     t                                          |           || _        t          |          | _        t          |          | _        |rt          |          nd| _        | 	                                 dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r0   r1   rJ   r#   r[   r  r^  r?  pooler	post_init)rI   rJ   add_pooling_layerrK   s      rL   r1   zXmodModel.__init__  ss    
 	   (00"6**,=Gj(((4 	rM   c                     | j         j        S r   r[   r6   rI   s    rL   get_input_embeddingszXmodModel.get_input_embeddings  s    ..rM   c                     || j         _        d S r   rl  )rI   rv   s     rL   set_input_embeddingszXmodModel.set_input_embeddings  s    */'''rM   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr^  r&  r  r   )rI   heads_to_pruner&  r   s       rL   _prune_headszXmodModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	CrM   NrT   r   r   r-   r*   r   rU   r   r  r}   r)  r   r*  r+  r   r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }| j         j        r||n| j         j        }nd}||t          d          |+|                     ||           |                                }n.||                                dd         }nt          d          |\  }}||j	        n|j	        }d}|
Bt          |
t                    s|
d         d         j        d         n|
                                }|| j         j        t          d          t          | j        j        d         j        j                                                  }|                    | j         j                  }|t-          j        ||	          z  }|t-          j        |||z   f|	          }|gt1          | j        d
          r1| j        j        ddd|f         }|                    ||          }|}n!t-          j        |t,          j        |          }|                     ||          }| j         j        rL|J|                                \  }}}||f}|	t-          j        ||	          }	|                     |	          }nd}|                      || j         j!                  }|                     |||||          }|                     |||||||
|||||          } | d         }!| j"        | "                    |!          nd}"|s|!|"f| dd         z   S tG          |!|"| j$        | j%        | j&        | j'                  S )  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        NFzDYou cannot specify both input_ids and inputs_embeds at the same timer+   z5You have to specify either input_ids or inputs_embedsr   r   zPInput language unknown. Please call `XmodPreTrainedModel.set_default_language()`)rP   r-   rO   )rT   r*   r-   rU   rV   )r   r   r   r   r  r}   r)  r   r*  r+  r   r    )r1  pooler_outputr}   r   r2  r3  )(rJ   r   r*  use_return_dictry   r)  ro   %warn_if_padding_and_no_attention_maskrF   rP   r   r   r   get_seq_lengthrX  r   r^  r&  r   r   r   r   rB   onesrS   r[   r-   rD   rE   rG   get_extended_attention_maskinvert_attention_maskget_head_maskr%  rh  r   r}   r   r2  r3  )#rI   rT   r   r   r-   r*   r   rU   r   r  r}   r)  r   r*  r+  r   rW   r   rX   rP   rV   adapter_languagesdefault_lang_idrY   rZ   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputrF  s#                                      rL   r\   zXmodModel.forward  s   0 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B];! 	%.%:		@UIII ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"& "/5996"1%+B//$3355 # {+3 !sttt $T\%7%:%A%Q%V%V%X%X Y Y/55dk6RSSO&Jv)N)N)NNH!"Z*jCY6Y)ZdjkkkN!t(899 [*./*HKZK*X'3J3Q3QR\^h3i3i0!A!&[
SY!Z!Z!Z 150P0PQ_al0m0m ;! 	3&;&G=R=W=W=Y=Y: 7$68O#P %-).4HQW)X)X)X&.2.H.HI_.`.`++.2+ &&y$+2OPP	??%)'#9 + 
 
 ,,2"7#B+/!5#) ' 
 
 *!,8<8OO444UY 	J#]3oabb6III;-'+;)7&1,=
 
 
 	
rM   )T)NNNNNNNNNNNNNNN)r`   ra   rb   r1   rn  rp  rt  r   r   rB   r   
LongTensorr   r   r   r   r   r\   rd   re   s   @rL   rf  rf    s             "/ / /0 0 0C C C  -1/31515/3,0048<9=+/$(,0/3&*15!A
 A
EL)A
 5+,A
 !.	A

 !.A
 u|,A
 EL)A
  -A
  (5A
 !) 6A
 "%A
 D>A
 $D>A
 'tnA
 d^A
  !.!A
" 
uU\"$PP	Q#A
 A
 A
 ^A
 A
 A
 A
 A
rM   rf  zQ
    X-MOD Model with a `language modeling` head on top for CLM fine-tuning.
    c            &           e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 d	ee	j                 d
ee	j
                 dee	j
                 dee	j                 dee	j                 dee	j                 dee	j                 dee	j
                 dee         dee         dee         dee         dee         dee	j                 deee	j                 ef         f"d            Z xZS )XmodForCausalLMlm_head.decoder.weightlm_head.decoder.biasc                    t                                          |           |j        st                              d           t          |d          | _        t          |          | _        | 	                                 d S )NzLIf you want to use `XmodLMHeadModel` as a standalone, add `is_decoder=True.`Frj  
r0   r1   ry   r5  warningrf  rI  rT  lm_headri  rH   s     rL   r1   zXmodForCausalLM.__init__`  su         	kNNijjj 5AAA!&)) 	rM   c                     | j         j        S r   r  decoderrm  s    rL   get_output_embeddingsz%XmodForCausalLM.get_output_embeddingsm      |##rM   c                     || j         _        d S r   r  rI   new_embeddingss     rL   set_output_embeddingsz%XmodForCausalLM.set_output_embeddingsq      -rM   NrT   r   r   r-   r*   r   rU   r   r  labelsr}   r)  r   r*  r+  r   r   c                 p   ||n| j         j        }|
d}|                     |||||||||	||||||          }|d         }|                     |          }d}|
 | j        ||
fd| j         j        i|}|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j	        |j
                  S )aS  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, XmodForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
        >>> config = AutoConfig.from_pretrained("facebook/xmod-base")
        >>> config.is_decoder = True
        >>> model = XmodForCausalLM.from_pretrained("facebook/xmod-base", config=config)
        >>> model.set_default_language("en_XX")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NF)r   r   r-   r*   r   rU   r   r  r}   r)  r   r*  r+  r   r   r3   rm   )losslogitsr}   r   r2  r3  )rJ   rx  rI  r  loss_functionr3   r   r}   r   r2  r3  )rI   rT   r   r   r-   r*   r   rU   r   r  r  r}   r)  r   r*  r+  r   kwargsr   r  prediction_scoreslm_lossr   s                          rL   r\   zXmodForCausalLM.forwardt  s4   ^ &1%<kk$+B]I,,))%'"7#9+/!5#)  
 
$ "!* LL99(d(!   ;1 	 G  	L')GABBK7F,3,?WJ''VK0$#3!/)$5
 
 
 	
rM   )NNNNNNNNNNNNNNNN)r`   ra   rb   _tied_weights_keysr1   r  r  r   r   rB   r  r   r   r   r   r   r   r   r\   rd   re   s   @rL   r  r  W  s        34JK
 
 
 
 
$ $ $. . .  15/36:59371559=A>B-1+/$(,0/3&*15#[
 [
E,-[
 5+,[
 !!23	[

 !!12[
 u/0[
 E-.[
   12[
  ((9:[
 !)): ;[
 )*[
 "%[
 D>[
 $D>[
 'tn[
  d^![
" !.#[
& 
uU\"$EE	F'[
 [
 [
 ^[
 [
 [
 [
 [
rM   r  c                        e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 d	ee	j                 d
ee	j
                 dee	j
                 dee	j                 dee	j                 dee	j                 dee	j                 dee	j
                 dee         dee         dee         deee	j                 ef         fd            Z xZS )XmodForMaskedLMr  r  c                    t                                          |           |j        rt                              d           t          |d          | _        t          |          | _        | 	                                 d S )NzkIf you want to use `XmodForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  r  rH   s     rL   r1   zXmodForMaskedLM.__init__  s~        	NN1  
 !5AAA!&)) 	rM   c                     | j         j        S r   r  rm  s    rL   r  z%XmodForMaskedLM.get_output_embeddings  r  rM   c                     || j         _        d S r   r  r  s     rL   r  z%XmodForMaskedLM.set_output_embeddings  r  rM   NrT   r   r   r-   r*   r   rU   r   r  r  r   r*  r+  r   c                    ||n| j         j        }|                     |||||||||	|||          }|d         }|                     |          }d}|
Kt	                      } ||                    d| j         j                  |
                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j	                  S )a  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        N)r   r   r-   r*   r   rU   r   r  r   r*  r+  r   r+   rm   r  r  r   r2  )
rJ   rx  rI  r  r   r   r3   r   r   r2  )rI   rT   r   r   r-   r*   r   rU   r   r  r  r   r*  r+  r   r  r  masked_lm_lossloss_fctr   s                       rL   r\   zXmodForMaskedLM.forward  s   4 &1%<kk$+B],,))%'"7#9/!5#  
 
 "!* LL99'))H%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
rM   )NNNNNNNNNNNNN)r`   ra   rb   r  r1   r  r  r   r   rB   r  r   r   r   r   r   r   r\   rd   re   s   @rL   r  r    s       24JK     $ $ $. . .  15/36:59371559=A>B-1,0/3&*:
 :
E,-:
 5+,:
 !!23	:

 !!12:
 u/0:
 E-.:
   12:
  ((9::
 !)): ;:
 )*:
 $D>:
 'tn:
 d^:
 
uU\"N2	3:
 :
 :
 ^:
 :
 :
 :
 :
rM   r  c                   .     e Zd ZdZ fdZd Zd Z xZS )rT  z*Roberta Head for masked language modeling.c                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j	                  | _
        t          j        t          j        |j	                            | _        | j        | j
        _        d S r   )r0   r1   r   rs   r4   r   r;   r<   
layer_normr3   r  	ParameterrB   rE   rQ  rH   s     rL   r1   zXmodLMHead.__init__1  s    Yv163EFF
,v'9v?TUUUy!3V5FGGLV->!?!?@@	 IrM   c                     |                      |          }t          |          }|                     |          }|                     |          }|S r   )r   r   r  r  rI   featuresr  xs       rL   r\   zXmodLMHead.forward:  sE    JJx  GGOOA LLOOrM   c                     | j         j        j        j        dk    r| j        | j         _        d S | j         j        | _        d S )Nmeta)r  rQ  rP   typerm  s    rL   _tie_weightszXmodLMHead._tie_weightsD  s<     <#(F22 $	DL)DIIIrM   )r`   ra   rb   rc   r1   r\   r  rd   re   s   @rL   rT  rT  .  s\        44& & & & &  * * * * * * *rM   rT  z
    X-MOD Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   x    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee	         dee	         dee	         de
eej                 ef         fd            Z xZS )XmodForSequenceClassificationc                     t                                          |           |j        | _        || _        t	          |d          | _        t          |          | _        |                                  d S NFr  )	r0   r1   
num_labelsrJ   rf  rI  XmodClassificationHead
classifierri  rH   s     rL   r1   z&XmodForSequenceClassification.__init__U  sg        + 5AAA088 	rM   NrT   r   r   r-   r*   r   rU   r  r   r*  r+  r   c                    ||n| j         j        }|                     ||||||||	|
|
  
        }|d         }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j	        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|s|f|d	d         z   }||f|z   n|S t          |||j        |j        
          S )a  
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N	r   r   r-   r*   r   rU   r   r*  r+  r   r    
regressionsingle_label_classificationmulti_label_classificationr+   rm   r  )rJ   rx  rI  r  problem_typer  r/   rB   rG   rp   r   squeezer   r   r   r   r   r2  rI   rT   r   r   r-   r*   r   rU   r  r   r*  r+  r   r  r  r  r  r   s                     rL   r\   z%XmodForSequenceClassification.forward`  s   0 &1%<kk$+B],,))%'/!5#  
 
 "!*11{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
rM   NNNNNNNNNNN)r`   ra   rb   r1   r   r   rB   r  r   r   r   r   r   r   r\   rd   re   s   @rL   r  r  M  si       	 	 	 	 	  15/36:59371559-1,0/3&*H
 H
E,-H
 5+,H
 !!23	H

 !!12H
 u/0H
 E-.H
   12H
 )*H
 $D>H
 'tnH
 d^H
 
uU\"$<<	=H
 H
 H
 ^H
 H
 H
 H
 H
rM   r  c                   x    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee	         dee	         dee	         de
eej                 ef         fd            Z xZS )XmodForMultipleChoicec                    t                                          |           t          |          | _        t	          j        |j                  | _        t	          j        |j	        d          | _
        |                                  d S )Nr    )r0   r1   rf  rI  r   r=   r>   r?   rs   r4   r  ri  rH   s     rL   r1   zXmodForMultipleChoice.__init__  sl        ((z&"<==)F$6:: 	rM   NrT   r   r-   r   r  r*   r   rU   r   r*  r+  r   c                 @   ||n| j         j        }||j        d         n|j        d         }|)|                    d|                    d                    nd}|>|                    |                    d          |                    d          z            nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
|
  
        }|d         }|                     |          }|                     |          }|                    d|          }d}|t                      } |||          }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )	a|  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        lang_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Flatten the choice dimension so the encoder sees (batch_size * num_choices, seq_len).
        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_lang_ids = lang_ids.repeat(input_ids.size(0) * input_ids.size(1)) if lang_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            lang_ids=flat_lang_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class XmodForTokenClassification(XmodPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = XmodModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        lang_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            lang_ids=lang_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class XmodClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take the <s> token (equivalent to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


@auto_docstring
class XmodForQuestionAnswering(XmodPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = XmodModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        lang_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of the language adapters that should be activated for each sample, respectively. Default: the index
            that corresponds to `self.config.default_language`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            lang_ids=lang_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If the positions carry an extra dimension (e.g. from DataParallel), squeeze it.
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Positions outside the model inputs are clamped and then ignored by the loss.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
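
# Hedged usage sketch (not part of the original file): extractive QA, where
# start_positions/end_positions are token indices into the question+context
# sequence. Checkpoint, language code, and the indices are illustrative only.
#
#     import torch
#     from transformers import AutoTokenizer, XmodForQuestionAnswering
#
#     tokenizer = AutoTokenizer.from_pretrained("facebook/xmod-base")
#     model = XmodForQuestionAnswering.from_pretrained("facebook/xmod-base")
#     model.set_default_language("en_XX")
#
#     question, context = "Who wrote it?", "The book was written by Ada."
#     inputs = tokenizer(question, context, return_tensors="pt")
#     outputs = model(**inputs, start_positions=torch.tensor([9]), end_positions=torch.tensor([9]))
#     start = outputs.start_logits.argmax(-1)  # predicted answer span start
#     end = outputs.end_logits.argmax(-1)      # predicted answer span end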


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor
        padding_idx: int
        past_key_values_length: int

    Returns: torch.Tensor
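
    Example (illustrative, assuming `padding_idx=1` and made-up token ids)::

        input_ids    = [[0, 11, 12, 2, 1, 1]]   # 1 is the padding id
        mask         = [[1,  1,  1, 1, 0, 0]]
        cumsum*mask  = [[1,  2,  3, 4, 0, 0]]
        position_ids = [[2,  3,  4, 5, 1, 1]]   # shifted by padding_idx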
    """
    # Mask out padding, number the non-padding tokens cumulatively, and offset by
    # padding_idx so that real positions start at padding_idx + 1.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


__all__ = [
    "XmodForCausalLM",
    "XmodForMaskedLM",
    "XmodForMultipleChoice",
    "XmodForQuestionAnswering",
    "XmodForSequenceClassification",
    "XmodForTokenClassification",
    "XmodModel",
    "XmodPreTrainedModel",
]