
     `i                        d Z ddlmZmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZ ddlmZ  ej        e          Z G d dej                  Z G d dej                  Z ej!        j"        d             Z#ej!        j"        d             Z$ej!        j"        d             Z%ej!        j"        d             Z&ej!        j"        dej'        de(fd            Z)ej!        j"        dej'        dej'        fd            Z*ej!        j"        dej'        dej'        de(fd            Z+ej!        j"        dej'        dej'        fd            Z, G d dej                  Z- G d  d!ej                  Z. G d" d#ej                  Z/ G d$ d%ej                  Z0 G d& d'ej                  Z1 G d( d)e          Z2 G d* d+ej                  Z3e G d, d-e                      Z4e G d. d/e4                      Z5 G d0 d1ej                  Z6 G d2 d3ej                  Z7 G d4 d5ej                  Z8 G d6 d7ej                  Z9 G d8 d9ej                  Z:e G d: d;e4                      Z; G d< d=ej                  Z< ed>?           G d@ dAe4                      Z=e G dB dCe4                      Z>e G dD dEe4                      Z?g dFZ@dS )GzPyTorch DeBERTa model.    )OptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )DebertaConfigc                   *     e Zd ZdZd fd	Zd Z xZS )DebertaLayerNormzBLayerNorm module in the TF style (epsilon inside the square root).-q=c                    t                                                       t          j        t	          j        |                    | _        t          j        t	          j        |                    | _        || _	        d S N)
super__init__r   	Parametertorchonesweightzerosbiasvariance_epsilon)selfsizeeps	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/deberta/modeling_deberta.pyr   zDebertaLayerNorm.__init__+   s]    l5:d#3#344LT!2!233	 #    c                 V   |j         }|                                }|                    dd          }||z
                      d                              dd          }||z
  t	          j        || j        z             z  }|                    |          }| j        |z  | j	        z   }|S )NT)keepdim   )
dtypefloatmeanpowr   sqrtr#   tor    r"   )r$   hidden_states
input_typer0   varianceys         r(   forwardzDebertaLayerNorm.forward1   s    "(
%++--!!"d!33!D(--a0055b$5GG&-HtG\<\1]1]]%((44K-'$)3r)   )r   __name__
__module____qualname____doc__r   r8   __classcell__r'   s   @r(   r   r   (   sR        LL$ $ $ $ $ $      r)   r   c                   $     e Zd Z fdZd Z xZS )DebertaSelfOutputc                 
   t                                                       t          j        |j        |j                  | _        t          |j        |j                  | _        t          j	        |j
                  | _        d S r   )r   r   r   Linearhidden_sizedenser   layer_norm_eps	LayerNormDropouthidden_dropout_probdropoutr$   configr'   s     r(   r   zDebertaSelfOutput.__init__=   sa    Yv163EFF
)&*<f>STTz&"<==r)   c                     |                      |          }|                     |          }|                     ||z             }|S r   rE   rJ   rG   r$   r4   input_tensors      r(   r8   zDebertaSelfOutput.forwardC   @    

=11]33}|'CDDr)   r:   r;   r<   r   r8   r>   r?   s   @r(   rA   rA   <   sG        > > > > >      r)   rA   c                    |                      d          }|                     d          }t          j        |t          j        | j                  }t          j        |t          j        |j                  }|dddf         |                    dd                              |d          z
  }|d|ddf         }|                    d          }|S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    r.   deviceNr   r+   r   )r%   r   arangelongrV   viewrepeat	unsqueeze)query_layer	key_layer
query_sizekey_sizeq_idsk_idsrel_pos_idss          r(   build_relative_positionrc   J   s    $ !!"%%J~~b!!HL5:k>PQQQELI<LMMME4.5::a#4#4#;#;J#J#JJKkzk111n-K''**Kr)   c                     |                      |                    d          |                    d          |                    d          |                    d          g          S )Nr   r   r-   r+   expandr%   )c2p_posr\   relative_poss      r(   c2p_dynamic_expandri   g   sX    >>;++A..0@0@0C0C[EUEUVWEXEXZfZkZklnZoZopqqqr)   c                     |                      |                    d          |                    d          |                    d          |                    d          g          S )Nr   r   rT   re   )rg   r\   r]   s      r(   p2c_dynamic_expandrk   l   sV    >>;++A..0@0@0C0CY^^TVEWEWYbYgYghjYkYklmmmr)   c                     |                      |                                d d         |                     d          |                    d          fz             S )Nr-   rT   re   )	pos_indexp2c_attr]   s      r(   pos_dynamic_expandro   q   sJ    GLLNN2A2.)..2D2DinnUWFXFX1YYZZZr)   r\   scale_factorc                     t          j        t          j        |                     d          t           j                  |z            S )Nr+   r.   )r   r2   tensorr%   r/   )r\   rp   s     r(   scaled_size_sqrtrt   y   s6    :el;#3#3B#7#7u{KKKlZ[[[r)   r]   c                 ~    |                      d          |                     d          k    rt          | |          S |S NrT   )r%   rc   )r\   r]   rh   s      r(   
build_rposrw   ~   s=    y~~b1111&{I>>>r)   max_relative_positionsc           
          t          j        t          t          |                     d          |                    d                    |                    S rv   )r   rs   minmaxr%   )r\   r]   rx   s      r(   compute_attention_spanr|      sA    <C 0 0 4 4innR6H6HIIKabbcccr)   c           	          |                     d          |                     d          k    rK|d d d d d d df                             d          }t          j        | dt	          || |                    S | S )NrT   r   r+   r-   dimindex)r%   r[   r   gatherro   )rn   r\   r]   rh   rm   s        r(   uneven_size_correctedr      s~    y~~b1111 AAAqqq!,66r::	|G2DYPWYb2c2cddddr)   c                        e Zd ZdZ fdZd Z	 	 	 	 ddej        dej        ded	e	ej                 d
e	ej                 de	ej                 de
ej        e	ej                 f         fdZdej        dej        d
ej        dej        def
dZ xZS )DisentangledSelfAttentiona  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    c                 P   t                                                       |j        |j        z  dk    r t	          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j	        |j        | j        dz  d          | _
        t          j        t          j        | j        t          j                            | _        t          j        t          j        | j        t          j                            | _        |j        |j        ng | _        t%          |d	d          | _        t%          |d
d          | _        | j        rMt          j	        |j        |j        d          | _        t          j	        |j        |j        d          | _        nd | _        d | _        | j        rt%          |dd          | _        | j        dk     r|j        | _        t          j        |j                  | _        d| j        v r&t          j	        |j        | j        d          | _        d| j        v r$t          j	        |j        | j                  | _        t          j        |j                  | _        d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r	   Fr"   rr   relative_attentiontalking_headrx   r+   r   c2pp2c) r   r   rD   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   rC   in_projr   r   r!   r/   q_biasv_biaspos_att_typegetattrr   r   head_logits_projhead_weights_projrx   max_position_embeddingsrH   rI   pos_dropoutpos_proj
pos_q_projattention_probs_dropout_probrJ   rK   s     r(   r   z"DisentangledSelfAttention.__init__   so    ::a??8F$6 8 8 48 8 8   $*#= #&v'9F<V'V#W#W !58PPy!3T5G!5KRWXXXl5;0B5;#W#W#WXXl5;0B5;#W#W#WXX393F3RF//XZ")&2F"N"N#FNEBB 	*$&If.H&Jdkp$q$q$qD!%'Yv/I6Kelq%r%r%rD""$(D!%)D"" 		T*1&:RTV*W*WD'*Q...4.L+!z&*DEED))) "	&*<d>PW\ ] ] ])))"$)F,>@R"S"Sz&"EFFr)   c                     |                                 d d         | j        dfz   }|                    |          }|                    dddd          S )Nr+   r   r-   r   r	   )r%   r   rY   permute)r$   xnew_x_shapes      r(   transpose_for_scoresz.DisentangledSelfAttention.transpose_for_scores   sM    ffhhssmt'?&DDFF;yyAq!$$$r)   FNr4   attention_maskoutput_attentionsquery_statesrh   rel_embeddingsreturnc                 $    |E                      |          }                     |                              dd          \  }}	}
n> j         j                             j        dz  d           fdt          d          D             }t          j        |d         |                                	                    |d         j
                            }t          j        |d         |                                	                    |d         j
                            }t          j        |d	         |                                	                    |d	         j
                            } fd
|||fD             \  }}	}
|                      j        ddddf                   z   }|
                      j        ddddf                   z   }
d}dt           j                  z   }t          ||          }||	                    |j
                  z  }t          j        ||	                    dd                    } j        r2|0|.                     |          }                     ||	|||          }|||z   } j        A                     |                    dd	dd                                        dddd	          }|                                }|                    | t          j        |j
                  j                  }t4          j                            |d          }                     |          } j        A                     |                    dd	dd                                        dddd	          }t          j        ||
          }|                    dd	dd                                          }|                                 dd         dz   }|!                    |          }|s|dfS ||fS )a  
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.BoolTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr	   r+   r   r   c                 |    g | ]7t          j        fd t          j                  D             d          8S )c                 ,    g | ]}|d z  z            S )r	    ).0ikwss     r(   
<listcomp>z@DisentangledSelfAttention.forward.<locals>.<listcomp>.<listcomp>   s%    VVVr!a%!)}VVVr)   r   r   )r   catranger   )r   r   r$   r   s    @r(   r   z5DisentangledSelfAttention.forward.<locals>.<listcomp>   sN    qqqcdEIVVVVVeD<T6U6UVVV\]^^^qqqr)   rr   r   r-   c                 :    g | ]}                     |          S r   )r   )r   r   r$   s     r(   r   z5DisentangledSelfAttention.forward.<locals>.<listcomp>   s(    2c2c2cTU43L3LQ3O3O2c2c2cr)   rT   r+   )"r   r   chunkr    r   r   r   matmultr3   r.   r   r   lenr   rt   	transposer   r   disentangled_att_biasr   r   boolmasked_fillfinforz   r   
functionalsoftmaxrJ   r   
contiguousr%   rY   )r$   r4   r   r   r   rh   r   qpr\   r]   value_layerqkvwqr   vrel_attrp   scaleattention_scoresattention_probscontext_layernew_context_layer_shaper   s   `                     @r(   r8   z!DisentangledSelfAttention.forward   s   L m,,B262K2KB2O2O2U2UVW]_2U2`2`/KKK$**4+Ca+GQ*OOBqqqqqhmnohphpqqqDT!Wlnn&6&6&9&9Q&9&N&NOOAT!Wmoo&7&7&:&:a&:&O&OPPAT!Wmoo&7&7&:&:a&:&O&OPPA2c2c2c2cZ[]^`aYb2c2c2c/KK!D$=$=dk$PTVWVWVW->X$Y$YY!D$=$=dk$PTVWVWVW->X$Y$YY3t0111 l;;!EHH;3DH$E$EE <Y5H5HR5P5PQQ" 	u~'AlF^!--n==N00iWegsttG/'9  ,#445E5M5MaQRTUWX5Y5YZZbbcdfgijlmnn',,..+77.8I5;WbWhKiKiKmnn-//0@b/II,,77!-"44_5L5LQPQSTVW5X5XYYaabcefhiklmmO_kBB%--aAq99DDFF"/"4"4"6"6ss";e"C%**+BCC  	)!4((//r)   r\   r]   rp   c           	         |t          |||j                  }|                                dk    r)|                    d                              d          }nj|                                dk    r|                    d          }n<|                                dk    r$t	          d|                                           t          ||| j                  }|                                }|| j        |z
  | j        |z   d d f                             d          }d}d| j        v r| 	                    |          }| 
                    |          }t          j        ||                    dd	                    }	t          j        ||z   d|dz  dz
            }
t          j        |	dt!          |
||          
          }	||	z  }d| j        v r|                     |          }| 
                    |          }|t%          ||          z  }t'          |||          }t          j        | |z   d|dz  dz
            }t          j        ||                    dd	                              |j                            }t          j        |dt-          |||          
                              dd	          }t/          ||||          }||z  }|S )Nr-   r   r	   r      z2Relative position ids must be of dim 2 or 3 or 4. r   r+   rT   r~   r   rr   )rc   rV   r   r[   r   r|   rx   rX   r   r   r   r   r   r   clampr   ri   r   rt   rw   r3   r.   rk   r   )r$   r\   r]   rh   r   rp   att_spanscorepos_key_layerc2p_attrg   pos_query_layerr_posp2c_posrn   s                  r(   r   z/DisentangledSelfAttention.disentangled_att_bias$  s    2;	;K]^^L""'11!44>>qAALL1$$'11!44LL1$$fR^RbRbRdRdffggg)+y$B]^^#((**''(2T5PS[5[[]^]^]^^

)A,, 	  D%%% MM.99M 55mDDMl;0G0GB0O0OPPGk,"91hlQ>NOOGl7:LWVaco:p:pqqqGWE D%%%"oon==O"77HHO/NNNO E
 k5&8"3Q1q8HIIGl9o.G.GB.O.O.R.RYbYh.R.i.ijjGlR'9';PY'Z'Z  iB  ,G[)\ZZGWEr)   FNNN)r:   r;   r<   r=   r   r   r   Tensorr   r   tupler8   r   r   r>   r?   s   @r(   r   r      sG        $G $G $G $G $GL% % % #(/3/315U0 U0|U0 U0  	U0
 u|,U0 u|,U0 !.U0 
u|Xel33	4U0 U0 U0 U0n6\6 <6 l	6
 6 6 6 6 6 6 6 6 6r)   r   c                   *     e Zd ZdZ fdZddZ xZS )DebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 t   t                                                       t          |dd          }t          |d|j                  | _        t          j        |j        | j        |          | _        t          |dd          | _	        | j	        sd | _
        n$t          j        |j        | j                  | _
        |j        dk    r%t          j        |j        | j                  | _        nd | _        | j        |j        k    r't          j        | j        |j        d          | _        nd | _        t!          |j        |j                  | _        t          j        |j                  | _        || _        |                     d	t1          j        |j                                      d
          d           d S )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFr   position_ids)r   r+   )
persistent)r   r   r   rD   r   r   	Embedding
vocab_sizeword_embeddingsr   position_embeddingsr   type_vocab_sizetoken_type_embeddingsrC   
embed_projr   rF   rG   rH   rI   rJ   rL   register_bufferr   rW   rf   )r$   rL   r   r'   s      r(   r   zDebertaEmbeddings.__init__`  s   v~q99%f.>@RSS!|F,=t?R`lmmm%,V5Ld%S%S") 	i'+D$$')|F4RTXTg'h'hD$!A%%)+f6LdNa)b)bD&&)-D&&"444 i(;V=OV[\\\DOO"DO)&*<f>STTz&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
r)   Nc                    ||                                 }n|                                 d d         }|d         }|| j        d d d |f         }|+t          j        |t          j        | j        j                  }||                     |          }| j        (|                     |                                          }nt          j        |          }|}	| j	        r|	|z   }	| j
        | 
                    |          }
|	|
z   }	| j        |                     |	          }	|                     |	          }	||                                |	                                k    rU|                                dk    r(|                    d                              d          }|                    d          }|                    |	j                  }|	|z  }	|                     |	          }	|	S )Nr+   r   rU   r   r-   )r%   r   r   r!   rX   rV   r   r   
zeros_liker   r   r   rG   r   squeezer[   r3   r.   rJ   )r$   	input_idstoken_type_idsr   maskinputs_embedsinput_shape
seq_lengthr   
embeddingsr   s              r(   r8   zDebertaEmbeddings.forward  s    #..**KK',,..ss3K ^
,QQQ^<L!"[EJtO`OghhhN  00;;M#/"&":":<;L;L;N;N"O"O"'"2="A"A"
% 	:#&99J%1$($>$>~$N$N!#&;;J?&44J^^J//
xxzzZ^^----88::??<<??22155D~~a((77:+,,D#d*J\\*--
r)   )NNNNNr9   r?   s   @r(   r   r   ]  sR        QQ
 
 
 
 
>, , , , , , , ,r)   r   c                   h     e Zd Z fdZ	 	 	 	 ddedeej        eej                 f         fdZ	 xZ
S )DebertaAttentionc                     t                                                       t          |          | _        t	          |          | _        || _        d S r   )r   r   r   r$   rA   outputrL   rK   s     r(   r   zDebertaAttention.__init__  sB    -f55	'//r)   FNr   r   c                     |                      ||||||          \  }}||}|                     ||          }	|r|	|fS |	d fS )N)r   rh   r   )r$   r   )
r$   r4   r   r   r   rh   r   self_output
att_matrixattention_outputs
             r(   r8   zDebertaAttention.forward  sr     #'))%%) #, #
 #
Z (L;;{LAA 	,$j11$d++r)   r   r:   r;   r<   r   r   r   r   r   r   r8   r>   r?   s   @r(   r   r     s             #(, ,  	, 
u|Xel33	4, , , , , , , ,r)   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )DebertaIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r   r   r   rC   rD   intermediate_sizerE   
isinstance
hidden_actstrr
   intermediate_act_fnrK   s     r(   r   zDebertaIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r)   r4   r   c                 Z    |                      |          }|                     |          }|S r   )rE   r  r$   r4   s     r(   r8   zDebertaIntermediate.forward  s,    

=1100??r)   r:   r;   r<   r   r   r   r8   r>   r?   s   @r(   r   r     s^        9 9 9 9 9U\ el        r)   r   c                   $     e Zd Z fdZd Z xZS )DebertaOutputc                    t                                                       t          j        |j        |j                  | _        t          |j        |j                  | _	        t          j
        |j                  | _        || _        d S r   )r   r   r   rC   r   rD   rE   r   rF   rG   rH   rI   rJ   rL   rK   s     r(   r   zDebertaOutput.__init__  sh    Yv79KLL
)&*<f>STTz&"<==r)   c                     |                      |          }|                     |          }|                     ||z             }|S r   rN   rO   s      r(   r8   zDebertaOutput.forward  rQ   r)   rR   r?   s   @r(   r	  r	    sG                  r)   r	  c                   h     e Zd Z fdZ	 	 	 	 ddedeej        eej                 f         fdZ	 xZ
S )DebertaLayerc                     t                                                       t          |          | _        t	          |          | _        t          |          | _        d S r   )r   r   r   	attentionr   intermediater	  r   rK   s     r(   r   zDebertaLayer.__init__  sK    )&11/77#F++r)   NFr   r   c                     |                      ||||||          \  }}|                     |          }	|                     |	|          }
|r|
|fS |
d fS )Nr   r   rh   r   )r  r  r   )r$   r4   r   r   rh   r   r   r   r   intermediate_outputlayer_outputs              r(   r8   zDebertaLayer.forward  s~     (,~~/%%) (6 (
 (
$* #//0@AA{{#68HII 	( *-- $''r)   )NNNFr   r?   s   @r(   r  r    s        , , , , , "'( (  ( 
u|Xel33	4( ( ( ( ( ( ( (r)   r  c                   r     e Zd ZdZ fdZd Zd ZddZ	 	 	 	 	 dd	ej	        d
ej	        de
de
de
f
dZ xZS )DebertaEncoderz8Modified BertEncoder with relative position bias supportc                    t                                                       t          j        fdt	          j                  D                       | _        t          dd          | _        | j        rTt          dd          | _	        | j	        dk     rj
        | _	        t          j        | j	        dz  j                  | _        d| _        d S )Nc                 .    g | ]}t                    S r   )r  )r   _rL   s     r(   r   z+DebertaEncoder.__init__.<locals>.<listcomp>  s!    #b#b#bQL$8$8#b#b#br)   r   Frx   r+   r   r-   )r   r   r   
ModuleListr   num_hidden_layerslayerr   r   rx   r   r   rD   r   gradient_checkpointingrK   s    `r(   r   zDebertaEncoder.__init__  s    ]#b#b#b#b%H`BaBa#b#b#bcc
")&2F"N"N" 	d*1&:RTV*W*WD'*Q...4.L+"$,t/JQ/NPVPb"c"cD&+###r)   c                 0    | j         r| j        j        nd }|S r   )r   r   r    )r$   r   s     r(   get_rel_embeddingz DebertaEncoder.get_rel_embedding  s!    7;7NX,33TXr)   c                 8   |                                 dk    rT|                    d                              d          }||                    d                              d          z  }n-|                                 dk    r|                    d          }|S )Nr-   r   rT   r+   r	   )r   r[   r   )r$   r   extended_attention_masks      r(   get_attention_maskz!DebertaEncoder.get_attention_mask"  s    1$$&4&>&>q&A&A&K&KA&N&N#47N7V7VWY7Z7Z7d7deg7h7hhNN!!Q&&+55a88Nr)   Nc                 ^    | j         r%|#|t          ||          }nt          ||          }|S r   )r   rc   )r$   r4   r   rh   s       r(   get_rel_poszDebertaEncoder.get_rel_pos+  s?    " 	U|';'6|]SS6}mTTr)   TFr4   r   output_hidden_statesr   return_dictc           	         |                      |          }|                     |||          }|r|fnd }|rdnd }	|}
|                                 }t          | j                  D ]0\  }} ||
|||||          \  }}|r||fz   }||}n|}
|r|	|fz   }	1|st          d |||	fD                       S t          |||	          S )Nr   )r   rh   r   r   c              3      K   | ]}||V  	d S r   r   )r   r   s     r(   	<genexpr>z)DebertaEncoder.forward.<locals>.<genexpr>\  s(      hhqZ[ZgZgZgZgZghhr)   last_hidden_stater4   
attentions)r"  r$  r  	enumerater  r   r   )r$   r4   r   r%  r   r   rh   r&  all_hidden_statesall_attentionsnext_kvr   r   layer_moduleatt_ms                  r(   r8   zDebertaEncoder.forward3  s>    00@@''|\RROc;mM;K;Kim0:d//11(44 	; 	;OA|#/<))-"3$ $ $ M5 $ I$58H$H!','  ;!/5(!: 	ihh]4E~$Vhhhhhh+;LYg
 
 
 	
r)   )NN)TFNNT)r:   r;   r<   r=   r   r  r"  r$  r   r   r   r8   r>   r?   s   @r(   r  r    s        BB	, 	, 	, 	, 	,        &*"' ,
 ,
|,
 ,
 #	,

  ,
 ,
 ,
 ,
 ,
 ,
 ,
 ,
 ,
r)   r  c                   .    e Zd ZU eed<   dZdgZdZd ZdS )DebertaPreTrainedModelrL   debertar   Tc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j        t          f          r?|j        j                            d           |j        j        	                                 dS t          |t                    r>|j        j        	                                 |j        j        	                                 dS t          |t$          t&          f          r |j        j        	                                 dS dS )zInitialize the weights.g        )r0   stdNg      ?)r  r   rC   r    datanormal_rL   initializer_ranger"   zero_r   r   rG   r   fill_r   r   r   LegacyDebertaLMPredictionHeadDebertaLMPredictionHead)r$   modules     r(   _init_weightsz$DebertaPreTrainedModel._init_weightsi  s   fbi(( 	% M&&CT[5R&SSS{& &&((((( '&-- 	%M&&CT[5R&SSS!-"6#56<<>>>>> .-/? @AA 	%M$$S)))K""$$$$$ 9:: 	%M$$&&&M$$&&&&&!>@W XYY 	%K""$$$$$	% 	%r)   N)	r:   r;   r<   r   __annotations__base_model_prefix"_keys_to_ignore_on_load_unexpectedsupports_gradient_checkpointingr@  r   r)   r(   r4  r4  b  sF         !*?)@&&*#% % % % %r)   r4  c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 dee	j
                 d	ee	j
                 d
ee	j
                 dee         dee         dee         deeef         fd            Z xZS )DebertaModelc                     t                                          |           t          |          | _        t	          |          | _        d| _        || _        |                                  d S Nr   )	r   r   r   r   r  encoderz_stepsrL   	post_initrK   s     r(   r   zDebertaModel.__init__  s]       +F33%f--r)   c                     | j         j        S r   r   r   r$   s    r(   get_input_embeddingsz!DebertaModel.get_input_embeddings  s    ..r)   c                     || j         _        d S r   rM  r$   new_embeddingss     r(   set_input_embeddingsz!DebertaModel.set_input_embeddings  s    *8'''r)   c                      t          d          )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        z7The prune function is not implemented in DeBERTa model.)NotImplementedError)r$   heads_to_prunes     r(   _prune_headszDebertaModel._prune_heads  s    
 ""[\\\r)   Nr   r   r   r   r   r   r%  r&  r   c	           	      <    ||n j         j        }||n j         j        }||n j         j        }||t	          d          |+                     ||           |                                }	n.||                                d d         }	nt	          d          ||j        n|j        }
|t          j	        |	|
          }|!t          j
        |	t          j        |
          }                     |||||          }                     ||d||          }|d	         } j        d	k    r|d
         } fdt           j                  D             }|d         } j                                        } j                            |          } j                            |          }|d	d          D ](} |||d|||          }|                    |           )|d         }|s|f||rd	ndd          z   S t)          ||r|j        nd |j                  S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer+   z5You have to specify either input_ids or inputs_embeds)rV   rU   )r   r   r   r   r   T)r%  r   r&  r   rT   c                 4    g | ]}j         j        d          S r   )rI  r  )r   r  r$   s     r(   r   z(DebertaModel.forward.<locals>.<listcomp>  s#    JJJdl(,JJJr)   Fr  r-   r*  )rL   r   r%  use_return_dictr   %warn_if_padding_and_no_attention_maskr%   rV   r   r   r!   rX   r   rI  rJ  r   r  r"  r$  appendr   r4   r,  )r$   r   r   r   r   r   r   r%  r&  r   rV   embedding_outputencoder_outputsencoded_layersr4   layersr   r   rel_posr  sequence_outputs   `                    r(   r8   zDebertaModel.forward  s    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU%.%:!!@T!"ZFCCCN!"[EJvVVVN??)%' + 
 
 ,,!%/# ' 
 
 )+<!*2.MJJJJeDL6I6IJJJF)"-L!\;;==N!\<<^LLNl../?@@G 	4 	4$u!"&+!-!(#1      %%l3333(, 	^#%>R9YXY8\8\(]]]-;OY/77UY&1
 
 
 	
r)   )NNNNNNNN)r:   r;   r<   r   rO  rS  rW  r   r   r   r   r   r   r   r   r8   r>   r?   s   @r(   rF  rF    sJ           / / /9 9 9] ] ]  -11515/304,0/3&*N
 N
EL)N
 !.N
 !.	N

 u|,N
  -N
 $D>N
 'tnN
 d^N
 
uo%	&N
 N
 N
 ^N
 N
 N
 N
 N
r)   rF  c                   $     e Zd Z fdZd Z xZS )$LegacyDebertaPredictionHeadTransformc                    t                                                       t          |d|j                  | _        t          j        |j        | j                  | _        t          |j	        t                    rt          |j	                 | _        n|j	        | _        t          j        | j        |j                  | _        d S )Nr   )r&   )r   r   r   rD   r   r   rC   rE   r  r  r  r
   transform_act_fnrG   rF   rK   s     r(   r   z-LegacyDebertaPredictionHeadTransform.__init__  s    %f.>@RSSYv143FGG
f'-- 	6$*6+<$=D!!$*$5D!d&9v?TUUUr)   c                     |                      |          }|                     |          }|                     |          }|S r   )rE   rf  rG   r  s     r(   r8   z,LegacyDebertaPredictionHeadTransform.forward  s=    

=11--m<<}55r)   rR   r?   s   @r(   rd  rd    sL        	V 	V 	V 	V 	V      r)   rd  c                   *     e Zd Z fdZd Zd Z xZS )r=  c                 t   t                                                       t          |          | _        t	          |d|j                  | _        t          j        | j        |j	        d          | _
        t          j        t          j        |j	                            | _        | j        | j
        _        d S )Nr   Fr   )r   r   rd  	transformr   rD   r   r   rC   r   decoderr   r   r!   r"   rK   s     r(   r   z&LegacyDebertaLMPredictionHead.__init__  s    =fEE%f.>@RSS y!4f6GeTTTLV->!?!?@@	 !Ir)   c                 (    | j         | j        _         d S r   )r"   rk  rN  s    r(   _tie_weightsz*LegacyDebertaLMPredictionHead._tie_weights  s     Ir)   c                 Z    |                      |          }|                     |          }|S r   )rj  rk  r  s     r(   r8   z%LegacyDebertaLMPredictionHead.forward  s*    }55]33r)   )r:   r;   r<   r   rm  r8   r>   r?   s   @r(   r=  r=    sV        & & & & && & &      r)   r=  c                   B     e Zd Z fdZdej        dej        fdZ xZS )LegacyDebertaOnlyMLMHeadc                 p    t                                                       t          |          | _        d S r   )r   r   r=  predictionsrK   s     r(   r   z!LegacyDebertaOnlyMLMHead.__init__  s/    8@@r)   rb  r   c                 0    |                      |          }|S r   )rr  )r$   rb  prediction_scoress      r(   r8   z LegacyDebertaOnlyMLMHead.forward  s     ,,_==  r)   r  r?   s   @r(   rp  rp    sc        A A A A A!u| ! ! ! ! ! ! ! ! !r)   rp  c                   (     e Zd ZdZ fdZd Z xZS )r>  zMhttps://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270c                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          j        |j        |j        d          | _        t          j        t          j        |j                            | _        d S )NT)r&   elementwise_affine)r   r   r   rC   rD   rE   r  r  r  r
   rf  rG   rF   r   r   r!   r   r"   rK   s     r(   r   z DebertaLMPredictionHead.__init__#  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D!f&8f>ShlmmmLV->!?!?@@			r)   c                     |                      |          }|                     |          }|                     |          }t          j        ||j                                                  | j        z   }|S r   )rE   rf  rG   r   r   r    r   r"   )r$   r4   r   s      r(   r8   zDebertaLMPredictionHead.forward1  sm    

=11--m<<
 
 ]O4J4L4L4N4NOORVR[[r)   r9   r?   s   @r(   r>  r>     sR        WWA A A A A      r)   r>  c                   $     e Zd Z fdZd Z xZS )DebertaOnlyMLMHeadc                 p    t                                                       t          |          | _        d S r   )r   r   r>  lm_headrK   s     r(   r   zDebertaOnlyMLMHead.__init__<  s,    .v66r)   c                 2    |                      ||          }|S r   )r|  )r$   rb  r   rt  s       r(   r8   zDebertaOnlyMLMHead.forwardA  s     LL/JJ  r)   rR   r?   s   @r(   rz  rz  ;  sG        7 7 7 7 7
! ! ! ! ! ! !r)   rz  c                   >    e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 d	ee	j
                 d
ee	j
                 dee	j
                 dee	j
                 dee         dee         dee         deeef         fd            Z xZS )DebertaForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                 $   t                                          |           |j        | _        t          |          | _        | j        rt          |          | _        nddg| _        t          |          | _	        | 
                                 d S )Nzlm_predictions.lm_head.weightz)deberta.embeddings.word_embeddings.weight)r   r   legacyrF  r5  rp  cls_tied_weights_keysrz  lm_predictionsrK  rK   s     r(   r   zDebertaForMaskedLM.__init__J  s       m#F++; 	=/77DHH'FHs&tD#"4V"<"<D 	r)   c                 T    | j         r| j        j        j        S | j        j        j        S r   )r  r  rr  rk  r  r|  rE   rN  s    r(   get_output_embeddingsz(DebertaForMaskedLM.get_output_embeddingsW  s)    ; 	58'//&.44r)   c                     | j         r)|| j        j        _        |j        | j        j        _        d S || j        j        _        |j        | j        j        _        d S r   )r  r  rr  rk  r"   r  r|  rE   rQ  s     r(   set_output_embeddingsz(DebertaForMaskedLM.set_output_embeddings]  sU    ; 	C+9DH ((6(;DH %%%0>D'-/=/BD',,,r)   Nr   r   r   r   r   labelsr   r%  r&  r   c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }| j        r|                     |          }n%|                     || j        j        j                  }d}|Kt                      } ||	                    d| j         j
                  |	                    d                    }|	s|f|
dd         z   }||f|z   n|S t          |||
j        |
j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r   r   r   r   r%  r&  r   r+   r   losslogitsr4   r,  )rL   rZ  r5  r  r  r  r   r   r   rY   r   r   r4   r,  )r$   r   r   r   r   r   r  r   r%  r&  outputsrb  rt  masked_lm_lossloss_fctr   s                   r(   r8   zDebertaForMaskedLM.forwarde  s=   ( &1%<kk$+B],,))%'/!5#  	
 	
 "!*; 	n $ 9 9 $ 3 3OT\E\El m m'))H%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
r)   	NNNNNNNNN)r:   r;   r<   r  r   r  r  r   r   r   r   r   r   r   r   r8   r>   r?   s   @r(   r  r  F  sI       :<Z[    5 5 5C C C  -11515/304)-,0/3&*4
 4
EL)4
 !.4
 !.	4

 u|,4
  -4
 &4
 $D>4
 'tn4
 d^4
 
un$	%4
 4
 4
 ^4
 4
 4
 4
 4
r)   r  c                   :     e Zd Z fdZd Zed             Z xZS )ContextPoolerc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        || _	        d S r   )
r   r   r   rC   pooler_hidden_sizerE   rH   pooler_dropoutrJ   rL   rK   s     r(   r   zContextPooler.__init__  sQ    Yv8&:STT
z&"788r)   c                     |d d df         }|                      |          }|                     |          }t          | j        j                 |          }|S rH  )rJ   rE   r
   rL   pooler_hidden_act)r$   r4   context_tokenpooled_outputs       r(   r8   zContextPooler.forward  sU     &aaad+]33

=11t{<=mLLr)   c                     | j         j        S r   )rL   rD   rN  s    r(   
output_dimzContextPooler.output_dim  s    {&&r)   )r:   r;   r<   r   r8   propertyr  r>   r?   s   @r(   r  r    sb               ' ' X' ' ' ' 'r)   r  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   6    e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddeej	                 deej	                 deej	                 deej	                 d	eej	                 d
eej	                 dee
         dee
         dee
         deeef         fd            Z xZS ) DebertaForSequenceClassificationc                    t                                          |           t          |dd          }|| _        t	          |          | _        t          |          | _        | j        j        }t          j
        ||          | _        t          |dd           }|| j        j        n|}t          j        |          | _        |                                  d S )N
num_labelsr-   cls_dropout)r   r   r   r  rF  r5  r  poolerr  r   rC   
classifierrL   rI   rH   rJ   rK  )r$   rL   r  r  drop_outr'   s        r(   r   z)DebertaForSequenceClassification.__init__  s       V\155
$#F++#F++[+
)J
;;6=$776>6F4;22Hz(++ 	r)   c                 4    | j                                         S r   )r5  rO  rN  s    r(   rO  z5DebertaForSequenceClassification.get_input_embeddings  s    |00222r)   c                 :    | j                             |           d S r   )r5  rS  rQ  s     r(   rS  z5DebertaForSequenceClassification.set_input_embeddings  s    )).99999r)   Nr   r   r   r   r   r  r   r%  r&  r   c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }|                     |          }|                     |          }d}|| j         j        (| j        dk    rat          j	                    }|
                    d                              |j                  } |||
                    d                    }n|                                dk    s|                    d          dk    rA|dk                                    }|                                }|                    d          dk    rt#          j        |d|                    |                    d          |                    d                              }t#          j        |d|
                    d                    }t)                      } ||
                    d| j                                                  |
                    d                    }nZt#          j        d                              |          }n1t          j        d          } ||          |z                      d                                           }n| j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt)                      } ||
                    d| j                  |
                    d                    }n*| j         j        dk    rt7                      } |||          }|	s|f|
dd         z   }||f|z   n|S t9          |||
j        |
j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r   r   r%  r&  r   r   r+   
regressionsingle_label_classificationmulti_label_classificationr  )rL   rZ  r5  r  rJ   r  problem_typer  r   r   rY   r3   r.   r   r%   nonzerorX   r   r   rf   r   r/   rs   
LogSoftmaxsumr0   r   r   r   r4   r,  )r$   r   r   r   r   r   r  r   r%  r&  r  encoder_layerr  r  r  loss_fnlabel_indexlabeled_logitsr  log_softmaxr   s                        r(   r8   z(DebertaForSequenceClassification.forward  s   & &1%<kk$+B],,))%'/!5#  	
 	
  
M22]33//{'/?a'' jllG#[[__//==F"766;;r??;;DDZZ\\Q&&&++b//Q*>*>#)Q;"7"7"9"9K#[[]]F"''**Q..)."A{'9'9+:J:J1:M:Mv{{[\~~'^'^* * "'fa9I9I"9M9M!N!N#3#5#5'x(;(;B(P(P(V(V(X(XZ`ZeZefhZiZijj$|A11&99"$-"3"3K)k&11F:??CCIIKKKDD)\99"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'fG4IV]Vh
 
 
 	
r)   r  )r:   r;   r<   r   rO  rS  r   r   r   r   r   r   r   r   r8   r>   r?   s   @r(   r  r    sN           $3 3 3: : :  -11515/304)-,0/3&*M
 M
EL)M
 !.M
 !.	M

 u|,M
  -M
 &M
 $D>M
 'tnM
 d^M
 
u..	/M
 M
 M
 ^M
 M
 M
 M
 M
r)   r  c                   *    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	ee         d
ee         dee         de	e
ef         fd            Z xZS )DebertaForTokenClassificationc                 6   t                                          |           |j        | _        t          |          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S r   )r   r   r  rF  r5  r   rH   rI   rJ   rC   rD   r  rK  rK   s     r(   r   z&DebertaForTokenClassification.__init__%  sy        +#F++z&"<==)F$68IJJ 	r)   Nr   r   r   r   r   r  r   r%  r&  r   c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|	s|f|
dd         z   }||f|z   n|S t          |||
j	        |
j
                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r+   r   r  )rL   rZ  r5  rJ   r  r   rY   r  r   r4   r,  )r$   r   r   r   r   r   r  r   r%  r&  r  rb  r  r  r  r   s                   r(   r8   z%DebertaForTokenClassification.forward0  s   " &1%<kk$+B],,))%'/!5#  	
 	
 "!*,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$fG4IV]Vh
 
 
 	
r)   r  )r:   r;   r<   r   r   r   r   r   r   r   r   r   r8   r>   r?   s   @r(   r  r  #  s       	 	 	 	 	  -11515/304)-,0/3&*-
 -
EL)-
 !.-
 !.	-

 u|,-
  --
 &-
 $D>-
 'tn-
 d^-
 
u++	,-
 -
 -
 ^-
 -
 -
 -
 -
r)   r  c                   F    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee         dee         dee         de	e
ef         fd            Z xZS )DebertaForQuestionAnsweringc                     t                                          |           |j        | _        t          |          | _        t          j        |j        |j                  | _        | 	                                 d S r   )
r   r   r  rF  r5  r   rC   rD   
qa_outputsrK  rK   s     r(   r   z$DebertaForQuestionAnswering.__init__c  se        +#F++)F$68IJJ 	r)   Nr   r   r   r   r   start_positionsend_positionsr   r%  r&  r   c           
         |
|
n| j         j        }
|                     |||||||	|
          }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d }||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|
s||f|dd          z   }||f|z   n|S t          ||||j        |j                  S )	Nr  r   r   r+   r   )ignore_indexr-   )r  start_logits
end_logitsr4   r,  )rL   rZ  r5  r  splitr   r   r   r%   r   r   r   r4   r,  )r$   r   r   r   r   r   r  r  r   r%  r&  r  rb  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                         r(   r8   z#DebertaForQuestionAnswering.forwardm  s    &1%<kk$+B],,))%'/!5#  	
 	
 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r)   )
NNNNNNNNNN)r:   r;   r<   r   r   r   r   r   r   r   r   r   r8   r>   r?   s   @r(   r  r  a  s1             -11515/3042604,0/3&*<
 <
EL)<
 !.<
 !.	<

 u|,<
  -<
 "%,/<
  -<
 $D><
 'tn<
 d^<
 
u22	3<
 <
 <
 ^<
 <
 <
 <
 <
r)   r  )r  r  r  r  rF  r4  )Ar=   typingr   r   r   r   torch.nnr   r   r   activationsr
   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_debertar   
get_loggerr:   loggerModuler   rA   jitscriptrc   ri   rk   ro   r   r   rt   rw   r|   r   r   r   r   r   r	  r  r  r4  rF  rd  r=  rp  r>  rz  r  r  r  r  r  __all__r   r)   r(   <module>r     s     " " " " " " " "        A A A A A A A A A A ! ! ! ! ! ! 9 9 9 9 9 9              . - - - - - , , , , , , , , 0 0 0 0 0 0 
	H	%	%    ry   (    	      8 r r r n n n [ [ [ \%, \c \ \ \ \ EL U\     d d dgj d d d d      C C C C C	 C C CLN N N N N	 N N Nb, , , , ,ry , , ,F    ")       BI   ( ( ( ( (- ( ( (BO
 O
 O
 O
 O
RY O
 O
 O
d % % % % %_ % % %8 g
 g
 g
 g
 g
) g
 g
 g
T    29   &    BI   2! ! ! ! !ry ! ! !    bi   6! ! ! ! ! ! ! ! S
 S
 S
 S
 S
/ S
 S
 S
l' ' ' ' 'BI ' ' ',   g
 g
 g
 g
 g
'= g
 g
 g
T :
 :
 :
 :
 :
$: :
 :
 :
z H
 H
 H
 H
 H
"8 H
 H
 H
V  r)   