
     `io                      d Z ddlmZ ddlZddlmZ ddlZddlZ	ddl
mZ ddlmZmZmZmZmZ ddlmZmZmZmZmZmZmZmZmZ dd	lmZmZmZ dd
l m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  e$j'        e(          Z)dZ*dZ+ G d dej,        j-                  Z. G d dej,        j-                  Z/ G d dej,        j-                  Z0 G d dej,        j-                  Z1 G d dej,        j-                  Z2 G d dej,        j-                  Z3 G d dej,        j-                  Z4 G d dej,        j-                  Z5 G d d ej,        j-                  Z6 G d! d"ej,        j-                  Z7d# Z8d$ Z9d% Z:d& Z;d' Z< G d( d)ej,        j-                  Z= G d* d+ej,        j-                  Z> G d, d-ej,        j-                  Z? G d. d/ej,        j-                  Z@ G d0 d1ej,        j-                  ZA G d2 d3ej,        j-                  ZB G d4 d5e          ZCd6ZDd7ZE e"d8eD           G d9 d:eC                      ZF e"d;eD           G d< d=eCe                      ZG e"d>eD           G d? d@eCe                      ZH e"dAeD           G dB dCeCe                      ZI e"dDeD           G dE dFeCe                      ZJg dGZKdS )HzTF 2.0 DeBERTa model.    )annotationsN)Sequence   )get_tf_activation)TFBaseModelOutputTFMaskedLMOutputTFQuestionAnsweringModelOutputTFSequenceClassifierOutputTFTokenClassifierOutput)	TFMaskedLanguageModelingLossTFModelInputTypeTFPreTrainedModelTFQuestionAnsweringLossTFSequenceClassificationLossTFTokenClassificationLossget_initializerkerasunpack_inputs)check_embeddings_within_bounds
shape_liststable_softmax)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DebertaConfigr   zkamalkraj/deberta-basec                  J     e Zd Zd fdZdddZedd
            ZddZ xZS )TFDebertaContextPoolerconfigr   c                     t                      j        di | t          j                            |j        d          | _        t          |j        d          | _	        || _
        d S )Ndensenamedropout )super__init__r   layersDensepooler_hidden_sizer"   TFDebertaStableDropoutpooler_dropoutr%   r    selfr    kwargs	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/deberta/modeling_tf_deberta.pyr(   zTFDebertaContextPooler.__init__9   sa    ""6"""\''(A'PP
-f.C)TTT    Ftrainingboolc                    |d d df         }|                      ||          }|                     |          } t          | j        j                  |          }|S )Nr   r4   )r%   r"   r   r    pooler_hidden_act)r/   hidden_statesr4   context_tokenpooled_outputs        r2   callzTFDebertaContextPooler.call?   s_     &aaad+]XFF

=11H)$+*GHHWWr3   returnintc                    | j         j        S N)r    hidden_sizer/   s    r2   
output_dimz!TFDebertaContextPooler.output_dimH   s    {&&r3   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           St          j        | j	        j                  5  | j	                            d            d d d            d S # 1 swxY w Y   d S d S )NTr"   r%   )
builtgetattrtf
name_scoper"   r$   buildr    r+   r%   r/   input_shapes     r2   rI   zTFDebertaContextPooler.buildL   so   : 	F
4$''3tz// O O
  $dk.L!MNNNO O O O O O O O O O O O O O O4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) 65s$    (A44A8;A8.CCCr    r   Fr4   r5   )r=   r>   r@   )	__name__
__module____qualname__r(   r<   propertyrC   rI   __classcell__r1   s   @r2   r   r   8   s                  ' ' ' X'	) 	) 	) 	) 	) 	) 	) 	)r3   r   c                  ,     e Zd ZdZd fd	Zd	dZ xZS )
TFDebertaXSoftmaxa>  
    Masked Softmax which is optimized for saving memory

    Args:
        input (`tf.Tensor`): The input tensor that will apply softmax.
        mask (`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax
    c                H     t                      j        di | || _        d S Nr&   )r'   r(   axis)r/   rZ   r0   r1   s      r2   r(   zTFDebertaXSoftmax.__init__b   s+    ""6"""			r3   inputs	tf.Tensormaskc                r   t          j        t          j        |t           j                            }t          j        |t          j        t          d          | j                  |          }t          t          j        |t           j                  | j	                  }t          j        |d|          }|S )Nz-infdtype        )
rG   logical_notcastr5   wherefloatcompute_dtyper   float32rZ   )r/   r[   r]   rmaskoutputs        r2   r<   zTFDebertaXSoftmax.callf   s    rwtRW5566%vd>P!Q!Q!QSYZZbj A A A49MM%f--r3   )rW   )r[   r\   r]   r\   )rO   rP   rQ   __doc__r(   r<   rS   rT   s   @r2   rV   rV   X   s[                     r3   rV   c                  L     e Zd ZdZ fdZej        d             Zd	d
dZ xZ	S )r,   z
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    c                H     t                      j        di | || _        d S rY   )r'   r(   	drop_prob)r/   rm   r0   r1   s      r2   r(   zTFDebertaStableDropout.__init__v   s+    ""6""""r3   c                    t          j        dt           j        j        j                            d j        z
                                t          |                    z
  t           j	                  t          j
        dd j        z
  z   j                   j        dk    r2t          j        t          j        d j                  |          z  } fd}||fS )	z~
        Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob.
        r   g      ?)probs)sample_shaper_   r   ra   c                    j         dk    r2t          j        t          j        dj                  |           z  S | S )Nr   ra   r_   )rm   rG   rd   rc   rf   )upstreamr]   scaler/   s    r2   gradz-TFDebertaStableDropout.xdropout.<locals>.grad   sA    ~!!xbgc9K&L&L&LhWWZ___r3   )rG   rc   compatv1distributions	Bernoullirm   sampler   r5   convert_to_tensorrf   rd   )r/   r[   rt   r]   rs   s   `  @@r2   xdropoutzTFDebertaStableDropout.xdropoutz   s    
 wil(22t~9M2NNUUcmntcucuUvvwG
 

 $SA,>%?tGYZZZ>AXdBGCt7I$J$J$JFSSV[[F	  	  	  	  	  	  	  t|r3   Fr[   r\   r4   c                4    |r|                      |          S |S r@   )r{   )r/   r[   r4   s      r2   r<   zTFDebertaStableDropout.call   s!     	)==(((r3   rM   )r[   r\   r4   r\   )
rO   rP   rQ   rj   r(   rG   custom_gradientr{   r<   rS   rT   s   @r2   r,   r,   n   sz         # # # # #   *        r3   r,   c                  6     e Zd ZdZd	 fd	Z fdZd
dZ xZS )TFDebertaLayerNormzBLayerNorm module in the TF style (epsilon inside the square root).-q=c                V     t                      j        di | || _        || _        d S rY   )r'   r(   sizeeps)r/   r   r   r0   r1   s       r2   r(   zTFDebertaLayerNorm.__init__   s2    ""6"""	r3   c                   |                      | j        gt          j                    d          | _        |                      | j        gt          j                    d          | _        t                                          |          S )Nweight)shapeinitializerr$   bias)	
add_weightr   rG   ones_initializergammazeros_initializerbetar'   rI   )r/   rK   r1   s     r2   rI   zTFDebertaLayerNorm.build   sg    __DI;BDWDYDY`h_ii
OO49+2CWCYCY`fOgg	ww}}[)))r3   xr\   r=   c                
   t          j        |dgd          }t          j        t          j        ||z
            dgd          }t           j                            || j        z             }| j        ||z
  z  |z  | j        z   S )NrW   T)rZ   keepdims)rG   reduce_meansquaremathsqrtr   r   r   )r/   r   meanvariancestds        r2   r<   zTFDebertaLayerNorm.call   sw    ~ardT:::>")AH"5"5RD4PPPgll8dh.//zQX&,ty88r3   )r   )r   r\   r=   r\   rO   rP   rQ   rj   r(   rI   r<   rS   rT   s   @r2   r   r      sp        LL     
* * * * *
9 9 9 9 9 9 9 9r3   r   c                  2     e Zd Zd
 fdZdddZdd	Z xZS )TFDebertaSelfOutputr    r   c                *    t                      j        di | t          j                            |j        d          | _        t          j                            |j        d          | _	        t          |j        d          | _        || _        d S )Nr"   r#   	LayerNormepsilonr$   r%   r&   )r'   r(   r   r)   r*   rA   r"   LayerNormalizationlayer_norm_epsr   r,   hidden_dropout_probr%   r    r.   s      r2   r(   zTFDebertaSelfOutput.__init__   s    ""6"""\''(:'II
88AV]h8ii-f.HyYYYr3   Fr4   r5   c                    |                      |          }|                     ||          }|                     ||z             }|S )Nr7   r"   r%   r   r/   r9   input_tensorr4   s       r2   r<   zTFDebertaSelfOutput.call   sD    

=11]XFF}|'CDDr3   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j	        j                  5  | j	                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           St          j        | j
        j                  5  | j
                            d            d d d            d S # 1 swxY w Y   d S d S NTr"   r   r%   )rE   rF   rG   rH   r"   r$   rI   r    rA   r   r%   rJ   s     r2   rI   zTFDebertaSelfOutput.build   s   : 	F
4$''3tz// H H
  $dk.E!FGGGH H H H H H H H H H H H H H H4d++7t~233 L L$$dD$+2I%JKKKL L L L L L L L L L L L L L L4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) 656    (A44A8;A8.(C""C&)C&EEErL   rM   rN   r@   rO   rP   rQ   r(   r<   rI   rS   rT   s   @r2   r   r      sj                 ) ) ) ) ) ) ) )r3   r   c                  <     e Zd Zd fdZ	 	 	 	 	 dddZddZ xZS )TFDebertaAttentionr    r   c                     t                      j        di | t          |d          | _        t	          |d          | _        || _        d S )Nr/   r#   ri   r&   )r'   r(   "TFDebertaDisentangledSelfAttentionr/   r   dense_outputr    r.   s      r2   r(   zTFDebertaAttention.__init__   sT    ""6"""6vFKKK	/XFFFr3   NFr   r\   attention_maskquery_statestf.Tensor | Nonerelative_posrel_embeddingsoutput_attentionsr5   r4   r=   tuple[tf.Tensor]c           	         |                      |||||||          }||}|                     |d         ||          }	|	f|dd          z   }
|
S )Nr9   r   r   r   r   r   r4   r   r9   r   r4   r   )r/   r   )r/   r   r   r   r   r   r   r4   self_outputsattention_outputri   s              r2   r<   zTFDebertaAttention.call   s     yy&)%%)/ ! 
 
 'L,,&q/x - 
 
 #$|ABB'77r3   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr/   r   )rE   rF   rG   rH   r/   r$   rI   r   rJ   s     r2   rI   zTFDebertaAttention.build   sR   : 	F
4&&2ty~.. & &	%%%& & & & & & & & & & & & & & &4..:t0566 . .!''---. . . . . . . . . . . . . . . . . . ;:$    A''A+.A+!C		CCrL   NNNFF)r   r\   r   r\   r   r   r   r   r   r   r   r5   r4   r5   r=   r   r@   r   rT   s   @r2   r   r      s{              *.)-+/"'    :	. 	. 	. 	. 	. 	. 	. 	.r3   r   c                  0     e Zd Zd
 fdZddZdd	Z xZS )TFDebertaIntermediater    r   c                D    t                      j        di | t          j                            |j        t          |j                  d          | _        t          |j
        t                    rt          |j
                  | _        n|j
        | _        || _        d S )Nr"   unitskernel_initializerr$   r&   )r'   r(   r   r)   r*   intermediate_sizer   initializer_ranger"   
isinstance
hidden_actstrr   intermediate_act_fnr    r.   s      r2   r(   zTFDebertaIntermediate.__init__   s    ""6"""\''*vOg?h?hov ( 
 

 f'-- 	9'89J'K'KD$$'-'8D$r3   r9   r\   r=   c                \    |                      |          }|                     |          }|S Nr[   )r"   r   r/   r9   s     r2   r<   zTFDebertaIntermediate.call  s.    

-
8800??r3   Nc                   | j         rd S d| _         t          | dd           `t          j        | j        j                  5  | j                            d d | j        j        g           d d d            d S # 1 swxY w Y   d S d S )NTr"   )	rE   rF   rG   rH   r"   r$   rI   r    rA   rJ   s     r2   rI   zTFDebertaIntermediate.build  s    : 	F
4$''3tz// H H
  $dk.E!FGGGH H H H H H H H H H H H H H H H H H 43s    (A55A9<A9rL   r9   r\   r=   r\   r@   r   rT   s   @r2   r   r      sm                H H H H H H H Hr3   r   c                  2     e Zd Zd fdZdddZddZ xZS )TFDebertaOutputr    r   c                P    t                      j        di | t          j                            |j        t          |j                  d          | _        t          j        	                    |j
        d          | _        t          |j        d          | _        || _        d S )Nr"   r   r   r   r%   r#   r&   )r'   r(   r   r)   r*   rA   r   r   r"   r   r   r   r,   r   r%   r    r.   s      r2   r(   zTFDebertaOutput.__init__  s    ""6"""\''$Ia9b9bip ( 
 

 88AV]h8ii-f.HyYYYr3   Fr9   r\   r   r4   r5   r=   c                    |                      |          }|                     ||          }|                     ||z             }|S )Nr   r7   r   r   s       r2   r<   zTFDebertaOutput.call  sF    

-
88]XFF}|'CDDr3   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j	        j                  5  | j	                            d d | j        j
        g           d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S r   )rE   rF   rG   rH   r"   r$   rI   r    r   r   rA   r%   rJ   s     r2   rI   zTFDebertaOutput.build&  s   : 	F
4$''3tz// N N
  $dk.K!LMMMN N N N N N N N N N N N N N N4d++7t~233 L L$$dD$+2I%JKKKL L L L L L L L L L L L L L L4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) 65r   rL   rM   )r9   r\   r   r\   r4   r5   r=   r\   r@   r   rT   s   @r2   r   r     sj                 ) ) ) ) ) ) ) )r3   r   c                  <     e Zd Zd fdZ	 	 	 	 	 dddZddZ xZS )TFDebertaLayerr    r   c                     t                      j        di | t          |d          | _        t	          |d          | _        t          |d          | _        d S )N	attentionr#   intermediateri   r&   )r'   r(   r   r   r   r   r   bert_outputr.   s      r2   r(   zTFDebertaLayer.__init__6  sd    ""6"""+FEEE1&~NNN*6AAAr3   NFr9   r\   r   r   r   r   r   r   r5   r4   r=   r   c           	         |                      |||||||          }|d         }	|                     |	          }
|                     |
|	|          }|f|dd          z   }|S )N)r   r   r   r   r   r   r4   r   r9   r   r   )r   r   r   )r/   r9   r   r   r   r   r   r4   attention_outputsr   intermediate_outputlayer_outputoutputss                r2   r<   zTFDebertaLayer.call=  s     !NN&)%%)/ + 
 
 -Q/"//>N/OO''-<LW_ ( 
 
  /$5abb$99r3   c                r   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr   r   r   )	rE   rF   rG   rH   r   r$   rI   r   r   rJ   s     r2   rI   zTFDebertaLayer.buildY  s   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4..:t0566 . .!''---. . . . . . . . . . . . . . .4--9t/455 - - &&t,,,- - - - - - - - - - - - - - - - - - :9s6    A''A+.A+!CCCD**D.1D.rL   r   r9   r\   r   r\   r   r   r   r   r   r   r   r5   r4   r5   r=   r   r@   r   rT   s   @r2   r   r   5  s        B B B B B B *.)-+/"'    8- - - - - - - -r3   r   c                  R     e Zd Zd fdZddZd Zd ZddZ	 	 	 	 	 	 dddZ xZ	S )TFDebertaEncoderr    r   c                0    t                      j        di | fdt          j                  D             | _        t          dd          | _        | _        | j        r/t          dd          | _        | j        dk     rj	        | _        d S d S d S )Nc                8    g | ]}t          d |           S )zlayer_._r#   )r   ).0ir    s     r2   
<listcomp>z-TFDebertaEncoder.__init__.<locals>.<listcomp>l  s,    kkkanV.Q..AAAkkkr3   relative_attentionFmax_relative_positionsrW   r   r&   )
r'   r(   rangenum_hidden_layerslayerrF   r   r    r   max_position_embeddingsr.   s    ` r2   r(   zTFDebertaEncoder.__init__i  s    ""6"""kkkk5QWQiKjKjkkk
")&2F"N"N" 	M*1&:RTV*W*WD'*Q...4.L+++	M 	M..r3   Nc                   | j         rd S d| _         | j        rH|                     d| j        dz  | j        j        gt          | j        j                            | _        t          | dd           P| j
        D ]J}t          j        |j                  5  |                    d            d d d            n# 1 swxY w Y   Id S d S )NTzrel_embeddings.weight   r$   r   r   r   )rE   r   r   r   r    rA   r   r   r   rF   r   rG   rH   r$   rI   )r/   rK   r   s      r2   rI   zTFDebertaEncoder.buildt  s   : 	F
" 	"&//,2Q68OP+DK,IJJ #2 # #D
 4$''3 & &]5:.. & &KK%%%& & & & & & & & & & & & & & & 43& &s   B55B9	<B9	c                &    | j         r| j        nd }|S r@   )r   r   )r/   r   s     r2   get_rel_embeddingz"TFDebertaEncoder.get_rel_embedding  s    040GQ,,Tr3   c                   t          t          |                    dk    rst          j        t          j        |d          d          }|t          j        t          j        |d          d          z  }t          j        |t          j                  }n5t          t          |                    dk    rt          j        |d          }|S )Nr   r   rW   r   )lenr   rG   expand_dimssqueezerc   uint8)r/   r   extended_attention_masks      r2   get_attention_maskz#TFDebertaEncoder.get_attention_mask  s    z.))**a//&(nR^NTU5V5VXY&Z&Z#4r~bjQhjlFmFmoq7r7rrNW^RX>>NNN++,,11^NA>>Nr3   c                    | j         rQ|O|t          |          d         nt          |          d         }t          |t          |          d                   }|S )Nr   )r   r   build_relative_position)r/   r9   r   r   qs        r2   get_rel_poszTFDebertaEncoder.get_rel_pos  sa    " 	U|';0<0H
<((,,jYfNgNghjNkA21j6O6OPR6STTLr3   FTr9   r\   r   r   r   r   r   r5   output_hidden_statesreturn_dictr4   r=   $TFBaseModelOutput | tuple[tf.Tensor]c	           
     f   |rdnd }	|rdnd }
|                      |          }|                     |||          }t          |t                    r	|d         }n|}|                                 }t          | j                  D ]y\  }}|r|	|fz   }	 ||||||||          }|d         }|@|}t          |t                    r(|dz   t          | j                  k     r||dz            nd }n|}|r|
|d         fz   }
z|r|	|fz   }	|st          d ||	|
fD                       S t          ||	|
          S )Nr&   r   r   r   c              3     K   | ]}||V  	d S r@   r&   )r   vs     r2   	<genexpr>z(TFDebertaEncoder.call.<locals>.<genexpr>  s(      hhqZ[ZgZgZgZgZghhr3   last_hidden_stater9   
attentions)
r   r   r   r   r   	enumerater   r   tupler   )r/   r9   r   r   r   r   r  r  r4   all_hidden_statesall_attentionsnext_kvr   r   layer_modulelayer_outputss                   r2   r<   zTFDebertaEncoder.call  s    #7@BBD0:d00@@''|\RRmX.. 	$#A&GG#G//11(44 	F 	FOA|# I$58H$H!(L%-))-"3!  M *!,M',mX66 X67!ec$*oo6M6MmAE22SWG'  F!/=3C2E!E   	E 1]4D D 	ihh]4E~$Vhhhhhh +;LYg
 
 
 	
r3   rL   r@   )NN)NNFFTF)r9   r\   r   r\   r   r   r   r   r   r5   r  r5   r  r5   r4   r5   r=   r  )
rO   rP   rQ   r(   rI   r   r   r   r<   rS   rT   s   @r2   r   r   h  s        	M 	M 	M 	M 	M 	M& & & &        *.)-"'%* :
 :
 :
 :
 :
 :
 :
 :
 :
r3   r   c                z   t          j        | t           j                  }t          j        |t           j                  }|dddf         t          j        t          j        |ddg          | dg          z
  }|d| ddf         }t          j        |d          }t          j        |t           j                  S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `tf.Tensor`: A tensor with shape [1, query_size, key_size]

    r_   Nr   rW   r   rZ   )rG   r   int32tilereshaper   rc   int64)
query_sizekey_sizeq_idsk_idsrel_pos_idss        r2   r   r     s      HZrx000EHXRX...E4.272:eaW+E+E
TU#W#WWKkzk111n-K.1555K7;)))r3   c                    t          |          d         t          |          d         t          |          d         t          |          d         g}t          j        | |          S )Nr   r   r   rW   r   rG   broadcast_to)c2p_posquery_layerr   shapess       r2   c2p_dynamic_expandr#    s]    ;";";"<  $	F ?7F+++r3   c                    t          |          d         t          |          d         t          |          d         t          |          d         g}t          j        | |          S )Nr   r   r   r  )r   r!  	key_layerr"  s       r2   p2c_dynamic_expandr&    s]    ;";"9b!9b!	F ?7F+++r3   c                    t          |          d d         t          |           d         t          |          d         gz   }t          j        | |          S )Nr   r   r  )	pos_indexp2c_attr%  r"  s       r2   pos_dynamic_expandr*     sO      !$
9(=(=b(A:iCXCXY[C\']]F?9f---r3   c                f   |dk     rt          j        |           |z   }|t          j        |           dz
  k    rt          j        |           dz
  |z
  }t          j        t          j        t          j        |                     |d          }t          j        | |          } t          j        ||          }nd}t          j        | dt          j        |           d         f          }t          j        |dt          j        |          d         f          }t          j        ||d          }t          j        |t          j        |                    }|dk    rRt          j        t          j        t          j        |                     | d          }t          j        ||          }|S )Nr   r   r  permrW   )
batch_dims)rG   rankrollr   	transposer  r   gather)r   indicesgather_axispre_rollpermutationflat_xflat_indicesgathereds           r2   torch_gatherr:    se   Qgajj;.bgajj1n$$71::>K/gbhrwqzz22H1EEEL---,w[999ZBB011F:gBHW,=,=b,A'BCCLy!<<<Hz(BHW$5$566H1}}gbhrwqzz22XIAFFF<{;;;Or3   c                  N     e Zd ZdZd fdZddZdd
Z	 	 	 	 	 dddZd Z xZ	S )r   a  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    r    r   c                \    t                      j        di | |j        |j        z  dk    r t	          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j	        
                    | j        dz  t          |j                  dd          | _        |j        |j        ng | _        t          |d	d          | _        t          |d
d          | _        | j        r~t          j	        
                    | j        t          |j                  dd          | _        t          j	        
                    | j        t          |j                  dd          | _        t)          d          | _        | j        rt          |dd          | _        | j        dk     r|j        | _        t1          |j        d          | _        d| j        v r?t          j	        
                    | j        t          |j                  dd          | _        d| j        v r>t          j	        
                    | j        t          |j                  d          | _        t1          |j        d          | _        || _        d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   in_projFr   r$   use_biasr   talking_headhead_logits_projhead_weights_projrW   r  r   r   pos_dropoutr#   c2ppos_projp2c
pos_q_proj)r   r$   r%   r&   ) r'   r(   rA   num_attention_heads
ValueErrorr>   attention_head_sizeall_head_sizer   r)   r*   r   r   r>  pos_att_typerF   r   rA  rB  rC  rV   softmaxr   r   r,   r   rD  rF  rH  attention_probs_dropout_probr%   r    r.   s      r2   r(   z+TFDebertaDisentangledSelfAttention.__init__(  s   ""6""" ::a??8F$6 8 8 48 8 8   $*#= #&v'9F<V'V#W#W !58PP|))".v/GHH	 * 
 
 4:3F3RF//XZ")&2F"N"N#FNEBB 	$)L$6$6(#263K#L#L'	 %7 % %D! &+\%7%7(#263K#L#L(	 &8 & &D" )b111" 	*1&:RTV*W*WD'*Q...4.L+5f6PWdeeeD))) % 2 2&'6v7O'P'P#"	 !3 ! ! )))"',"4"4&?6Kc;d;dkw #5 # # .f.QXabbbr3   Nc                   | j         rd S d| _         |                     d| j        t          j                                                  | _        |                     d| j        t          j                                                  | _        t          | dd           ]t          j
        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           Pt          j
        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j
        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j
        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | d	d           Pt          j
        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | d
d           [t          j
        | j        j                  5  | j                            | j        j        g           d d d            n# 1 swxY w Y   t          | dd           ^t          j
        | j        j                  5  | j                            | j        j        g           d d d            d S # 1 swxY w Y   d S d S )NTq_biasr   v_biasr>  r%   rB  rC  rD  rF  rH  )rE   r   rL  r   initializersZerosrQ  rR  rF   rG   rH   r>  r$   rI   r    rA   r%   rB  rC  rD  rF  rH  rJ   s     r2   rI   z(TFDebertaDisentangledSelfAttention.builda  s   : 	F
oo$"45CUC[C[C]C] & 
 
 oo$"45CUC[C[C]C] & 
 
 4D))5t|011 J J""D$0G#HIIIJ J J J J J J J J J J J J J J4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )4+T22>t49:: 2 2%++D1112 2 2 2 2 2 2 2 2 2 2 2 2 2 24,d33?t5:;; 3 3&,,T2223 3 3 3 3 3 3 3 3 3 3 3 3 3 34--9t/455 - - &&t,,,- - - - - - - - - - - - - - -4T**6t}122 ? ?##T[%<$=>>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ?4t,,8t344 A A%%t{'>&?@@@A A A A A A A A A A A A A A A A A A 98s~   <(C00C47C4*EEEF22F69F6,HHHI44I8;I8.&K  K$'K$&MMMtensorr\   r=   c                    t          |          d d         | j        dgz   }t          j        ||          }t          j        |g d          S )NrW   rU  r   r   r   r   r   r,  )r   rI  rG   r  r1  )r/   rU  r   s      r2   transpose_for_scoresz7TFDebertaDisentangledSelfAttention.transpose_for_scores  sT    6""3B3'4+CR*HH6777 |F6666r3   Fr9   r   r   r   r   r   r   r5   r4   r   c           	     
   |E|                      |          }t          j        |                     |          dd          \  }	}
}nd }t          j        t          j        | j         j        d                   | j        dz  d          }t          j        | j        d          }t          j	        d          D ]}t          j        | j        | j                  }t          j	        | j                  D ]$}|
                    |||dz  |z                      }%|
                    ||                                          }dgdz  } ||d         |d         |          } ||d         |d         |          } ||d	         |d	         |          }|                     |          }	|                     |          }
|                     |          }|	|                     | j        ddddf                   z   }	||                     | j        ddddf                   z   }d}dt          | j                  z   }t!          j        t%          |	          d         |z            }|	|z  }	t          j        |	t          j        |
g d
                    }| j        r0|                     ||          }|                     |	|
|||          }|||z   }| j        r?t          j        |                     t          j        |g d                    g d          }|                     ||          }|                     ||          }| j        r?t          j        |                     t          j        |g d                    g d          }t          j        ||          }t          j        |g d          }t%          |          }|dd         |d         |d         z  gz   }t          j        ||          }|r||fn|f}|S )a  
        Call the module

        Args:
            hidden_states (`tf.Tensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`tf.Tensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            return_att (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`tf.Tensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`tf.Tensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`tf.Tensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr   rW   )num_or_size_splitsrZ   c                f    t          j        || d          }||t          j        |          z  }|S )NT)transpose_b)rG   matmulr1  )wbr   outs       r2   linearz7TFDebertaDisentangledSelfAttention.call.<locals>.linear  s4    i1$777=2<??*C
r3   r   )r`   r   r   r   r   r   r   r   r7   )r   r   r   r   )r   r   r   r   rX  r   )r>  rG   splitrY  r1  r   rI  TensorArrayr`   r   writeconcatrQ  rR  r   rM  r   r   r   r^  r   rD  disentangled_att_biasrA  rB  rN  r%   rC  r  )r/   r9   r   r   r   r   r   r4   qpr!  r%  value_layerrb  wsqkvwkqkvw_insider   qkvbr   r  rel_attscale_factorrs   attention_scoresattention_probscontext_layercontext_layer_shapenew_context_layer_shaper   s                                 r2   r<   z'TFDebertaDisentangledSelfAttention.call  s7   N m,,B24())"--!"3 3 3/KKK
   T\0344IadeIelm  B >
;;;DXa[[ ; ; n4:DD\]]]$":;; F FA"-"3"3Ar!a%!)}"E"EKKzz![%7%7%9%9::6A:DtAwQ66AtAwQ77AtAwQ77A33A66K11!44I33A66K!D$=$=dk$PTVWVWVW->X$Y$YY!D$=$=dk$PTVWVWVW->X$Y$YY3t0111	*[11"5DEE!E)9[",y,,,2W2WXX" 	u!--nx-PPN00iWegsttG/'9 	!|%%bl3C\\\&R&RSSUaUaUa    ,,'7HH,,,JJ 	 l&&r|O\\\'R'RSSUaUaUa O 	/;??]LLLAA(77
 #6crc":>QRT>UXklnXo>o=p"p
=2IJJ6G]=/22mM]r3   c           
        |8t          |          d         }t          |t          |          d                   }t          |          }t          |          dk    r)t          j        t          j        |d          d          }n[t          |          dk    rt          j        |d          }n2t          |          dk    rt          dt          |                     t          j        t          j        t          j        t          |          d         t          |          d                   | j	                  t          j
                  }t          j        || j	        |z
  | j	        |z   d d f         d          }d}	d| j        v r|                     |          }
|                     |
          }
t          j        |t          j        |
g d	                    }t          j        ||z   d|dz  dz
            }t#          |t%          |||          d
          }|	|z  }	d| j        v r|                     |          }|                     |          }|t          j                            t          j        t          |          d
         |z  | j                            z  }t          |          d         t          |          d         k    r7t          t          |          d         t          |          d                   }n|}t          j        | |z   d|dz  dz
            }t          j        |t          j        |g d	                    }t          j        t#          |t/          |||          d
          g d	          }t          |          d         t          |          d         k    rEt          j        |d d d d d d df         d
          }t#          |t1          |||          d          }|	|z  }	|	S )Nr   r   r   r   r      z2Relative position ids must be of dim 2 or 3 or 4. rE  rc  rW   rG  r_   )r   r   r   rG   r   rJ  rc   minimummaximumr   r  rM  rF  rY  r^  r1  clip_by_valuer:  r#  rH  r   r   rf   r&  r*  )r/   r!  r%  r   r   rq  r   shape_list_posatt_spanscorepos_key_layerc2p_attr   pos_query_layerr_posp2c_posr)  r(  s                     r2   rh  z8TFDebertaDisentangledSelfAttention.disentangled_att_bias  s   ;''+A21j6K6KB6OPPL#L11~!##>".q*I*I1MMLL  A%%>,::LL  A%%gRUVdReRegghhh7J
:k2226
98M8Mb8QRRTXTo  H	
 
 46ADD_bjDjjlmlmlmmnpq
 
  D%%% MM.99M 55mDDMiR\--V-VWWG&|h'>8a<RSCSTTG"7,>wUa,b,bdfggGWE D%%%"oon==O"77HHOrw||
?33B7,FdN`aaa    O +&&r*j.C.CB.GGG/
90E0Eb0I:V_K`K`acKdee$&v'8!X\A=MNNGi	2<+V+VWWGlW&8+y&Y&Y[]^^`l`l`l G +&&r*j.C.CB.GGGN<111aaa
+CRHH	&w0B9gW`0a0aceffWEr3   rL   r@   )rU  r\   r=   r\   r   r   )
rO   rP   rQ   rj   r(   rI   rY  r<   rh  rS   rT   s   @r2   r   r     s         7 7 7 7 7 7rA A A A@7 7 7 7 *.)-+/"'m m m m m^7 7 7 7 7 7 7r3   r   c                  @     e Zd ZdZ fdZddZ	 	 	 	 	 	 dddZ xZS )TFDebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t                      j        di | || _        t          |d|j                  | _        |j        | _        |j        | _        t          |dd          | _        |j        | _        | j        |j        k    r?t          j
                            |j        t          |j                  dd          | _        t          j
                            |j        d          | _        t#          |j        d	
          | _        d S )Nembedding_sizeposition_biased_inputT
embed_projFr?  r   r   r%   r#   r&   )r'   r(   r    rF   rA   r  r   r  r   r   r)   r*   r   r  r   r   r   r,   r   r%   r.   s      r2   r(   zTFDebertaEmbeddings.__init__5  s    ""6"""%f.>@RSS!-'-'E$%,V5Ld%S%S"!'!9&"444#l00"#263K#L#L!	 1  DO 88AV]h8ii-f.HyYYYr3   Nc                h   t          j        d          5  |                     d| j        j        | j        gt          | j                            | _        d d d            n# 1 swxY w Y   t          j        d          5  | j        j	        dk    rA|                     d| j        j	        | j        gt          | j                            | _
        nd | _
        d d d            n# 1 swxY w Y   t          j        d          5  | j        r<|                     d| j        | j        gt          | j                            | _        nd | _        d d d            n# 1 swxY w Y   | j        rd S d| _        t!          | d	d           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t!          | d
d           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t!          | dd           [t          j        | j        j                  5  | j                            d d | j        g           d d d            d S # 1 swxY w Y   d S d S )Nword_embeddingsr   r   token_type_embeddingsr   
embeddingsposition_embeddingsTr   r%   r  )rG   rH   r   r    
vocab_sizer  r   r   r   type_vocab_sizer  r  r   rA   r  rE   rF   r   r$   rI   r%   r  rJ   s     r2   rI   zTFDebertaEmbeddings.buildH  s>   ],-- 	 	//{-t/BC+D,BCC *  DK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ]233 	2 	2{*Q..-1__%;68KL /0F G G .= . .** .2*	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 ]011 	0 	0) 0+/??%79IJ /0F G G ,; , ,(( ,0(	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 : 	F
4d++7t~233 L L$$dD$+2I%JKKKL L L L L L L L L L L L L L L4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )4t,,8t344 I I%%tT43F&GHHHI I I I I I I I I I I I I I I I I I 98sn   AA""A&)A&AC&&C*-C*AEE #E &(GG!GH;;H?H?5#J%%J),J)F	input_idsr   position_idstoken_type_idsinputs_embedsr]   r4   r5   r=   r\   c                $   ||t          d          |5t          || j        j                   t	          j        | j        |          }t          |          dd         }|t	          j        |d          }|0t	          j	        t	          j
        d|d                   d          }|}| j        r t	          j        | j        |          }	||	z  }| j        j        dk    r t	          j        | j        |          }
||
z  }| j        | j        k    r|                     |          }|                     |          }|t'          t          |                    t'          t          |                    k    ryt'          t          |                    d	k    r*t	          j        t	          j        |d
          d
          }t	          j        t	          j	        |d          | j                  }||z  }|                     ||          }|S )z
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        Nz5Need to provide either `input_ids` or `input_embeds`.)paramsr3  rW   r   dimsvalue)startlimitr  rx  r   r   r_   r7   )rJ  r   r    r  rG   r2  r   r   fillr   r   r  r  r  r  r  rA   r  r   r   r   rc   rf   r%   )r/   r  r  r  r  r]   r4   rK   final_embeddingsposition_embedstoken_type_embedss              r2   r<   zTFDebertaEmbeddings.callq  s    !6TUUU *9dk6LMMMIT[)LLLM //4!W+Q???N>"(+b/*R*R*RYZ[[[L(% 	0 it/GQ]^^^O/;&** "	1KUc d d d 11$"222#/?@@>>*:;;:d##$$J7G,H,H(I(IIIz$''((A--:bjA&>&>&>QGGGDwr~d;;;4CUVVV/$6<<(88<LLr3   r@   )NNNNNF)r  r   r  r   r  r   r  r   r]   r   r4   r5   r=   r\   r   rT   s   @r2   r  r  2  s        QQZ Z Z Z Z&'I 'I 'I 'IV '+)-+/*.!%5  5  5  5  5  5  5  5  5 r3   r  c                  0     e Zd Zd
 fdZddZdd	Z xZS ) TFDebertaPredictionHeadTransformr    r   c                    t                      j        di | t          |d|j                  | _        t
          j                            | j        t          |j	                  d          | _
        t          |j        t                    rt          |j                  | _        n|j        | _        t
          j                            |j        d          | _        || _        d S )Nr  r"   r   r   r   r&   )r'   r(   rF   rA   r  r   r)   r*   r   r   r"   r   r   r   r   transform_act_fnr   r   r   r    r.   s      r2   r(   z)TFDebertaPredictionHeadTransform.__init__  s    ""6"""%f.>@RSS\''%.v/GHH ( 
 

 f'-- 	6$5f6G$H$HD!!$*$5D!88AV]h8iir3   r9   r\   r=   c                    |                      |          }|                     |          }|                     |          }|S r   )r"   r  r   r   s     r2   r<   z%TFDebertaPredictionHeadTransform.call  s?    

-
88--m<<}55r3   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           [t          j        | j	        j                  5  | j	                            d d | j
        g           d d d            d S # 1 swxY w Y   d S d S )NTr"   r   )rE   rF   rG   rH   r"   r$   rI   r    rA   r   r  rJ   s     r2   rI   z&TFDebertaPredictionHeadTransform.build  s   : 	F
4$''3tz// H H
  $dk.E!FGGGH H H H H H H H H H H H H H H4d++7t~233 H H$$dD$2E%FGGGH H H H H H H H H H H H H H H H H H 87s$    (A44A8;A8.#CC"%C"rL   r   r@   r   rT   s   @r2   r  r    sm             $   	H 	H 	H 	H 	H 	H 	H 	Hr3   r  c                  P     e Zd Zd fdZddZdd	ZddZddZddZddZ	 xZ
S )TFDebertaLMPredictionHeadr    r   input_embeddingskeras.layers.Layerc                     t                      j        di | || _        t          |d|j                  | _        t          |d          | _        || _        d S )Nr  	transformr#   r&   )	r'   r(   r    rF   rA   r  r  r  r  r/   r    r  r0   r1   s       r2   r(   z"TFDebertaLMPredictionHead.__init__  sc    ""6"""%f.>@RSS9&{SSS !1r3   Nc                @   |                      | j        j        fddd          | _        | j        rd S d| _        t          | dd           St          j        | j        j	                  5  | j        
                    d            d d d            d S # 1 swxY w Y   d S d S )NzerosTr   )r   r   	trainabler$   r  )r   r    r  r   rE   rF   rG   rH   r  r$   rI   rJ   s     r2   rI   zTFDebertaLMPredictionHead.build  s    OO4;+A*CQXdhouOvv	: 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + + + + + 87s   )BBBr=   c                    | j         S r@   )r  rB   s    r2   get_output_embeddingsz/TFDebertaLMPredictionHead.get_output_embeddings  s    $$r3   r  tf.Variablec                \    || j         _        t          |          d         | j         _        d S Nr   )r  r   r   r  r/   r  s     r2   set_output_embeddingsz/TFDebertaLMPredictionHead.set_output_embeddings  s+    ',$+5e+<+<Q+?(((r3   dict[str, tf.Variable]c                    d| j         iS )Nr   )r   rB   s    r2   get_biasz"TFDebertaLMPredictionHead.get_bias  s    	""r3   c                j    |d         | _         t          |d                   d         | j        _        d S )Nr   r   )r   r   r    r  r  s     r2   set_biasz"TFDebertaLMPredictionHead.set_bias  s.    &M	!+E&M!:!:1!=r3   r9   r\   c                j   |                      |          }t          |          d         }t          j        |d| j        g          }t          j        || j        j        d          }t          j        |d|| j        j	        g          }t          j
                            || j                  }|S )Nr   r   rW   rW  T)ar`  r]  )r  r   )r  r   rG   r  r  r^  r  r   r    r  nnbias_addr   )r/   r9   
seq_lengths      r2   r<   zTFDebertaLMPredictionHead.call  s    ]CC..q1

-DDW?XYYY	MT5J5Q_cddd
-JPTP[Pf?ghhh]KKr3   r    r   r  r  r@   r=   r  r  r  )r=   r  r   )rO   rP   rQ   r(   rI   r  r  r  r  r<   rS   rT   s   @r2   r  r    s        
1 
1 
1 
1 
1 
1+ + + +% % % %@ @ @ @# # # #> > > >       r3   r  c                  0     e Zd Zd fdZdd	ZddZ xZS )TFDebertaOnlyMLMHeadr    r   r  r  c                h     t                      j        di | t          ||d          | _        d S )Npredictionsr#   r&   )r'   r(   r  r  r  s       r2   r(   zTFDebertaOnlyMLMHead.__init__   s?    ""6"""4V=MTabbbr3   sequence_outputr\   r=   c                2    |                      |          }|S )Nr   )r  )r/   r  prediction_scoress      r2   r<   zTFDebertaOnlyMLMHead.call  s     ,,?,KK  r3   Nc                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr  )rE   rF   rG   rH   r  r$   rI   rJ   s     r2   rI   zTFDebertaOnlyMLMHead.build	  s    : 	F
4--9t/455 - - &&t,,,- - - - - - - - - - - - - - - - - - :9    A((A,/A,r  )r  r\   r=   r\   r@   r   rT   s   @r2   r  r    sk        c c c c c c! ! ! !
- - - - - - - -r3   r  c                  n     e Zd ZeZd fdZddZdd	Zd
 Ze		 	 	 	 	 	 	 	 	 d d!d            Z
d"dZ xZS )#TFDebertaMainLayerr    r   c                     t                      j        di | || _        t          |d          | _        t          |d          | _        d S )Nr  r#   encoderr&   )r'   r(   r    r  r  r   r  r.   s      r2   r(   zTFDebertaMainLayer.__init__  sS    ""6"""-f<HHH'Y???r3   r=   r  c                    | j         S r@   )r  rB   s    r2   get_input_embeddingsz'TFDebertaMainLayer.get_input_embeddings  s
    r3   r  r  c                \    || j         _        t          |          d         | j         _        d S r  )r  r   r   r  r  s     r2   set_input_embeddingsz'TFDebertaMainLayer.set_input_embeddings!  s)    !&%/%6%6q%9"""r3   c                    t           )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        )NotImplementedError)r/   heads_to_prunes     r2   _prune_headszTFDebertaMainLayer._prune_heads%  s
    
 "!r3   NFr  TFModelInputType | Noner   np.ndarray | tf.Tensor | Noner  r  r  r   bool | Noner  r  r4   r5   r  c
                   ||t          d          |t          |          }
n)|t          |          d d         }
nt          d          |t          j        |
d          }|t          j        |
d          }|                     ||||||	          }|                     ||||||	          }|d         }|s|f|dd          z   S t          ||j        |j        	          S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timerW   z5You have to specify either input_ids or inputs_embedsr   r  r   )r  r  r  r  r]   r4   )r9   r   r   r  r  r4   r  )	rJ  r   rG   r  r  r  r   r9   r
  )r/   r  r   r  r  r  r   r  r  r4   rK   embedding_outputencoder_outputsr  s                 r2   r<   zTFDebertaMainLayer.call,  s<     ]%>cddd"$Y//KK&$]33CRC8KKTUUU!W+Q???N!W+Q???N??%)' + 
 
 ,,*)/!5# ' 
 
 *!, 	<#%(;;; -)7&1
 
 
 	
r3   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr  r  )rE   rF   rG   rH   r  r$   rI   r  rJ   s     r2   rI   zTFDebertaMainLayer.builde  sS   : 	F
4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , ,4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) 65r   rL   r  r  	NNNNNNNNF)r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r4   r5   r=   r  r@   )rO   rP   rQ   r   config_classr(   r  r  r  r   r<   rI   rS   rT   s   @r2   r  r    s         L@ @ @ @ @ @   : : : :" " "  .28<8<6:7;)-,0#'6
 6
 6
 6
 ]6
p	) 	) 	) 	) 	) 	) 	) 	)r3   r  c                      e Zd ZdZeZdZdS )TFDebertaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    debertaN)rO   rP   rQ   rj   r   r  base_model_prefixr&   r3   r2   r  r  q  s'         
 !L!r3   r  a9
  
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://huggingface.co/papers/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a	  
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput``] instead of a plain tuple.
zaThe bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zd fdZe ee                    d                     ee	e
e          	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS )TFDebertaModelr    r   c                n     t                      j        |g|R i | t          |d          | _        d S )Nr  r#   )r'   r(   r  r  r/   r    r[   r0   r1   s       r2   r(   zTFDebertaModel.__init__  sB    3&333F333)&yAAAr3   batch_size, sequence_length
checkpointoutput_typer  NFr  r  r   r  r  r  r  r   r  r  r  r4   r=   r  c
                B    |                      |||||||||		  	        }
|
S )N	r  r   r  r  r  r   r  r  r4   )r  )r/   r  r   r  r  r  r   r  r  r4   r   s              r2   r<   zTFDebertaModel.call  s>    & ,,))%'/!5#  

 

 r3   c                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr  )rE   rF   rG   rH   r  r$   rI   rJ   s     r2   rI   zTFDebertaModel.build  s    : 	F
4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) 65r  rL   r  )r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r4   r  r=   r  r@   )rO   rP   rQ   r(   r   r   DEBERTA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr<   rI   rS   rT   s   @r2   r  r    s        
B B B B B B
 **+C+J+JKh+i+ijj&%$   .28<8<6:7;)-,0#' %     kj ]4) ) ) ) ) ) ) )r3   r  z5DeBERTa Model with a `language modeling` head on top.c                       e Zd Zd fdZddZe ee                    d                     e	e
ee          	 	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS ) TFDebertaForMaskedLMr    r   c                     t                      j        |g|R i | |j        rt                              d           t          |d          | _        t          || j        j        d          | _	        d S )NzpIf you want to use `TFDebertaForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.r  r#   cls)r  r$   )
r'   r(   
is_decoderloggerwarningr  r  r  r  mlmr  s       r2   r(   zTFDebertaForMaskedLM.__init__  s    3&333F333 	NN1  
 *&yAAA'AX_deeer3   r=   r  c                    | j         j        S r@   )r  r  rB   s    r2   get_lm_headz TFDebertaForMaskedLM.get_lm_head  s    x##r3   r  r  NFr  r  r   r  r  r  r  r   r  r  r  labelsr4   #TFMaskedLMOutput | tuple[tf.Tensor]c                    |                      |||||||||
	  	        }|d         }|                     ||
          }|	dn|                     |	|          }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )a  
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        r  r   )r  r4   Nr  logitsr   lossr  r9   r
  )r  r  hf_compute_lossr   r9   r
  )r/   r  r   r  r  r  r   r  r  r  r4   r   r  r  r  ri   s                   r2   r<   zTFDebertaForMaskedLM.call  s    4 ,,))%'/!5#  

 

 "!* HH_xHXX~tt4+?+?vVg+?+h+h 	F')GABBK7F)-)9TGf$$vE$!/)	
 
 
 	
r3   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr  r  )rE   rF   rG   rH   r  r$   rI   r  rJ   s     r2   rI   zTFDebertaForMaskedLM.buildJ  sP   : 	F
4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )4%%1tx}-- % %t$$$% % % % % % % % % % % % % % % % % % 21r   rL   r  
NNNNNNNNNF)r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r4   r  r=   r   r@   )rO   rP   rQ   r(   r  r   r   r  r  r   r  r   r  r<   rI   rS   rT   s   @r2   r  r    s        
f 
f 
f 
f 
f 
f$ $ $ $ **+C+J+JKh+i+ijj&$$   .28<8<6:7;)-,0#'04 %+
 +
 +
 +
  kj ]+
Z	% 	% 	% 	% 	% 	% 	% 	%r3   r  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd Zd fdZe ee                    d                     ee	e
e          	 	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS )"TFDebertaForSequenceClassificationr    r   c                    t                      j        |g|R i | |j        | _        t          |d          | _        t          |d          | _        t          |dd           }|| j        j	        n|}t          |d          | _        t          j                            |j        t          |j                  d          | _        | j        j        | _        d S )Nr  r#   poolercls_dropout
classifierr   )r'   r(   
num_labelsr  r  r   r  rF   r    r   r,   r%   r   r)   r*   r   r   r  rC   )r/   r    r[   r0   drop_outr1   s        r2   r(   z+TFDebertaForSequenceClassification.__init__^  s    3&333F333 +)&yAAA,V(CCC6=$776>6F4;22H-h]KKK,,,#.v/GHH - 
 

 +0r3   r  r  NFr  r  r   r  r  r  r  r   r  r  r  r  r4   r=   -TFSequenceClassifierOutput | tuple[tf.Tensor]c                x   |                      |||||||||
	  	        }|d         }|                     ||
          }|                     ||
          }|                     |          }|	dn|                     |	|          }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )a  
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r  r   r7   Nr  r   r  )r  r  r%   r  r  r
   r9   r
  )r/   r  r   r  r  r  r   r  r  r  r4   r   r  r;   r  r  ri   s                    r2   r<   z'TFDebertaForSequenceClassification.callp  s    4 ,,))%'/!5#  

 

 "!*OhGG]XFF//~tt4+?+?vV\+?+]+] 	FY,F)-)9TGf$$vE)!/)	
 
 
 	
r3   c                D   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           [t          j        | j	        j                  5  | j	                            d d | j
        g           d d d            d S # 1 swxY w Y   d S d S )NTr  r  r%   r  )rE   rF   rG   rH   r  r$   rI   r  r%   r  rC   rJ   s     r2   rI   z(TFDebertaForSequenceClassification.build  s   : 	F
4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )44((4t{/00 ( (!!$'''( ( ( ( ( ( ( ( ( ( ( ( ( ( (4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )4t,,8t344 E E%%tT4?&CDDDE E E E E E E E E E E E E E E E E E 98sH    A''A+.A+!CCCD))D-0D-##FFFrL   r  )r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r4   r  r=   r  r@   )rO   rP   rQ   r(   r   r   r  r  r   r  r
   r  r<   rI   rS   rT   s   @r2   r
  r
  V  s        1 1 1 1 1 1$ **+C+J+JKh+i+ijj&.$   .28<8<6:7;)-,0#'04 %.
 .
 .
 .
  kj ].
`E E E E E E E Er3   r
  z
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd Zd fdZe ee                    d                     ee	e
e          	 	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS )TFDebertaForTokenClassificationr    r   c                d    t                      j        |g|R i | |j        | _        t          |d          | _        t
          j                            |j                  | _	        t
          j        
                    |j        t          |j                  d          | _        || _        d S )Nr  r#   )rater  r   )r'   r(   r  r  r  r   r)   Dropoutr   r%   r*   r   r   r  r    r  s       r2   r(   z(TFDebertaForTokenClassification.__init__  s    3&333F333 +)&yAAA|++1K+LL,,,#H`8a8aht - 
 
 r3   r  r  NFr  r  r   r  r  r  r  r   r  r  r  r  r4   r=   *TFTokenClassifierOutput | tuple[tf.Tensor]c                L   |                      |||||||||
	  	        }|d         }|                     ||
          }|                     |          }|	dn|                     |	|          }|s|f|dd         z   }||f|z   n|S t	          |||j        |j                  S )	z
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        r  r   r7   r   Nr  r   r  )r  r%   r  r  r   r9   r
  )r/   r  r   r  r  r  r   r  r  r  r4   r   r  r  r  ri   s                   r2   r<   z$TFDebertaForTokenClassification.call  s    0 ,,))%'/!5#  

 

 "!*,,,JJ88~tt4+?+?vV\+?+]+] 	FY,F)-)9TGf$$vE&!/)	
 
 
 	
r3   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           `t          j        | j        j                  5  | j                            d d | j        j	        g           d d d            d S # 1 swxY w Y   d S d S )NTr  r  )
rE   rF   rG   rH   r  r$   rI   r  r    rA   rJ   s     r2   rI   z%TFDebertaForTokenClassification.build   s   : 	F
4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )4t,,8t344 M M%%tT4;3J&KLLLM M M M M M M M M M M M M M M M M M 98$    A''A+.A+!(CCCrL   r  )r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r4   r  r=   r  r@   )rO   rP   rQ   r(   r   r   r  r  r   r  r   r  r<   rI   rS   rT   s   @r2   r  r    s        
 
 
 
 
 
 **+C+J+JKh+i+ijj&+$   .28<8<6:7;)-,0#'04 %*
 *
 *
 *
  kj ]*
X	M 	M 	M 	M 	M 	M 	M 	Mr3   r  z
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Zd fdZe ee                    d                     ee	e
e          	 	 	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS )TFDebertaForQuestionAnsweringr    r   c                    t                      j        |g|R i | |j        | _        t          |d          | _        t
          j                            |j        t          |j	                  d          | _
        || _        d S )Nr  r#   
qa_outputsr   )r'   r(   r  r  r  r   r)   r*   r   r   r!  r    r  s       r2   r(   z&TFDebertaForQuestionAnswering.__init__  s    3&333F333 +)&yAAA,,,#H`8a8aht - 
 
 r3   r  r  NFr  r  r   r  r  r  r  r   r  r  r  start_positionsend_positionsr4   r=   1TFQuestionAnsweringModelOutput | tuple[tf.Tensor]c                   |                      |||||||||	  	        }|d         }|                     |          }t          j        |dd          \  }}t          j        |d          }t          j        |d          }d}|	$|
"d	|	i}|
|d
<   |                     |||f          }|s||f|dd         z   }||f|z   n|S t          ||||j        |j                  S )a  
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        r  r   r   r   rW   )r  r[  rZ   )inputrZ   Nstart_positionend_positionr  )r  start_logits
end_logitsr9   r
  )	r  r!  rG   rd  r   r  r	   r9   r
  )r/   r  r   r  r  r  r   r  r  r"  r#  r4   r   r  r  r)  r*  r  r  ri   s                       r2   r<   z"TFDebertaForQuestionAnswering.call  s=   > ,,))%'/!5#  

 

 "!*88#%8&QUW#X#X#X jz2>>>Zjr:::
&=+D&8F%2F>"''v|Z>X'YYD 	F"J/'!""+=F)-)9TGf$$vE-%!!/)
 
 
 	
r3   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           `t          j        | j        j                  5  | j                            d d | j        j	        g           d d d            d S # 1 swxY w Y   d S d S )NTr  r!  )
rE   rF   rG   rH   r  r$   rI   r!  r    rA   rJ   s     r2   rI   z#TFDebertaForQuestionAnswering.builda  r  r  rL   )NNNNNNNNNNF)r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  r  r"  r  r#  r  r4   r  r=   r$  r@   )rO   rP   rQ   r(   r   r   r  r  r   r  r	   r  r<   rI   rS   rT   s   @r2   r  r    s        	 	 	 	 	 	 **+C+J+JKh+i+ijj&2$   .28<8<6:7;)-,0#'9=7; %9
 9
 9
 9
  kj ]9
v	M 	M 	M 	M 	M 	M 	M 	Mr3   r  )r  r  r
  r  r  r  )Lrj   
__future__r   r   collections.abcr   numpynp
tensorflowrG   activations_tfr   modeling_tf_outputsr   r   r	   r
   r   modeling_tf_utilsr   r   r   r   r   r   r   r   r   tf_utilsr   r   r   utilsr   r   r   r   configuration_debertar   
get_loggerrO   r  r  r  r)   Layerr   rV   r,   r   r   r   r   r   r   r   r   r#  r&  r*  r:  r   r  r  r  r  r  r  DEBERTA_START_DOCSTRINGr  r  r  r
  r  r  __all__r&   r3   r2   <module>r;     s)     " " " " " "  $ $ $ $ $ $         / / / / / /             
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 S R R R R R R R R R u u u u u u u u u u u u 0 0 0 0 0 0 
	H	%	% ". ) ) ) ) )U\/ ) ) )@    *   ,% % % % %U\/ % % %P9 9 9 9 9+ 9 9 9() ) ) ) )%,, ) ) ):-. -. -. -. -.+ -. -. -.`H H H H HEL. H H H:) ) ) ) )el( ) ) )B0- 0- 0- 0- 0-U\' 0- 0- 0-fi
 i
 i
 i
 i
u|) i
 i
 i
X* * *0, , ,, , ,. . .
  0R R R R R); R R Rjt  t  t  t  t %,, t  t  t n#H #H #H #H #Hu|'9 #H #H #HL- - - - - 2 - - -`- - - - -5<- - - -([) [) [) [) [)+ [) [) [)|" " " " "0 " " "( T) X g -) -) -) -) -)- -) -)	 -)` QSjkkM% M% M% M% M%35Q M% M% lkM%`   YE YE YE YE YE)AC_ YE YE YEx   IM IM IM IM IM&>@Y IM IM IMX   WM WM WM WM WM$<>U WM WM WMt  r3   