
     `i/                    P   d Z ddlmZ ddlZddlZddlmZ ddl	m
Z
mZmZmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ ddlmZmZm Z  dd	l!m"Z"m#Z#m$Z$m%Z% d
dl&m'Z'  e%j(        e)          Z*dZ+dZ, G d dej-        j.                  Z/ G d dej-        j.                  Z0 G d dej-        j.                  Z1 G d dej-        j.                  Z2 G d dej-        j.                  Z3 G d dej-        j.                  Z4 G d dej-        j.                  Z5 G d dej-        j.                  Z6 G d dej-        j.                  Z7 G d  d!ej-        j.                  Z8e G d" d#ej-        j.                              Z9 G d$ d%e          Z:d&Z;d'Z< e#d(e;           G d) d*e:                      Z= G d+ d,ej-        j.                  Z> G d- d.ej-        j.                  Z? e#d/e;           G d0 d1e:e                      Z@ G d2 d3ej-        j.                  ZA e#d4e;           G d5 d6e:e                      ZB e#d7e;           G d8 d9e:e                      ZC e#d:e;           G d; d<e:e                      ZD e#d=e;           G d> d?e:e                      ZEg d@ZFdS )AzTF 2.0 ConvBERT model.    )annotationsN   )get_tf_activation)TFBaseModelOutputTFMaskedLMOutputTFMultipleChoiceModelOutputTFQuestionAnsweringModelOutputTFSequenceClassifierOutputTFTokenClassifierOutput)TFMaskedLanguageModelingLossTFModelInputTypeTFMultipleChoiceLossTFPreTrainedModelTFQuestionAnsweringLossTFSequenceClassificationLossTFSequenceSummaryTFTokenClassificationLossget_initializerkeraskeras_serializableunpack_inputs)check_embeddings_within_bounds
shape_liststable_softmax)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )ConvBertConfigzYituTech/conv-bert-baser    c                  B     e Zd ZdZd fdZddZ	 	 	 	 	 	 dddZ xZS )TFConvBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configr    c                :    t                      j        di | || _        |j        | _        |j        | _        |j        | _        t          j                            |j	        d          | _
        t          j                            |j                  | _        d S )N	LayerNormepsilonname)rate )super__init__r#   embedding_sizemax_position_embeddingsinitializer_ranger   layersLayerNormalizationlayer_norm_epsr%   Dropouthidden_dropout_probdropoutselfr#   kwargs	__class__s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/convbert/modeling_tf_convbert.pyr,   zTFConvBertEmbeddings.__init__A   s    ""6"""$3'-'E$!'!988AV]h8ii|++1K+LL    Nc                   t          j        d          5  |                     d| j        j        | j        gt          | j                            | _        d d d            n# 1 swxY w Y   t          j        d          5  |                     d| j        j	        | j        gt          | j                            | _
        d d d            n# 1 swxY w Y   t          j        d          5  |                     d| j        | j        gt          | j                            | _        d d d            n# 1 swxY w Y   | j        rd S d| _        t          | dd           `t          j        | j        j                  5  | j                            d d | j        j        g           d d d            d S # 1 swxY w Y   d S d S )	Nword_embeddingsweight)r(   shapeinitializertoken_type_embeddings
embeddingsposition_embeddingsTr%   )tf
name_scope
add_weightr#   
vocab_sizer-   r   r/   r>   type_vocab_sizerA   r.   rC   builtgetattrr%   r(   buildr7   input_shapes     r:   rK   zTFConvBertEmbeddings.buildK   s   ],-- 	 	//{-t/BC+D,BCC *  DK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ]233 	 	)-!{2D4GH+D,BCC *9 * *D&	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ]011 	 	'+!3T5HI+D,BCC (7 ( (D$	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 : 	F
4d++7t~233 O O$$dD$+2L%MNNNO O O O O O O O O O O O O O O O O O 87sI   AA""A&)A&ACCC-<D55D9<D9?(F44F8;F8r   F	input_idstf.Tensor | Noneposition_idstoken_type_idsinputs_embedstrainingboolreturn	tf.Tensorc                8   ||t          d          |5t          || j        j                   t	          j        | j        |          }t          |          dd         }|t	          j        |d          }|3t	          j	        t	          j
        ||d         |z             d	          }t	          j        | j        |          }t	          j        | j        |          }	||z   |	z   }
|                     |

          }
|                     |
|          }
|
S )z
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        Nz5Need to provide either `input_ids` or `input_embeds`.)paramsindicesr   )dimsvaluer   )startlimitaxis)inputs)ra   rS   )
ValueErrorr   r#   rG   rD   gatherr>   r   fillexpand_dimsrangerC   rA   r%   r5   )r7   rN   rP   rQ   rR   past_key_values_lengthrS   rM   position_embedstoken_type_embedsfinal_embeddingss              r:   callzTFConvBertEmbeddings.calli   s0    !6TUUU *9dk6LMMMIT[)LLLM //4!W+Q???N>5[^Nd=deeelm  L )4+C\ZZZIT-GQ_```(?:=NN>>1A>BB<</?(<SSr;   )r#   r    N)NNNNr   F)rN   rO   rP   rO   rQ   rO   rR   rO   rS   rT   rU   rV   )__name__
__module____qualname____doc__r,   rK   rk   __classcell__r9   s   @r:   r"   r"   >   s        QQM M M M M MO O O O@ '+)-+/*. &  &  &  &  &  &  &  &  & r;   r"   c                  4     e Zd Z fdZd ZddZddZ xZS )	TFConvBertSelfAttentionc           
     p    t                      j        di | |j        |j        z  dk    r t	          d|j         d|j         d          t          |j        |j        z            }|dk     r|j        | _        d}n|}|j        | _        || _        |j        | _        |j        | j        z  dk    rt	          d          |j        |j        z  | _        | j        | j        z  | _	        t          j                            | j	        t          |j                  d          | _        t          j                            | j	        t          |j                  d	          | _        t          j                            | j	        t          |j                  d
          | _        t          j                            | j	        | j        dd t          d| j        z            t          |j                  d          | _        t          j                            | j        | j        z  d dt          |j                            | _        t          j                            | j	        d dt          |j                            | _        t          j                            |j                  | _        || _        d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   z6hidden_size should be divisible by num_attention_headsquerykernel_initializerr(   keyr\   samekey_conv_attn_layer)padding
activationdepthwise_initializerpointwise_initializerr(   conv_kernel_layer)r~   r(   ry   conv_out_layerr*   )r+   r,   hidden_sizenum_attention_headsrb   int
head_ratioconv_kernel_sizeattention_head_sizeall_head_sizer   r0   Denser   r/   rw   rz   r\   SeparableConv1Dr|   r   r   r3   attention_probs_dropout_probr5   r#   )r7   r#   r8   new_num_attention_headsr   r9   s        r:   r,   z TFConvBertSelfAttention.__init__   s   ""6""" ::a??8F$6 8 8 48 8 8  
 #&f&@6CT&T"U"U"Q&&$8DO"#"9$/DO#6  & 7 88A==UVVV#)#59S#S !58PP\''?6C[3\3\cj ( 
 

 <%%?6C[3\3\ch & 
 
 \''?6C[3\3\cj ( 
 

 $)<#?#?!"1!d6K2K"L"L"1&2J"K"K& $@ $
 $
  "'!3!3$t'<<$.v/GHH	 "4 "
 "
 $l00!.v/GHH	 1 
 
 |++F,OPPr;   c                x    t          j        ||d| j        | j        f          }t          j        |g d          S )NrZ   r      r   r   perm)rD   reshaper   r   	transpose)r7   x
batch_sizes      r:   transpose_for_scoresz,TFConvBertSelfAttention.transpose_for_scores   s;    Jq:r4+CTE]^__|ALLL1111r;   Fc                    t          |          d                              |                               |          }                     |          }                     |          }                               }	                     |          }
t          j        |          }                     |          }t          j	        |d j
        dg          }t          |d          }t          j        ddgt           j
        dz
  dz            t           j
        dz
  dz            gddgg          }                     |          t          j	        d j        g          t          j        |d          t          j         fdt%           j
                  D             d          }t          j	        |d j         j
        g          t          j        |          t          j	        d j        g          t          j        |	|
d	          }t          j        t          |
          d         |j                  }|t          j                            |          z  }|||z   }t          |d          }                     ||
          }|||z  }t          j	        |d j         j        g          }t          j        |g d          }t          j        ||          }t          j        |g d          }t          j	        d j         j        g          }t          j        ||gd          }t          j	        |d j         j        z  f          }|r||fn|f}|S )Nr   rZ   r   r_   r   CONSTANTc           
     x    g | ]6}t          j        d |d gt                    d         j        g          7S )r   r   )rD   slicer   r   ).0ir   r   mixed_query_layerr7   s     r:   
<listcomp>z0TFConvBertSelfAttention.call.<locals>.<listcomp>   sW        !QZL]A^A^_`Aacgcu4vww  r;   T)transpose_brS   r   r   )r   rw   rz   r\   r|   r   rD   multiplyr   r   r   r   constantr   r   r   padstackrf   r   matmulcastdtypemathsqrtr5   r   r   concatr   )r7   hidden_statesattention_mask	head_maskoutput_attentionsrS   mixed_key_layermixed_value_layermixed_key_conv_attn_layerquery_layer	key_layerconv_attn_layerr   paddingsunfold_conv_out_layerattention_scoresdkattention_probsvalue_layercontext_layerconv_outoutputsr   r   r   s   `                     @@@r:   rk   zTFConvBertSelfAttention.call   s   ..q1
 JJ}55((=11 JJ}55$($<$<]$K$K!//0A:NN--ozJJ	+&?ARSS 22?CCJ'82t?TVW:XYY*+<1EEE;  d+a/1455sD<QTU<UYZ;Z7[7[\A	
 	
 ,,];;NZTEW4XYY*EE "      t455   !
 !
 !
 $9B@XZ^Zo;pqq>3DEENR9K4LMM 9
 
 
 WZ	**2.0@0FGG+bgll2.>.>>%/.@ ))9CCC ,,,JJ  -	9Oj
B0H$Jbc
 
 l;==	/;??]FFF:nz2t?WY]Yq.rss	=(";Q??
JDOd>P,PQ
 
 7H]=/22mM]r;   Nc                J   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j	        j                  5  | j	                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j
        j                  5  | j
                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           Xt          j        | j        j                  5  | j                            d d | j        g           d d d            n# 1 swxY w Y   t          | dd           `t          j        | j        j                  5  | j                            d d | j        j        g           d d d            d S # 1 swxY w Y   d S d S )NTrw   rz   r\   r|   r   r   )rI   rJ   rD   rE   rw   r(   rK   r#   r   rz   r\   r|   r   r   r   rL   s     r:   rK   zTFConvBertSelfAttention.build)  sN   : 	F
4$''3tz// H H
  $dk.E!FGGGH H H H H H H H H H H H H H H4%%1tx}-- F FdDK,CDEEEF F F F F F F F F F F F F F F4$''3tz// H H
  $dk.E!FGGGH H H H H H H H H H H H H H H4.55At7<== V V(..dDK<S/TUUUV V V V V V V V V V V V V V V4,d33?t5:;; O O&,,dD$:L-MNNNO O O O O O O O O O O O O O O4)400<t2788 Q Q#))4t{7N*OPPPQ Q Q Q Q Q Q Q Q Q Q Q Q Q Q Q Q Q =<sl    (A44A8;A8.(C""C&)C&(EEE
(F>>GG8#H''H+.H+!(JJJFrl   )rm   rn   ro   r,   r   rk   rK   rq   rr   s   @r:   rt   rt      s}        < < < < <|2 2 2
Q Q Q QfQ Q Q Q Q Q Q Qr;   rt   c                  .     e Zd Z fdZddZddZ xZS )TFConvBertSelfOutputc                l    t                      j        di | t          j                            |j        t          |j                  d          | _        t          j        	                    |j
        d          | _        t          j                            |j                  | _        || _        d S Ndenserx   r%   r&   r*   )r+   r,   r   r0   r   r   r   r/   r   r1   r2   r%   r3   r4   r5   r#   r6   s      r:   r,   zTFConvBertSelfOutput.__init__B  s    ""6"""\''?6C[3\3\cj ( 
 

 88AV]h8ii|++F,FGGr;   Fc                    |                      |          }|                     ||          }|                     ||z             }|S Nr   r   r5   r%   r7   r   input_tensorrS   s       r:   rk   zTFConvBertSelfOutput.callL  D    

=11]XFF}|'CDDr;   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           `t          j        | j	        j                  5  | j	                            d d | j        j        g           d d d            d S # 1 swxY w Y   d S d S NTr   r%   
rI   rJ   rD   rE   r   r(   rK   r#   r   r%   rL   s     r:   rK   zTFConvBertSelfOutput.buildS     : 	F
4$''3tz// H H
  $dk.E!FGGGH H H H H H H H H H H H H H H4d++7t~233 L L$$dD$+2I%JKKKL L L L L L L L L L L L L L L L L L 87$    (A44A8;A8.(C##C'*C'r   rl   rm   rn   ro   r,   rk   rK   rq   rr   s   @r:   r   r   A  sh               	L 	L 	L 	L 	L 	L 	L 	Lr;   r   c                  4     e Zd Z fdZd ZddZddZ xZS )	TFConvBertAttentionc                     t                      j        di | t          |d          | _        t	          |d          | _        d S )Nr7   r(   outputr*   )r+   r,   rt   self_attentionr   dense_outputr6   s      r:   r,   zTFConvBertAttention.__init__`  sP    ""6"""5f6JJJ0hGGGr;   c                    t           rl   NotImplementedError)r7   headss     r:   prune_headszTFConvBertAttention.prune_headsf  s    !!r;   Fc                    |                      |||||          }|                     |d         ||          }|f|dd          z   }|S Nr   r   r   )r   r   )	r7   r   r   r   r   rS   self_outputsattention_outputr   s	            r:   rk   zTFConvBertAttention.calli  sf    **.)5FQY + 
 
  ,,\!_lU],^^#%QRR(88r;   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr   r   )rI   rJ   rD   rE   r   r(   rK   r   rL   s     r:   rK   zTFConvBertAttention.buildr  sX   : 	F
4)400<t2788 0 0#))$///0 0 0 0 0 0 0 0 0 0 0 0 0 0 04..:t0566 . .!''---. . . . . . . . . . . . . . . . . . ;:$    A''A+.A+!C		CCr   rl   )rm   rn   ro   r,   r   rk   rK   rq   rr   s   @r:   r   r   _  st        H H H H H" " "   	. 	. 	. 	. 	. 	. 	. 	.r;   r   c                  0     e Zd Z fdZd fd	Zd Z xZS )GroupedLinearLayerc                     t                      j        di | || _        || _        || _        || _        | j        | j        z  | _        | j        | j        z  | _        d S Nr*   )r+   r,   
input_sizeoutput_size
num_groupsry   group_in_dimgroup_out_dim)r7   r   r   r   ry   r8   r9   s         r:   r,   zGroupedLinearLayer.__init__  sg    ""6"""$&$"4 Ot>!-@r;   Nc                   |                      d| j        | j        | j        g| j        d          | _        |                      d| j        g| j        | j        d          | _        t                      
                    |           d S )NkernelT)r?   r@   	trainablebias)r?   r@   r   r   )rF   r   r   r   ry   r   r   r   r   r+   rK   r7   rM   r9   s     r:   rK   zGroupedLinearLayer.build  s    oo%t'8$/J/	 & 
 
 OO4+,$:QY]Ycos $ 
 
	 	k"""""r;   c                   t          |          d         }t          j        t          j        |d| j        | j        g          g d          }t          j        |t          j        | j        g d                    }t          j        |g d          }t          j        ||d| j        g          }t          j	        
                    || j                  }|S )Nr   rZ   )r   r   r   )r   r   r   r\   r   )r   rD   r   r   r   r   r   r   r   nnbias_addr   )r7   r   r   r   s       r:   rk   zGroupedLinearLayer.call  s    ..q1
LMBIZ3[\\^g^g^ghhIadk999==>>LIII&&Jq:r4+;<==ENNN33r;   rl   )rm   rn   ro   r,   rK   rk   rq   rr   s   @r:   r   r   ~  sj        A A A A A# # # # # #      r;   r   c                  ,     e Zd Z fdZd ZddZ xZS )TFConvBertIntermediatec                    t                      j        di | |j        dk    r?t          j                            |j        t          |j                  d          | _	        n:t          |j        |j        |j        t          |j                  d          | _	        t          |j        t                    rt          |j                  | _        n|j        | _        || _        d S )Nr   r   rx   r   ry   r(   r*   )r+   r,   r   r   r0   r   intermediate_sizer   r/   r   r   r   
isinstance
hidden_actstrr   intermediate_act_fnr#   r6   s      r:   r,   zTFConvBertIntermediate.__init__  s    ""6"""!!++(_VMe=f=fmt ,  DJJ ,"(!,#263K#L#L  DJ f'-- 	9'89J'K'KD$$'-'8D$r;   c                Z    |                      |          }|                     |          }|S rl   )r   r   r7   r   s     r:   rk   zTFConvBertIntermediate.call  s,    

=1100??r;   Nc                   | j         rd S d| _         t          | dd           `t          j        | j        j                  5  | j                            d d | j        j        g           d d d            d S # 1 swxY w Y   d S d S )NTr   )	rI   rJ   rD   rE   r   r(   rK   r#   r   rL   s     r:   rK   zTFConvBertIntermediate.build  s    : 	F
4$''3tz// H H
  $dk.E!FGGGH H H H H H H H H H H H H H H H H H 43s    (A55A9<A9rl   r   rr   s   @r:   r   r     sc            *  H H H H H H H Hr;   r   c                  .     e Zd Z fdZddZddZ xZS )TFConvBertOutputc                    t                      j        di | |j        dk    r?t          j                            |j        t          |j                  d          | _	        n:t          |j        |j        |j        t          |j                  d          | _	        t          j                            |j        d          | _        t          j                            |j                  | _        || _        d S )Nr   r   rx   r   r%   r&   r*   )r+   r,   r   r   r0   r   r   r   r/   r   r   r   r1   r2   r%   r3   r4   r5   r#   r6   s      r:   r,   zTFConvBertOutput.__init__  s    ""6"""!!++"vG_7`7`gn ,  DJJ ,("!,#263K#L#L  DJ 88AV]h8ii|++F,FGGr;   Fc                    |                      |          }|                     ||          }|                     ||z             }|S r   r   r   s       r:   rk   zTFConvBertOutput.call  r   r;   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           `t          j        | j	        j                  5  | j	                            d d | j        j
        g           d d d            d S # 1 swxY w Y   d S d S NTr%   r   )rI   rJ   rD   rE   r%   r(   rK   r#   r   r   r   rL   s     r:   rK   zTFConvBertOutput.build  s   : 	F
4d++7t~233 L L$$dD$+2I%JKKKL L L L L L L L L L L L L L L4$''3tz// N N
  $dk.K!LMMMN N N N N N N N N N N N N N N N N N 43r   r   rl   r   rr   s   @r:   r  r    sh            &   	N 	N 	N 	N 	N 	N 	N 	Nr;   r  c                  .     e Zd Z fdZddZddZ xZS )TFConvBertLayerc                     t                      j        di | t          |d          | _        t	          |d          | _        t          |d          | _        d S )N	attentionr   intermediater   r*   )r+   r,   r   r  r   r  r  bert_outputr6   s      r:   r,   zTFConvBertLayer.__init__  sd    ""6""",V+FFF26OOO+FBBBr;   Fc                    |                      |||||          }|d         }|                     |          }|                     |||          }	|	f|dd          z   }
|
S r   )r  r  r  )r7   r   r   r   r   rS   attention_outputsr   intermediate_outputlayer_outputr   s              r:   rk   zTFConvBertLayer.call  s     NN>96GRZ + 
 
 -Q/"//0@AA''(;=MX`'aa/$5abb$99r;   Nc                r   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr  r  r  )	rI   rJ   rD   rE   r  r(   rK   r  r  rL   s     r:   rK   zTFConvBertLayer.build  s   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4..:t0566 . .!''---. . . . . . . . . . . . . . .4--9t/455 - - &&t,,,- - - - - - - - - - - - - - - - - - :96    A''A+.A+!CCCD**D.1D.r   rl   r   rr   s   @r:   r	  r	    se        C C C C C	 	 	 	- - - - - - - -r;   r	  c                  0     e Zd Z fdZ	 ddZddZ xZS )TFConvBertEncoderc                     t                      j        di | fdt          j                  D             | _        d S )Nc                8    g | ]}t          d |           S )zlayer_._r   )r	  )r   r   r#   s     r:   r   z.TFConvBertEncoder.__init__.<locals>.<listcomp>  s,    lllqof>a>>BBBlllr;   r*   )r+   r,   rf   num_hidden_layerslayerr6   s    ` r:   r,   zTFConvBertEncoder.__init__  sL    ""6"""llllERXRjLkLklll


r;   Fc                &   |rdnd }|rdnd }	t          | j                  D ]9\  }
}|r||fz   } |||||
         ||          }|d         }|r|	|d         fz   }	:|r||fz   }|st          d |||	fD                       S t          |||	          S )Nr*   r   r   r   c              3     K   | ]}||V  	d S rl   r*   )r   vs     r:   	<genexpr>z)TFConvBertEncoder.call.<locals>.<genexpr>1  s(      hhqZ[ZgZgZgZgZghhr;   )last_hidden_stater   
attentions)	enumerater  tupler   )r7   r   r   r   r   output_hidden_statesreturn_dictrS   all_hidden_statesall_attentionsr   layer_modulelayer_outputss                r:   rk   zTFConvBertEncoder.call  s    #7@BBD0:d(44 
	F 
	FOA|# I$58H$H!(L~y|=NYa  M *!,M  F!/=3C2E!E   	E 1]4D D 	ihh]4E~$Vhhhhhh +;LYg
 
 
 	
r;   Nc                    | j         rd S d| _         t          | dd           P| j        D ]J}t          j        |j                  5  |                    d            d d d            n# 1 swxY w Y   Id S d S )NTr  )rI   rJ   r  rD   rE   r(   rK   )r7   rM   r  s      r:   rK   zTFConvBertEncoder.build7  s    : 	F
4$''3 & &]5:.. & &KK%%%& & & & & & & & & & & & & & & 43& &s   A&&A*	-A*	r   rl   r   rr   s   @r:   r  r    sk        m m m m m "
 "
 "
 "
H& & & & & & & &r;   r  c                  ,     e Zd Z fdZd ZddZ xZS )!TFConvBertPredictionHeadTransformc                    t                      j        di | t          j                            |j        t          |j                  d          | _        t          |j
        t                    rt          |j
                  | _        n|j
        | _        t          j                            |j        d          | _        || _        d S r   )r+   r,   r   r0   r   r-   r   r/   r   r   r   r   r   transform_act_fnr1   r2   r%   r#   r6   s      r:   r,   z*TFConvBertPredictionHeadTransform.__init__B  s    ""6"""\''!ofF^6_6_fm ( 
 

 f'-- 	6$5f6G$H$HD!!$*$5D!88AV]h8iir;   c                    |                      |          }|                     |          }|                     |          }|S rl   )r   r,  r%   r   s     r:   rk   z&TFConvBertPredictionHeadTransform.callQ  s=    

=11--m<<}55r;   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           `t          j        | j	        j                  5  | j	                            d d | j        j        g           d d d            d S # 1 swxY w Y   d S d S r   r   rL   s     r:   rK   z'TFConvBertPredictionHeadTransform.buildX  r   r   rl   r   rr   s   @r:   r*  r*  A  sc              	L 	L 	L 	L 	L 	L 	L 	Lr;   r*  c                  t     e Zd ZeZ fdZd Zd Zd Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 dd	            Zdd
Z xZS )TFConvBertMainLayerc                    t                      j        di | t          |d          | _        |j        |j        k    r+t          j                            |j        d          | _	        t          |d          | _        || _        d S )NrB   r   embeddings_projectencoderr*   )r+   r,   r"   rB   r-   r   r   r0   r   r2  r  r3  r#   r6   s      r:   r,   zTFConvBertMainLayer.__init__h  s    ""6""".vLIII F$666&+l&8&89KRf&8&g&gD#(i@@@r;   c                    | j         S rl   )rB   r7   s    r:   get_input_embeddingsz(TFConvBertMainLayer.get_input_embeddingss  s
    r;   c                L    || j         _        |j        d         | j         _        d S Nr   )rB   r>   r?   rG   r7   r\   s     r:   set_input_embeddingsz(TFConvBertMainLayer.set_input_embeddingsv  s"    !&%*[^"""r;   c                    t           )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        r   )r7   heads_to_prunes     r:   _prune_headsz TFConvBertMainLayer._prune_headsz  s
    
 "!r;   c                    |t          j        |d          }t          j        ||d         dd|d         f          }t          j        ||          }d|z
  dz  }|S )Nr   r   g      ?g     )rD   rd   r   r   )r7   r   rM   r   extended_attention_masks        r:   get_extended_attention_maskz/TFConvBertMainLayer.get_extended_attention_mask  sn    !W[!44N #%*^k!naQRT_`aTb=c"d"d #%'*A5"I"I#&)@#@H"L&&r;   c                8    |t           d g| j        j        z  }|S rl   )r   r#   r  )r7   r   s     r:   get_head_maskz!TFConvBertMainLayer.get_head_mask  s%     %%!>>Ir;   NFc           	        ||t          d          |t          |          }n)|t          |          d d         }nt          d          |t          j        |d          }|t          j        |d          }|                     |||||
          }|                     |||j                  }|                     |          }t          | d          r| 	                    ||
          }| 
                    ||||||	|
          }|S )NzDYou cannot specify both input_ids and inputs_embeds at the same timerZ   z5You have to specify either input_ids or inputs_embedsr   r   r   r2  )rb   r   rD   rd   rB   r@  r   rB  hasattrr2  r3  )r7   rN   r   rQ   rP   r   rR   r   r"  r#  rS   rM   r   r?  s                 r:   rk   zTFConvBertMainLayer.call  s<     ]%>cddd"$Y//KK&$]33CRC8KKTUUU!W[!44N!W[!44N	<Q^iqrr"&"B"B>S^`m`s"t"t&&y11	4-.. 	V 33MH3UUM#  % 
 
 r;   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           `t          j        | j        j                  5  | j                            d d | j	        j
        g           d d d            d S # 1 swxY w Y   d S d S )NTrB   r3  r2  )rI   rJ   rD   rE   rB   r(   rK   r3  r2  r#   r-   rL   s     r:   rK   zTFConvBertMainLayer.build  s	   : 	F
4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , ,4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )4-t44@t6;<< X X'--tT4;;U.VWWWX X X X X X X X X X X X X X X X X X A@6    A''A+.A+!CCC(D77D;>D;
NNNNNNNNNFrl   )rm   rn   ro   r    config_classr,   r6  r:  r=  r@  rB  r   rk   rK   rq   rr   s   @r:   r0  r0  d  s        !L	 	 	 	 	  4 4 4" " "' ' '*    !- - - ]-^X X X X X X X Xr;   r0  c                      e Zd ZdZeZdZdS )TFConvBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    convbertN)rm   rn   ro   rp   r    rH  base_model_prefixr*   r;   r:   rJ  rJ    s'         
 "L"r;   rJ  ax	  

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`ConvBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
zbThe bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Z fdZe ee                    d                     ee	e
e          	 	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS )TFConvBertModelc                n     t                      j        |g|R i | t          |d          | _        d S )NrK  r   )r+   r,   r0  rK  r7   r#   ra   r8   r9   s       r:   r,   zTFConvBertModel.__init__N  sB    3&333F333+FDDDr;   batch_size, sequence_length
checkpointoutput_typerH  NFrN   TFModelInputType | Noner   np.array | tf.Tensor | NonerQ   rP   r   rR   rO   r   bool | Noner"  r#  rS   rT   rU   $TFBaseModelOutput | tuple[tf.Tensor]c                D    |                      |||||||||	|

  
        }|S )N
rN   r   rQ   rP   r   rR   r   r"  r#  rS   )rK  )r7   rN   r   rQ   rP   r   rR   r   r"  r#  rS   r   s               r:   rk   zTFConvBertModel.callS  sA    ( --))%'/!5#   
 
 r;   c                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTrK  )rI   rJ   rD   rE   rK  r(   rK   rL   s     r:   rK   zTFConvBertModel.buildv  s    : 	F
4T**6t}122 * *##D)))* * * * * * * * * * * * * * * * * * 76s    A((A,/A,rG  )rN   rU  r   rV  rQ   rV  rP   rV  r   rV  rR   rO   r   rW  r"  rW  r#  rW  rS   rT   rU   rX  rl   )rm   rn   ro   r,   r   r   CONVBERT_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCrk   rK   rq   rr   s   @r:   rN  rN  I  s        
E E E E E
 **+D+K+KLi+j+jkk&%$   .26:6:4815*.)-,0#'     lk ]8* * * * * * * *r;   rN  c                  F     e Zd Z fdZ fdZd Zd Zd Zd Zd Z	 xZ
S )TFConvBertMaskedLMHeadc                n     t                      j        di | || _        |j        | _        || _        d S r   )r+   r,   r#   r-   input_embeddings)r7   r#   rc  r8   r9   s       r:   r,   zTFConvBertMaskedLMHead.__init__  s@    ""6"""$3 0r;   c                    |                      | j        j        fddd          | _        t	                                          |           d S )NzerosTr   )r?   r@   r   r(   )rF   r#   rG   r   r+   rK   r   s     r:   rK   zTFConvBertMaskedLMHead.build  sE    OO4;+A*CQXdhouOvv	k"""""r;   c                    | j         S rl   )rc  r5  s    r:   get_output_embeddingsz,TFConvBertMaskedLMHead.get_output_embeddings  s    $$r;   c                \    || j         _        t          |          d         | j         _        d S r8  )rc  r>   r   rG   r9  s     r:   set_output_embeddingsz,TFConvBertMaskedLMHead.set_output_embeddings  s+    ',$+5e+<+<Q+?(((r;   c                    d| j         iS )Nr   )r   r5  s    r:   get_biaszTFConvBertMaskedLMHead.get_bias  s    	""r;   c                j    |d         | _         t          |d                   d         | j        _        d S )Nr   r   )r   r   r#   rG   r9  s     r:   set_biaszTFConvBertMaskedLMHead.set_bias  s.    &M	!+E&M!:!:1!=r;   c                @   t          |          d         }t          j        |d| j        g          }t          j        || j        j        d          }t          j        |d|| j        j        g          }t          j	        
                    || j                  }|S )N)tensorr   rZ   )ro  r?   T)abr   r   )r   rD   r   r-   r   rc  r>   r#   rG   r   r   r   )r7   r   
seq_lengths      r:   rk   zTFConvBertMaskedLMHead.call  s    }555a8

-DDW?XYYY	MT5J5Q_cddd
-JPTP[Pf?ghhh]KKr;   )rm   rn   ro   r,   rK   rg  ri  rk  rm  rk   rq   rr   s   @r:   ra  ra    s        1 1 1 1 1# # # # #
% % %@ @ @# # #> > >      r;   ra  c                  .     e Zd Z fdZddZddZ xZS )TFConvBertGeneratorPredictionsc                     t                      j        di | t          j                            |j        d          | _        t          j                            |j        d          | _	        || _
        d S )Nr%   r&   r   r   r*   )r+   r,   r   r0   r1   r2   r%   r   r-   r   r#   r6   s      r:   r,   z'TFConvBertGeneratorPredictions.__init__  sj    ""6"""88AV]h8ii\''(=G'LL
r;   Fc                    |                      |          } t          d          |          }|                     |          }|S )Ngelu)r   r   r%   )r7   generator_hidden_statesrS   r   s       r:   rk   z#TFConvBertGeneratorPredictions.call  sC    

#:;;1)&11-@@}55r;   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           `t          j        | j	        j                  5  | j	                            d d | j        j
        g           d d d            d S # 1 swxY w Y   d S d S r  )rI   rJ   rD   rE   r%   r(   rK   r#   r-   r   r   rL   s     r:   rK   z$TFConvBertGeneratorPredictions.build  s   : 	F
4d++7t~233 O O$$dD$+2L%MNNNO O O O O O O O O O O O O O O4$''3tz// H H
  $dk.E!FGGGH H H H H H H H H H H H H H H H H H 43r   r   rl   r   rr   s   @r:   rt  rt    sh               	H 	H 	H 	H 	H 	H 	H 	Hr;   rt  z6ConvBERT Model with a `language modeling` head on top.c                       e Zd Z fdZd Zd Ze ee	                    d                     e
eee          	 	 	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS )TFConvBertForMaskedLMc                d    t                      j        |fi | || _        t          |d          | _        t          |d          | _        t          |j        t                    rt          |j                  | _        n|j        | _        t          || j        j        d          | _        d S )NrK  r   generator_predictionsgenerator_lm_head)r+   r,   r#   r0  rK  rt  r}  r   r   r   r   r~   ra  rB   r~  rP  s       r:   r,   zTFConvBertForMaskedLM.__init__  s    **6***+FDDD%CFQh%i%i%i"f'-- 	0/0ABBDOO$/DO!7@X_r!s!s!sr;   c                    | j         S rl   )r~  r5  s    r:   get_lm_headz!TFConvBertForMaskedLM.get_lm_head  s    %%r;   c                0    | j         dz   | j        j         z   S )N/)r(   r~  r5  s    r:   get_prefix_bias_namez*TFConvBertForMaskedLM.get_prefix_bias_name  s    y3!7!<<<r;   rQ  rR  NFrN   rU  r   np.ndarray | tf.Tensor | NonerQ   rP   r   rR   rO   r   rW  r"  r#  labelsrS   rU   tuple | TFMaskedLMOutputc                N   |                      |||||||||	|
  
        }|d         }|                     ||          }|                     ||          }|
dn|                     |
|          }|	s|f|dd         z   }||f|z   n|S t	          |||j        |j                  S )a  
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        rZ  r   r   Nr   losslogitsr   r  )rK  r}  r~  hf_compute_lossr   r   r  )r7   rN   r   rQ   rP   r   rR   r   r"  r#  r  rS   rx  generator_sequence_outputprediction_scoresr  r   s                    r:   rk   zTFConvBertForMaskedLM.call  s    6 #'--))%'/!5# #0 #
 #
 %<A$>! 667P[c6dd 223Dx2XX~tt4+?+?HY+Z+Z 	F'),CABB,GGF)-)9TGf$$vE$1?.9	
 
 
 	
r;   c                r   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTrK  r}  r~  )	rI   rJ   rD   rE   rK  r(   rK   r}  r~  rL   s     r:   rK   zTFConvBertForMaskedLM.build  s   : 	F
4T**6t}122 * *##D)))* * * * * * * * * * * * * * *40$77Ct9>?? 7 7*006667 7 7 7 7 7 7 7 7 7 7 7 7 7 74,d33?t5:;; 3 3&,,T2223 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 @?r  NNNNNNNNNNF)rN   rU  r   r  rQ   r  rP   r  r   r  rR   rO   r   rW  r"  rW  r#  rW  r  rO   rS   rW  rU   r  rl   )rm   rn   ro   r,   r  r  r   r   r\  r]  r   r^  r   r_  rk   rK   rq   rr   s   @r:   r{  r{    s       t t t t t& & &= = = **+D+K+KLi+j+jkk&$$   .28<8<6:37*.)-,0#'#' %/
 /
 /
 /
  lk ]/
b3 3 3 3 3 3 3 3r;   r{  c                  0     e Zd ZdZ fdZd ZddZ xZS )TFConvBertClassificationHeadz-Head for sentence-level classification tasks.c                    t                      j        di | t          j                            |j        t          |j                  d          | _        |j	        |j	        n|j
        }t          j                            |          | _        t          j                            |j        t          |j                  d          | _        || _        d S )Nr   rx   out_projr*   )r+   r,   r   r0   r   r   r   r/   r   classifier_dropoutr4   r3   r5   
num_labelsr  r#   )r7   r#   r8   r  r9   s       r:   r,   z%TFConvBertClassificationHead.__init__  s    ""6"""\''?6C[3\3\cj ( 
 

 *0)B)NF%%TZTn 	 |++,>??**/&BZ2[2[bl + 
 
 r;   c                   |d d dd d f         }|                      |          }|                     |          } t          | j        j                  |          }|                      |          }|                     |          }|S r8  )r5   r   r   r#   r   r  )r7   r   r8   r   s       r:   rk   z!TFConvBertClassificationHead.call/  sz    !!!Q'"LLOOJJqMM5dk455a88LLOOMM!r;   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           `t          j        | j	        j                  5  | j	                            d d | j        j        g           d d d            d S # 1 swxY w Y   d S d S )NTr   r  )
rI   rJ   rD   rE   r   r(   rK   r#   r   r  rL   s     r:   rK   z"TFConvBertClassificationHead.build9  s   : 	F
4$''3tz// H H
  $dk.E!FGGGH H H H H H H H H H H H H H H4T**6t}122 K K##T41H$IJJJK K K K K K K K K K K K K K K K K K 76r   rl   )rm   rn   ro   rp   r,   rk   rK   rq   rr   s   @r:   r  r    si        77       	K 	K 	K 	K 	K 	K 	K 	Kr;   r  zp
    ConvBERT Model transformer with a sequence classification/regression head on top e.g., for GLUE tasks.
    c                       e Zd Z fdZe ee                    d                     ee	e
e          	 	 	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS )#TFConvBertForSequenceClassificationc                     t                      j        |g|R i | |j        | _        t          |d          | _        t          |d          | _        d S )NrK  r   
classifier)r+   r,   r  r0  rK  r  r  rP  s       r:   r,   z,TFConvBertForSequenceClassification.__init__L  s`    3&333F333 ++FDDD6vLQQQr;   rQ  rR  NFrN   rU  r   r  rQ   rP   r   rR   rO   r   rW  r"  r#  r  rS   rU   "tuple | TFSequenceClassifierOutputc                   |                      |||||||||	|
  
        }|                     |d         |          }|
dn|                     |
|          }|	s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )a  
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        	r   rQ   rP   r   rR   r   r"  r#  rS   r   r   Nr   r  )rK  r  r  r
   r   r  )r7   rN   r   rQ   rP   r   rR   r   r"  r#  r  rS   r   r  r  r   s                   r:   rk   z(TFConvBertForSequenceClassification.callR  s    6 --))%'/!5#   
 
 h??~tt4+?+?+O+O 	FY,F)-)9TGf$$vE)!/)	
 
 
 	
r;   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S NTrK  r  )rI   rJ   rD   rE   rK  r(   rK   r  rL   s     r:   rK   z)TFConvBertForSequenceClassification.build  sS   : 	F
4T**6t}122 * *##D)))* * * * * * * * * * * * * * *4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , , , , , 98r   r  )rN   rU  r   r  rQ   r  rP   r  r   r  rR   rO   r   rW  r"  rW  r#  rW  r  rO   rS   rW  rU   r  rl   )rm   rn   ro   r,   r   r   r\  r]  r   r^  r
   r_  rk   rK   rq   rr   s   @r:   r  r  E  s        R R R R R **+D+K+KLi+j+jkk&.$   .28<8<6:37*.)-,0#'#' %-
 -
 -
 -
  lk ]-
^	, 	, 	, 	, 	, 	, 	, 	,r;   r  z
    ConvBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd Z fdZe ee                    d                     ee	e
e          	 	 	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS )TFConvBertForMultipleChoicec                &    t                      j        |g|R i | t          |d          | _        t	          ||j        d          | _        t          j        	                    dt          |j                  d          | _        || _        d S )NrK  r   sequence_summary)r/   r(   r   r  rx   )r+   r,   r0  rK  r   r/   r  r   r0   r   r   r  r#   rP  s       r:   r,   z$TFConvBertForMultipleChoice.__init__  s    3&333F333+FDDD 1f&>EW!
 !
 !
  ,,,/&2J"K"KR^ - 
 
 r;   z(batch_size, num_choices, sequence_lengthrR  NFrN   rU  r   r  rQ   rP   r   rR   rO   r   rW  r"  r#  r  rS   rU   #tuple | TFMultipleChoiceModelOutputc                X   |+t          |          d         }t          |          d         }n*t          |          d         }t          |          d         }|t          j        |d|f          nd}|t          j        |d|f          nd}|t          j        |d|f          nd}|t          j        |d|f          nd}|+t          j        |d|t          |          d         f          nd}|                     |||||||||	|
  
        }|                     |d         |          }|                     |          }t          j        |d|f          }|
dn|                     |
|          }|	s|f|dd         z   }||f|z   n|S t          |||j        |j	        	          S )
a5  
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
        Nr   r   rZ   r   )r#  rS   r   r   r  )
r   rD   r   rK  r  r  r  r   r   r  )r7   rN   r   rQ   rP   r   rR   r   r"  r#  r  rS   num_choicesrr  flat_input_idsflat_attention_maskflat_token_type_idsflat_position_idsflat_inputs_embedsr   r  reshaped_logitsr  r   s                           r:   rk   z TFConvBertForMultipleChoice.call  s   8  $Y//2K#I..q1JJ$]33A6K#M2215JDMDYIJ/?@@@_cN\Nhbj"j9IJJJnrN\Nhbj"j9IJJJnrJVJbBJ|b*5EFFFhl ( J}r:z-7P7PQR7S&TUUU 	
 -- #   
 
 &&wqzH&EE((*Vb+->??~tt4+?+?+X+X 	F%''!""+5F)-)9TGf$$vE*"!/)	
 
 
 	
r;   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           `t          j        | j        j                  5  | j                            d d | j	        j
        g           d d d            d S # 1 swxY w Y   d S d S )NTrK  r  r  )rI   rJ   rD   rE   rK  r(   rK   r  r  r#   r   rL   s     r:   rK   z!TFConvBertForMultipleChoice.build  s   : 	F
4T**6t}122 * *##D)))* * * * * * * * * * * * * * *4+T22>t49:: 2 2%++D1112 2 2 2 2 2 2 2 2 2 2 2 2 2 24t,,8t344 M M%%tT4;3J&KLLLM M M M M M M M M M M M M M M M M M 98rF  r  )rN   rU  r   r  rQ   r  rP   r  r   r  rR   rO   r   rW  r"  rW  r#  rW  r  rO   rS   rW  rU   r  rl   )rm   rn   ro   r,   r   r   r\  r]  r   r^  r   r_  rk   rK   rq   rr   s   @r:   r  r    s        
 
 
 
 
 **!(()STT   &/$   .28<8<6:37*.)-,0#'#' %>
 >
 >
 >
   ]>
@M M M M M M M Mr;   r  z
    ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd Z fdZe ee                    d                     ee	e
e          	 	 	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS ) TFConvBertForTokenClassificationc                    t                      j        |g|R i | |j        | _        t          |d          | _        |j        |j        n|j        }t          j        	                    |          | _
        t          j                            |j        t          |j                  d          | _        || _        d S )NrK  r   r  rx   )r+   r,   r  r0  rK  r  r4   r   r0   r3   r5   r   r   r/   r  r#   )r7   r#   ra   r8   r  r9   s        r:   r,   z)TFConvBertForTokenClassification.__init__  s    3&333F333 ++FDDD)/)B)NF%%TZTn 	 |++,>??,,,/&BZ2[2[bn - 
 
 r;   rQ  rR  NFrN   rU  r   r  rQ   rP   r   rR   rO   r   rW  r"  r#  r  rS   rU   tuple | TFTokenClassifierOutputc                J   |                      |||||||||	|
  
        }|d         }|                     ||          }|                     |          }|
dn|                     |
|          }|	s|f|dd         z   }||f|z   n|S t	          |||j        |j                  S )z
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        r  r   r   Nr   r  )rK  r5   r  r  r   r   r  )r7   rN   r   rQ   rP   r   rR   r   r"  r#  r  rS   r   sequence_outputr  r  r   s                    r:   rk   z%TFConvBertForTokenClassification.call  s    2 --))%'/!5#   
 
 "!*,,,JJ11~tt4+?+?+O+O 	FY,F)-)9TGf$$vE&!/)	
 
 
 	
r;   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           `t          j        | j        j                  5  | j                            d d | j        j	        g           d d d            d S # 1 swxY w Y   d S d S r  )
rI   rJ   rD   rE   rK  r(   rK   r  r#   r   rL   s     r:   rK   z&TFConvBertForTokenClassification.buildK  s   : 	F
4T**6t}122 * *##D)))* * * * * * * * * * * * * * *4t,,8t344 M M%%tT4;3J&KLLLM M M M M M M M M M M M M M M M M M 98$    A''A+.A+!(CCCr  )rN   rU  r   r  rQ   r  rP   r  r   r  rR   rO   r   rW  r"  rW  r#  rW  r  rO   rS   rW  rU   r  rl   )rm   rn   ro   r,   r   r   r\  r]  r   r^  r   r_  rk   rK   rq   rr   s   @r:   r  r     s             **+D+K+KLi+j+jkk&+$   .28<8<6:37*.)-,0#'#' %,
 ,
 ,
 ,
  lk ],
\	M 	M 	M 	M 	M 	M 	M 	Mr;   r  z
    ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Z fdZe ee                    d                     ee	e
e          	 	 	 	 	 	 	 	 	 	 	 	 ddd                                    ZddZ xZS )TFConvBertForQuestionAnsweringc                    t                      j        |g|R i | |j        | _        t          |d          | _        t
          j                            |j        t          |j	                  d          | _
        || _        d S )NrK  r   
qa_outputsrx   )r+   r,   r  r0  rK  r   r0   r   r   r/   r  r#   rP  s       r:   r,   z'TFConvBertForQuestionAnswering.__init___  s    3&333F333 ++FDDD,,,/&BZ2[2[bn - 
 
 r;   rQ  rR  NFrN   rU  r   r  rQ   rP   r   rR   rO   r   rW  r"  r#  start_positionsend_positionsrS   rU   &tuple | TFQuestionAnsweringModelOutputc                   |                      |||||||||	|
  
        }|d         }|                     |          }t          j        |dd          \  }}t          j        |d          }t          j        |d          }d}|
#|!d|
i}||d<   |                     |||f          }|	s||f|d	d         z   }||f|z   n|S t          ||||j        |j        
          S )a  
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        r  r   r   rZ   r_   Nstart_positionend_positionr   )r  start_logits
end_logitsr   r  )	rK  r  rD   splitsqueezer  r	   r   r  )r7   rN   r   rQ   rP   r   rR   r   r"  r#  r  r  rS   r   r  r  r  r  r  r  r   s                        r:   rk   z#TFConvBertForQuestionAnswering.calli  s;   @ --))%'/!5#   
 
 "!*11#%8FAB#?#?#? jz,R888Z
444
&=+D&8F%2F>"''z0JKKD 	F"J/'!""+=F)-)9TGf$$vE-%!!/)
 
 
 	
r;   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           `t          j        | j        j                  5  | j                            d d | j        j	        g           d d d            d S # 1 swxY w Y   d S d S )NTrK  r  )
rI   rJ   rD   rE   rK  r(   rK   r  r#   r   rL   s     r:   rK   z$TFConvBertForQuestionAnswering.build  r  r  )NNNNNNNNNNNF)rN   rU  r   r  rQ   r  rP   r  r   r  rR   rO   r   rW  r"  rW  r#  rW  r  rO   r  rO   rS   rW  rU   r  rl   )rm   rn   ro   r,   r   r   r\  r]  r   r^  r	   r_  rk   rK   rq   rr   s   @r:   r  r  W  s             **+D+K+KLi+j+jkk&2$   .28<8<6:37*.)-,0#',0*. %;
 ;
 ;
 ;
  lk ];
z	M 	M 	M 	M 	M 	M 	M 	Mr;   r  )r{  r  r  r  r  r	  rN  rJ  )Grp   
__future__r   numpynp
tensorflowrD   activations_tfr   modeling_tf_outputsr   r   r   r	   r
   r   modeling_tf_utilsr   r   r   r   r   r   r   r   r   r   r   r   tf_utilsr   r   r   utilsr   r   r   r   configuration_convbertr    
get_loggerrm   loggerr^  r_  r0   Layerr"   rt   r   r   r   r   r  r	  r  r*  r0  rJ  CONVBERT_START_DOCSTRINGr\  rN  ra  rt  r{  r  r  r  r  r  __all__r*   r;   r:   <module>r     s7     " " " " " "         / / / / / /                                           S R R R R R R R R R            3 2 2 2 2 2 
	H	%	%/ "Q  Q  Q  Q  Q 5<- Q  Q  Q hlQ lQ lQ lQ lQel0 lQ lQ lQ^L L L L L5<- L L L<. . . . .%,, . . .>    +   B"H "H "H "H "HU\/ "H "H "HJ$N $N $N $N $Nu|) $N $N $NN- - - - -el( - - -D1& 1& 1& 1& 1&* 1& 1& 1&h L  L  L  L  L(:  L  L  LF uX uX uX uX uX%,, uX uX uXp# # # # # 1 # # #( T5 p h /* /* /* /* /*/ /* /*	 /*d" " " " "U\/ " " "JH H H H HU\%7 H H H6 RTlmmY3 Y3 Y3 Y3 Y357S Y3 Y3 nmY3x&K &K &K &K &K5<#5 &K &K &KR  	 F, F, F, F, F,*CEa F, F, F,R   bM bM bM bM bM";=Q bM bM bMJ   MM MM MM MM MM'@B[ MM MM MM`   XM XM XM XM XM%>@W XM XM XMv	 	 	r;   