
     `i                     Z   d Z ddlZddlZddlmZ ddlmZmZmZ ddl	Z	ddl	m
Z
 ddlmZmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%  e#j&        e'          Z(d Z) G d de
j*                  Z+e" G d de                      Z, G d de
j*                  Z- G d de
j*                  Z. G d de
j*                  Z/ G d de
j*                  Z0 G d de
j*                  Z1 G d d e
j*                  Z2 G d! d"e
j*                  Z3 G d# d$e          Z4 G d% d&e
j*                  Z5 G d' d(e
j*                  Z6 G d) d*e
j*                  Z7e" G d+ d,e,                      Z8 G d- d.e
j*                  Z9e" G d/ d0e,                      Z: G d1 d2e
j*                  Z; e"d34           G d5 d6e,                      Z<e" G d7 d8e,                      Z=e" G d9 d:e,                      Z>e" G d; d<e,                      Z?g d=Z@dS )>zPyTorch ConvBERT model.    N)
attrgetter)CallableOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNget_activation)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )ConvBertConfigc                    	 ddl }n)# t          $ r t                              d            w xY wt          j                            |          }t                              d|            |j        	                    |          }i }|D ]E\  }}t                              d| d|            |j        
                    ||          }	|	||<   Fddd	d
dddd}
|j        dk    rd}nd}t          |j                  D ]:}d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d |
d| d!<   d| d"|
d| d#<   d| d$|
d| d%<   d| d&|
d| d'<   d| d(|
d| d)<   d| d*|
d| d+<   d| d,|
d| d-<   d| d.|
d| d/<   d| d0|
d| d1<   d| d2|
d| d3<   d| d4|
d| d5<   d| d6| d7|
d| d8<   d| d6| d9|
d| d:<   d| d;| d7|
d| d<<   d| d;| d9|
d| d=<   d| d>|
d| d?<   d| d@|
d| dA<   <|                                 D ]7}|d         }t          |          } ||           }|
|         }t!          j        ||                   }t                              dB| dC| dD           |                    d7          r1|                    dE          s|                    dF          s|j        }|                    dG          r|                    ddHd          }|                    dI          r|                    dHdd          }|                    dJ          r|                    dK          }||_        9| S )Lz'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape z"electra/embeddings/word_embeddingsz&electra/embeddings/position_embeddingsz(electra/embeddings/token_type_embeddingsz"electra/embeddings/LayerNorm/gammaz!electra/embeddings/LayerNorm/betaz!electra/embeddings_project/kernelzelectra/embeddings_project/bias)z!embeddings.word_embeddings.weightz%embeddings.position_embeddings.weightz'embeddings.token_type_embeddings.weightzembeddings.LayerNorm.weightzembeddings.LayerNorm.biaszembeddings_project.weightzembeddings_project.biasr   g_densedensezelectra/encoder/layer_z/attention/self/query/kernelzencoder.layer.z.attention.self.query.weightz/attention/self/query/biasz.attention.self.query.biasz/attention/self/key/kernelz.attention.self.key.weightz/attention/self/key/biasz.attention.self.key.biasz/attention/self/value/kernelz.attention.self.value.weightz/attention/self/value/biasz.attention.self.value.biasz./attention/self/conv_attn_key/depthwise_kernelz4.attention.self.key_conv_attn_layer.depthwise.weightz./attention/self/conv_attn_key/pointwise_kernelz4.attention.self.key_conv_attn_layer.pointwise.weightz"/attention/self/conv_attn_key/biasz(.attention.self.key_conv_attn_layer.biasz'/attention/self/conv_attn_kernel/kernelz(.attention.self.conv_kernel_layer.weightz%/attention/self/conv_attn_kernel/biasz&.attention.self.conv_kernel_layer.biasz&/attention/self/conv_attn_point/kernelz%.attention.self.conv_out_layer.weightz$/attention/self/conv_attn_point/biasz#.attention.self.conv_out_layer.biasz/attention/output/dense/kernelz.attention.output.dense.weightz!/attention/output/LayerNorm/gammaz".attention.output.LayerNorm.weightz/attention/output/dense/biasz.attention.output.dense.biasz /attention/output/LayerNorm/betaz .attention.output.LayerNorm.biasz/intermediate/z/kernelz.intermediate.dense.weightz/biasz.intermediate.dense.biasz/output/z.output.dense.weightz.output.dense.biasz/output/LayerNorm/gammaz.output.LayerNorm.weightz/output/LayerNorm/betaz.output.LayerNorm.biaszTF: z, PT:  z/intermediate/g_dense/kernelz/output/g_dense/kernelz/depthwise_kernel   z/pointwise_kernelz/conv_attn_key/bias)
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variable
num_groupsrangenum_hidden_layersnamed_parametersr   torch
from_numpyendswithTpermute	unsqueezedata)modelconfigtf_checkpoint_pathtftf_path	init_varstf_datanameshapearrayparam_mappinggroup_dense_namejparam
param_name	retrieverresulttf_namevalues                      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/convbert/modeling_convbert.pyload_tf_weights_in_convbertrM   0   sX      Q	
 	
 	
 	 goo011G
KKBBBCCC''00IG   eBBB5BBCCC&&w55 .R1Y3]'K%H%H#D M 1$"6+,, Cw CwDQDDD 	FqFFFG CQBBB 	DqDDDE CQBBB 	DqDDDE AQ@@@ 	BqBBBC EQDDD 	FqFFFG CQBBB 	DqDDDE WQVVV 	^q^^^_ WQVVV 	^q^^^_ KQJJJ 	RqRRRS PQOOO 	RqRRRS NQMMM 	PqPPPQ OQNNN 	OqOOOP MQLLL 	MqMMMN GQFFF 	HqHHHI JQIII 	LqLLLM EQDDD 	FqFFFG IQHHH 	JqJJJK PQOO6FOOO 	DqDDDE NQMM6FMMM 	BqBBBC JQII0@III 	>q>>>? HQGG0@GGG 	<q<<<= @Q??? 	BqBBBC Ew]^DvDvDv@q@@@AA''))  1X
z**	5!!
+ !1227777*777888I&& 	$##$BCC $''(@AA $!GE/00 	+MM!Q**E/00 	+MM!Q**E122 	(OOB''ELs    &-c                        e Zd ZdZ fdZ	 	 	 	 d
deej                 deej                 deej                 deej                 dej        f
d	Z	 xZ
S )ConvBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        |                     dt%          j        |j                                      d          d           |                     dt%          j        | j                                        t$          j                  d           d S )	N)padding_idxepsposition_ids)r   r"   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr2   arangeexpandzerosrT   sizelongselfr:   	__class__s     rL   rZ   zConvBertEmbeddings.__init__   s5   !|F,=v?Tbhbuvvv#%<0NPVPe#f#f %'\&2H&J_%`%`" f&;AVWWWz&"<==EL)GHHOOPWXXej 	 	
 	
 	
 	ek$*;*@*@*B*B%*UUUbg 	 	
 	
 	
 	
 	
    N	input_idsrV   rT   inputs_embedsreturnc                 j   ||                                 }n|                                 d d         }|d         }|| j        d d d |f         }|mt          | d          r2| j        d d d |f         }|                    |d         |          }|}n+t          j        |t
          j        | j        j                  }|| 	                    |          }| 
                    |          }	|                     |          }
||	z   |
z   }|                     |          }|                     |          }|S )Nr"   r   rV   r   rX   device)rm   rT   hasattrrV   rk   r2   rl   rn   rx   r_   ra   rc   rd   rh   )rp   rs   rV   rT   rt   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedra   rc   
embeddingss               rL   forwardzConvBertEmbeddings.forward   sQ     #..**KK',,..ss3K ^
,QQQ^<L
 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M"66|DD $ : :> J J"%88;PP
^^J//
\\*--
rr   )NNNN)__name__
__module____qualname____doc__rZ   r   r2   
LongTensorFloatTensorr   __classcell__rq   s   @rL   rO   rO      s        QQ
 
 
 
 
( 15593759$ $E,-$ !!12$ u/0	$
   12$ 
	$ $ $ $ $ $ $ $rr   rO   c                   ,    e Zd ZU eed<   eZdZdZd Z	dS )ConvBertPreTrainedModelr:   convbertTc                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS t          |t                    r |j	        j        
                                 dS t          |t                     rK|j        j                            d| j        j                   |j	        j        
                                 dS dS )zInitialize the weights        meanstdNg      ?)
isinstancer   LinearConv1dweightr8   normal_r:   initializer_rangebiaszero_r[   rQ   rd   fill_SeparableConv1DGroupedLinearLayer)rp   modules     rL   _init_weightsz%ConvBertPreTrainedModel._init_weights   s   fry")455 	% M&&CT[5R&SSS{& &&((((( '&-- 	%M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	%K""$$$M$$S)))))00 	%K""$$$$$ 233 	%M&&CT[5R&SSSK""$$$$$	% 	%rr   N)
r   r   r   r   __annotations__rM   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingr    rr   rL   r   r      sB         1O"&*#% % % % %rr   r   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )r   zSThis class implements separable convolution, i.e. a depthwise and a pointwise layerc                    t                                                       t          j        |||||dz  d          | _        t          j        ||dd          | _        t          j        t          j        |d                    | _	        | j        j
        j                            d|j                   | j        j
        j                            d|j                   d S )Nr!   F)kernel_sizegroupspaddingr   r   )r   r   r   r   )rY   rZ   r   r   	depthwise	pointwise	Parameterr2   rl   r   r   r8   r   r   )rp   r:   input_filtersoutput_filtersr   kwargsrq   s         rL   rZ   zSeparableConv1D.__init__  s    # 1$
 
 
 =.aV[\\\L^Q!?!?@@	"**9Q*RRR"**9Q*RRRRRrr   hidden_statesru   c                 n    |                      |          }|                     |          }|| j        z  }|S N)r   r   r   )rp   r   xs      rL   r   zSeparableConv1D.forward  s4    NN=))NN1	TYrr   	r   r   r   r   rZ   r2   Tensorr   r   r   s   @rL   r   r     si        ]]S S S S S U\ el        rr   r   c                        e Zd Z fdZ	 	 	 	 ddej        deej                 deej                 deej                 dee         d	e	ej        eej                 f         fd
Z
 xZS )ConvBertSelfAttentionc                 p   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        |j        z  }|dk     r|j        | _        d| _        n|| _        |j        | _        |j        | _        |j        | j        z  dk    rt          d          |j        | j        z  dz  | _        | j        | j        z  | _	        t          j        |j        | j	                  | _        t          j        |j        | j	                  | _        t          j        |j        | j	                  | _        t          ||j        | j	        | j                  | _        t          j        | j	        | j        | j        z            | _        t          j        |j        | j	                  | _        t          j        | j        dgt)          | j        dz
  dz            dg	          | _        t          j        |j                  | _        d S )
Nr   r]   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   z6hidden_size should be divisible by num_attention_headsr!   )r   r   )rY   rZ   hidden_sizenum_attention_headsry   
ValueError
head_ratioconv_kernel_sizeattention_head_sizeall_head_sizer   r   querykeyrK   r   key_conv_attn_layerconv_kernel_layerconv_out_layerUnfoldintunfoldrf   attention_probs_dropout_probrh   )rp   r:   new_num_attention_headsrq   s      rL   rZ   zConvBertSelfAttention.__init__   s&    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 #)"<@Q"Q"Q&&$8DO'(D$$'>D$$/DO & 7 88A==UVVV$*$6$:R$RWX#X !58PPYv143EFF
9V/1CDDYv143EFF
#2F&(:D<Q$
 $
  "$4+=t?WZ^Zo?o!p!p i(:D<NOOi.2S$BWZ[B[_`A`=a=acd<e
 
 
 z&"EFFrr   NFr   attention_mask	head_maskencoder_hidden_statesoutput_attentionsru   c                 V   |j         \  }}}|+|                     |          }	|                     |          }
n*|                     |          }	|                     |          }
|                     |                    dd                    }|                    dd          }|                     |          }|                    |d| j        | j                                      dd          }|	                    |d| j        | j                                      dd          }|
                    |d| j        | j                                      dd          }t          j
        ||          }|                     |          }t          j        |d| j        dg          }t          j        |d          }|                     |          }t          j        ||d| j        g          }|                    dd                                                              d          }t&          j                            || j        dgd| j        dz
  dz  dgd          }|                    dd                              |d| j        | j                  }t          j        |d| j        | j        g          }t          j        ||          }t          j        |d| j        g          }t          j        ||                    dd                    }|t/          j        | j                  z  }|||z   }t&          j                            |d          }|                     |          }|||z  }t          j        ||          }|                    dddd                                          }t          j        ||d| j        | j        g          }t          j        ||gd          }|                                d d         | j        | j        z  dz  fz   } |j        | }|r||fn|f}|S )	Nr   r!   r"   dimr   )r   dilationr   strider   )rA   r   rK   r   	transposer   viewr   r   r2   multiplyr   reshaper   softmaxr   r   
contiguousr7   r   
functionalr   matmulmathsqrtrh   r6   catrm   )rp   r   r   r   r   r   
batch_sizer{   _mixed_key_layermixed_value_layermixed_key_conv_attn_layermixed_query_layerquery_layer	key_layervalue_layerconv_attn_layerr   r   attention_scoresattention_probscontext_layerconv_outnew_context_layer_shapeoutputss                            rL   r   zConvBertSelfAttention.forwardG  s:    %2$7!
J !,"hh'<==O $

+@ A A"hh}55O $

= 9 9$($<$<]=T=TUVXY=Z=Z$[$[!$=$G$G1$M$M! JJ}55',,D4d6N
 

)Aq// 	 $((R9QSWSkllvvq
 
	 (,,D4d6N
 

)Aq// 	  .)BDUVV 22?CC!M*;b$BWYZ=[\\!M*;CCC,,];;~
BHZ7[\\'11!Q77BBDDNNrRR--.2+a/A5q9 . 
 
 (11!Q77??D.0E
 
 ~D<TVZVk7lmmn6GHH~D<N7OPP !<Y5H5HR5P5PQQ+di8P.Q.QQ%/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF=*b$BZ\`\t1uvv	=(";Q?? #0"4"4"6"6ss";$t'??!C?
 #
 +*,CD6G]=/22mM]rr   NNNF)r   r   r   rZ   r2   r   r   r   booltupler   r   r   s   @rL   r   r     s        %G %G %G %G %GT 7;158<,1V V|V !!23V E-.	V
  (5V $D>V 
u|Xel33	4V V V V V V V Vrr   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )ConvBertSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S NrR   )rY   rZ   r   r   r   r   rd   re   rf   rg   rh   ro   s     rL   rZ   zConvBertSelfOutput.__init__  sf    Yv163EFF
f&8f>STTTz&"<==rr   r   input_tensorru   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   rh   rd   rp   r   r   s      rL   r   zConvBertSelfOutput.forward  @    

=11]33}|'CDDrr   r   r   r   rZ   r2   r   r   r   r   s   @rL   r   r     si        > > > > >U\  RWR^        rr   r   c                        e Zd Z fdZd Z	 	 	 	 ddej        deej                 deej                 deej                 d	ee	         d
e
ej        eej                 f         fdZ xZS )ConvBertAttentionc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r   )rY   rZ   r   rp   r   outputsetpruned_headsro   s     rL   rZ   zConvBertAttention.__init__  sI    )&11	(00EErr   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )lenr   rp   r   r   r   r   r   r   rK   r   r   r   union)rp   headsindexs      rL   prune_headszConvBertAttention.prune_heads  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::rr   NFr   r   r   r   r   ru   c                     |                      |||||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   )rp   r   )	rp   r   r   r   r   r   self_outputsattention_outputr   s	            rL   r   zConvBertAttention.forward  s[     yy!
 
  ;;|AFF#%QRR(88rr   r   )r   r   r   rZ   r  r2   r   r   r   r   r   r   r   r   s   @rL   r   r     s        " " " " "; ; ;* 7;158<,1 | !!23 E-.	
  (5 $D> 
u|Xe&788	9       rr   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )r   c                    t                                                       || _        || _        || _        | j        | j        z  | _        | j        | j        z  | _        t          j        t          j
        | j        | j        | j                            | _        t          j        t          j
        |                    | _        d S r   )rY   rZ   
input_sizeoutput_sizer.   group_in_dimgroup_out_dimr   r   r2   emptyr   r   )rp   r  r  r.   rq   s       rL   rZ   zGroupedLinearLayer.__init__  s    $&$ Ot>!-@l5;t@QSWSe#f#fggL[!9!9::			rr   r   ru   c                 v   t          |                                          d         }t          j        |d| j        | j        g          }|                    ddd          }t          j        || j                  }|                    ddd          }t          j        ||d| j	        g          }|| j
        z   }|S )Nr   r"   r   r!   )listrm   r2   r   r.   r  r6   r   r   r  r   )rp   r   r   r   s       rL   r   zGroupedLinearLayer.forward  s    -,,..//2
M-"dot?P)QRRIIaALDK((IIaAM!j"d.>?@@	Mrr   r   r   s   @rL   r   r     s^        ; ; ; ; ;U\ el        rr   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )ConvBertIntermediatec                 r   t                                                       |j        dk    r%t          j        |j        |j                  | _        n&t          |j        |j        |j                  | _        t          |j
        t                    rt          |j
                 | _        d S |j
        | _        d S )Nr   r  r  r.   )rY   rZ   r.   r   r   r   intermediate_sizer   r   r   
hidden_actstrr   intermediate_act_fnro   s     rL   rZ   zConvBertIntermediate.__init__  s    !!6#5v7OPPDJJ+!-6;S`f`q  DJ f'-- 	9'-f.?'@D$$$'-'8D$$$rr   r   ru   c                 Z    |                      |          }|                     |          }|S r   )r   r  rp   r   s     rL   r   zConvBertIntermediate.forward  s,    

=1100??rr   r   r   s   @rL   r  r    s^        9 9 9 9 9U\ el        rr   r  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )ConvBertOutputc                 z   t                                                       |j        dk    r%t          j        |j        |j                  | _        n&t          |j        |j        |j                  | _        t          j	        |j        |j
                  | _	        t          j        |j                  | _        d S )Nr   r  rR   )rY   rZ   r.   r   r   r  r   r   r   rd   re   rf   rg   rh   ro   s     rL   rZ   zConvBertOutput.__init__  s    !!6#;V=OPPDJJ+!3AS`f`q  DJ f&8f>STTTz&"<==rr   r   r   ru   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r   s      rL   r   zConvBertOutput.forward  r   rr   r   r   s   @rL   r  r    si        	> 	> 	> 	> 	>U\  RWR^        rr   r  c                        e Zd Z fdZ	 	 	 	 	 ddej        deej                 deej                 deej                 deej                 d	ee         d
e	ej        eej                 f         fdZ
d Z xZS )ConvBertLayerc                 ~   t                                                       |j        | _        d| _        t	          |          | _        |j        | _        |j        | _        | j        r-| j        st          |  d          t	          |          | _	        t          |          | _        t          |          | _        d S )Nr   z> should be used as a decoder model if cross attention is added)rY   rZ   chunk_size_feed_forwardseq_len_dimr   	attention
is_decoderadd_cross_attention	TypeErrorcrossattentionr  intermediater  r   ro   s     rL   rZ   zConvBertLayer.__init__  s    '-'E$*622 +#)#= # 	<? i4 g g ghhh"3F";";D088$V,,rr   NFr   r   r   r   encoder_attention_maskr   ru   c                 ^   |                      ||||          }|d         }|dd          }	| j        rS|Qt          | d          st          d|  d          |                     |||||          }
|
d         }|	|
dd          z   }	t          | j        | j        | j        |          }|f|	z   }	|	S )N)r   r   r   r)  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r%  r&  ry   AttributeErrorr)  r   feed_forward_chunkr#  r$  )rp   r   r   r   r   r+  r   self_attention_outputsr  r   cross_attention_outputslayer_outputs               rL   r   zConvBertLayer.forward'  s    "&/	 "0 "
 "
 2!4(,? 	<4@4!122 $Dd D D D   '+&9&9 &%!' '#  7q9 7 ;;G0#T%A4CSUe
 
  /G+rr   c                 \    |                      |          }|                     ||          }|S r   )r*  r   )rp   r  intermediate_outputr1  s       rL   r.  z ConvBertLayer.feed_forward_chunkO  s2    "//0@AA{{#68HIIrr   )NNNNF)r   r   r   rZ   r2   r   r   r   r   r   r   r.  r   r   s   @rL   r!  r!    s        - - - - -" 7;158<9=,1& &|& !!23& E-.	&
  (5& !) 6& $D>& 
u|Xe&788	9& & & &P      rr   r!  c                        e Zd Z fdZ	 	 	 	 	 	 	 ddej        deej                 deej                 deej                 d	eej                 d
ee         dee         dee         de	e
ef         fdZ xZS )ConvBertEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r   )r!  ).0r   r:   s     rL   
<listcomp>z,ConvBertEncoder.__init__.<locals>.<listcomp>Y  s!    #c#c#caM&$9$9#c#c#crr   F)	rY   rZ   r:   r   
ModuleListr/   r0   layergradient_checkpointingro   s    `rL   rZ   zConvBertEncoder.__init__V  s`    ]#c#c#c#c5IaCbCb#c#c#cdd
&+###rr   NFTr   r   r   r   r+  r   output_hidden_statesreturn_dictru   c	           	         |rdnd }	|rdnd }
|r| j         j        rdnd }t          | j                  D ]W\  }}|r|	|fz   }	|||         nd } |||||||          }|d         }|r$|
|d         fz   }
| j         j        r||d         fz   }X|r|	|fz   }	|st	          d ||	|
|fD                       S t          ||	|
|          S )Nr   r   r   r!   c              3      K   | ]}||V  	d S r   r   )r8  vs     rL   	<genexpr>z*ConvBertEncoder.forward.<locals>.<genexpr>  s0        =  === rr   )last_hidden_stater   
attentionscross_attentions)r:   r'  	enumerater;  r   r   )rp   r   r   r   r   r+  r   r=  r>  all_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_head_masklayer_outputss                   rL   r   zConvBertEncoder.forward\  sp    #7@BBD$5?bb4%6d4;;Zdrr`d(44 	V 	VOA|# I$58H$H!.7.CillO(L%&! M *!,M  V&9]1=M<O&O#;2 V+?=QRCSBU+U( 	E 1]4D D 	  '):<OQef     
 2++*1	
 
 
 	
rr   )NNNNFFT)r   r   r   rZ   r2   r   r   r   r   r   r   r   r   r   r   s   @rL   r5  r5  U  s        , , , , , 7;158<9=,1/4&*0
 0
|0
 !!230
 E-.	0

  (50
 !) 60
 $D>0
 'tn0
 d^0
 
u88	90
 0
 0
 0
 0
 0
 0
 0
rr   r5  c                   B     e Zd Z fdZdej        dej        fdZ xZS )ConvBertPredictionHeadTransformc                 V   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          j        |j        |j                  | _        d S r   )rY   rZ   r   r   r   r   r   r  r  r   transform_act_fnrd   re   ro   s     rL   rZ   z(ConvBertPredictionHeadTransform.__init__  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D!f&8f>STTTrr   r   ru   c                     |                      |          }|                     |          }|                     |          }|S r   )r   rQ  rd   r  s     rL   r   z'ConvBertPredictionHeadTransform.forward  s=    

=11--m<<}55rr   r   r   s   @rL   rO  rO    sc        U U U U UU\ el        rr   rO  c                   j     e Zd ZdZdef fdZ	 d	dej        deej	                 dej        fdZ
 xZS )
ConvBertSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`ConvBertConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r:   c                 V   t                                                       t          |dd          | _        | j        dk    rt          t          j                    | _        t          |d          rW|j	        rPt          |d          r|j
        r|j        dk    r|j        }n|j        }t          j        |j        |          | _        t          |dd           }|rt          |          nt          j                    | _        t          j                    | _        t          |d          r)|j        dk    rt          j        |j                  | _        t          j                    | _        t          |d	          r+|j        dk    r"t          j        |j                  | _        d S d S d S )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)rY   rZ   getattrrV  NotImplementedErrorr   Identitysummaryry   rY  rZ  
num_labelsr   r   r   
activationfirst_dropoutr\  rf   last_dropoutr]  )rp   r:   num_classesactivation_stringrq   s       rL   rZ   z ConvBertSequenceSummary.__init__  s   #FNFCC&& &%{}}6-.. 	F63J 	Fv788 1V=Z 1_e_pst_t_t$/$09V%7EEDL#F,@$GGIZ$mN3D$E$E$E`b`k`m`m[]]6233 	J8TWX8X8X!#F,H!I!IDKMM6122 	Hv7RUV7V7V "
6+F G GD	H 	H7V7Vrr   Nr   	cls_indexru   c                 :   | j         dk    r|dddf         }n-| j         dk    r|dddf         }n| j         dk    r|                    d          }n| j         d	k    r|=t          j        |d
ddddf         |j        d         dz
  t          j                  }nl|                    d                              d          }|                    d|                                dz
  z  |	                    d          fz             }|
                    d|                              d          }n| j         dk    rt          |                     |          }|                     |          }|                     |          }|                     |          }|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        rW  Nr"   firstr   r   r   r   rh  .r   rW   )r"   rX  )rV  r   r2   	full_likerA   rn   r7   rk   r   rm   gathersqueezer_  rd  ra  rc  re  )rp   r   rh  r   s       rL   r   zConvBertSequenceSummary.forward  s    &&"111b5)FF'))"111a4(FF&(("''A'..FF+-- !O!#rr111*-!'+a/*  		 &//33==bAA	%,,Uimmoo6I-JmN`N`acNdNdMf-fgg	"))"i88@@DDFF&((%%##F++f%%((""6**rr   r   )r   r   r   r   r   rZ   r2   r   r   r   r   r   r   s   @rL   rT  rT    s         2H~ H H H H H H< Y]) )".);CEDT;U)		) ) ) ) ) ) ) )rr   rT  c                   <    e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j
                 d	ee	j
                 d
ee	j                 dee	j                 dee         dee         dee         deeef         fd            Z xZS )ConvBertModelc                 8   t                                          |           t          |          | _        |j        |j        k    r$t          j        |j        |j                  | _        t          |          | _
        || _        |                                  d S r   )rY   rZ   rO   r~   r]   r   r   r   embeddings_projectr5  encoderr:   	post_initro   s     rL   rZ   zConvBertModel.__init__  s       ,V44 F$666&(i0EvGY&Z&ZD#&v..rr   c                     | j         j        S r   r~   r_   rp   s    rL   get_input_embeddingsz"ConvBertModel.get_input_embeddings  s    ..rr   c                     || j         _        d S r   ru  )rp   rK   s     rL   set_input_embeddingsz"ConvBertModel.set_input_embeddings  s    */'''rr   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrr  r;  r%  r  )rp   heads_to_pruner;  r  s       rL   _prune_headszConvBertModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Crr   Nrs   r   rV   rT   r   rt   r   r=  r>  ru   c
                    ||n| j         j        }||n| j         j        }|	|	n| j         j        }	||t	          d          |+|                     ||           |                                }
n.||                                d d         }
nt	          d          |
\  }}||j        n|j        }|t          j	        |
|          }|gt          | j        d          r1| j        j        d d d |f         }|                    ||          }|}n!t          j        |
t          j        |          }|                     ||
          }|                     || j         j                  }|                     ||||          }t          | d          r|                     |          }|                     ||||||		          }|S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer"   z5You have to specify either input_ids or inputs_embeds)rx   rV   rw   )rs   rT   rV   rt   rq  )r   r   r   r=  r>  )r:   r   r=  use_return_dictr   %warn_if_padding_and_no_attention_maskrm   rx   r2   onesry   r~   rV   rk   rl   rn   get_extended_attention_maskget_head_maskr0   rq  rr  )rp   rs   r   rV   rT   r   rt   r   r=  r>  rz   r   r{   rx   r|   r}   extended_attention_maskr   s                     rL   r   zConvBertModel.forward   s    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"ZFCCCN!t(899 [*./*HKZK*X'3J3Q3QR\^h3i3i0!A!&[
SY!Z!Z!Z"&"B"B>S^"_"_&&y$+2OPP	l>iv ( 
 
 4-.. 	C 33MBBM2/!5# % 
 
 rr   )	NNNNNNNNN)r   r   r   rZ   rw  ry  r}  r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   ro  ro    sS       
 
 
 
 
/ / /0 0 0C C C  156:59371559,0/3&*< <E,-< !!23< !!12	<
 u/0< E-.<   12< $D>< 'tn< d^< 
u88	9< < < ^< < < < <rr   ro  c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )ConvBertGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                    t                                                       t          d          | _        t	          j        |j        |j                  | _        t	          j        |j	        |j                  | _
        d S )NgelurR   )rY   rZ   r   rc  r   rd   r]   re   r   r   r   ro   s     rL   rZ   z%ConvBertGeneratorPredictions.__init__c  sa    (00f&;AVWWWYv163HII


rr   generator_hidden_statesru   c                     |                      |          }|                     |          }|                     |          }|S r   )r   rc  rd   )rp   r  r   s      rL   r   z$ConvBertGeneratorPredictions.forwardj  s<    

#:;;66}55rr   )	r   r   r   r   rZ   r2   r   r   r   r   s   @rL   r  r  `  sk        KKJ J J J Ju/@ UEV        rr   r  c                   X    e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j
                 d	ee	j
                 d
ee	j                 dee	j                 dee	j
                 dee         dee         dee         deeef         fd            Z xZS )ConvBertForMaskedLMzgenerator.lm_head.weightc                 
   t                                          |           t          |          | _        t	          |          | _        t          j        |j        |j	                  | _
        |                                  d S r   )rY   rZ   ro  r   r  generator_predictionsr   r   r]   r\   generator_lm_headrs  ro   s     rL   rZ   zConvBertForMaskedLM.__init__v  sj       %f--%A&%I%I"!#6+@&BS!T!Trr   c                     | j         S r   r  rv  s    rL   get_output_embeddingsz)ConvBertForMaskedLM.get_output_embeddings  s    %%rr   c                     || _         d S r   r  )rp   r_   s     rL   set_output_embeddingsz)ConvBertForMaskedLM.set_output_embeddings  s    !0rr   Nrs   r   rV   rT   r   rt   labelsr   r=  r>  ru   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Pt          j                    } ||                    d| j         j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r"   r   losslogitsr   rD  )r:   r  r   r  r  r   r	   r   r\   r   r   rD  )rp   rs   r   rV   rT   r   rt   r  r   r=  r>  r  generator_sequence_outputprediction_scoresr  loss_fctr   s                    rL   r   zConvBertForMaskedLM.forward  s-   ( &1%<kk$+B]"&-- 
#
 
#
 %<A$>! 667PQQ 223DEE*,,H8-222t{7MNNPVP[P[\^P_P_``D 	F'),CABB,GGF)-)9TGf$$vE$1?.9	
 
 
 	
rr   
NNNNNNNNNN)r   r   r   _tied_weights_keysrZ   r  r  r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r  r  s^       45    & & &1 1 1  156:59371559-1,0/3&*4
 4
E,-4
 !!234
 !!12	4

 u/04
 E-.4
   124
 )*4
 $D>4
 'tn4
 d^4
 
un$	%4
 4
 4
 ^4
 4
 4
 4
 4
rr   r  c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )ConvBertClassificationHeadz-Head for sentence-level classification tasks.c                 B   t                                                       t          j        |j        |j                  | _        |j        |j        n|j        }t          j        |          | _	        t          j        |j        |j
                  | _        || _        d S r   )rY   rZ   r   r   r   r   classifier_dropoutrg   rf   rh   rb  out_projr:   rp   r:   r  rq   s      rL   rZ   z#ConvBertClassificationHead.__init__  s    Yv163EFF
)/)B)NF%%TZTn 	 z"455	&"4f6GHHrr   r   ru   c                 
   |d d dd d f         }|                      |          }|                     |          }t          | j        j                 |          }|                      |          }|                     |          }|S )Nr   )rh   r   r   r:   r  r  )rp   r   r   r   s       rL   r   z"ConvBertClassificationHead.forward  st    !!!Q'"LLOOJJqMM4;)*1--LLOOMM!rr   r   r   s   @rL   r  r    sd        77	 	 	 	 	U\         rr   r  z
    ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   F    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee	         dee	         dee	         de
eef         fd            Z xZS )!ConvBertForSequenceClassificationc                     t                                          |           |j        | _        || _        t	          |          | _        t          |          | _        |                                  d S r   )	rY   rZ   rb  r:   ro  r   r  
classifierrs  ro   s     rL   rZ   z*ConvBertForSequenceClassification.__init__  sb        +%f--4V<< 	rr   Nrs   r   rV   rT   r   rt   r  r   r=  r>  ru   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j	        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   rV   rT   r   rt   r   r=  r>  r   r   
regressionsingle_label_classificationmulti_label_classificationr"   r  )r:   r  r   r  problem_typerb  rX   r2   rn   r   r
   rm  r	   r   r   r   r   rD  rp   rs   r   rV   rT   r   rt   r  r   r=  r>  r   sequence_outputr  r  r  r   s                    rL   r   z)ConvBertForSequenceClassification.forward  s   ( &1%<kk$+B]--))%'/!5#   

 

 "!*11{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
rr   r  )r   r   r   rZ   r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r    sL             156:59371559-1,0/3&*D
 D
E,-D
 !!23D
 !!12	D

 u/0D
 E-.D
   12D
 )*D
 $D>D
 'tnD
 d^D
 
u..	/D
 D
 D
 ^D
 D
 D
 D
 D
rr   r  c                   F    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee	         dee	         dee	         de
eef         fd            Z xZS )ConvBertForMultipleChoicec                     t                                          |           t          |          | _        t	          |          | _        t          j        |j        d          | _	        | 
                                 d S )Nr   )rY   rZ   ro  r   rT  sequence_summaryr   r   r   r  rs  ro   s     rL   rZ   z"ConvBertForMultipleChoice.__init__1  sh       %f-- 7 ? ?)F$6:: 	rr   Nrs   r   rV   rT   r   rt   r  r   r=  r>  ru   c                    |
|
n| j         j        }
||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }|                    d|          }d}|t                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:


            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r"   r   r  r   r  )r:   r  rA   r   rm   r   r  r  r	   r   r   rD  )rp   rs   r   rV   rT   r   rt   r  r   r=  r>  num_choicesr   r  pooled_outputr  reshaped_logitsr  r  r   s                       rL   r   z!ConvBertForMultipleChoice.forward;  s-   Z &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqM[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 --))%'/!5#   

 

 "!*--o>>// ++b+66'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
rr   r  )r   r   r   rZ   r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r  /  sL             156:59371559-1,0/3&*Y
 Y
E,-Y
 !!23Y
 !!12	Y

 u/0Y
 E-.Y
   12Y
 )*Y
 $D>Y
 'tnY
 d^Y
 
u//	0Y
 Y
 Y
 ^Y
 Y
 Y
 Y
 Y
rr   r  c                   F    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee	         dee	         dee	         de
eef         fd            Z xZS )ConvBertForTokenClassificationc                 V   t                                          |           |j        | _        t          |          | _        |j        |j        n|j        }t          j        |          | _	        t          j
        |j        |j                  | _        |                                  d S r   )rY   rZ   rb  ro  r   r  rg   r   rf   rh   r   r   r  rs  r  s      rL   rZ   z'ConvBertForTokenClassification.__init__  s        +%f--)/)B)NF%%TZTn 	 z"455)F$68IJJ 	rr   Nrs   r   rV   rT   r   rt   r  r   r=  r>  ru   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j	        |j
                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r"   r   r  )r:   r  r   rh   r  r	   r   rb  r   r   rD  r  s                    rL   r   z&ConvBertForTokenClassification.forward  s   $ &1%<kk$+B]--))%'/!5#   

 

 "!*,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
rr   r  )r   r   r   rZ   r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r    s8             156:59371559-1,0/3&*2
 2
E,-2
 !!232
 !!12	2

 u/02
 E-.2
   122
 )*2
 $D>2
 'tn2
 d^2
 
u++	,2
 2
 2
 ^2
 2
 2
 2
 2
rr   r  c                   b    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee	         dee	         dee	         de
eef         fd            Z xZS )ConvBertForQuestionAnsweringc                     t                                          |           |j        | _        t          |          | _        t          j        |j        |j                  | _        | 	                                 d S r   )
rY   rZ   rb  ro  r   r   r   r   
qa_outputsrs  ro   s     rL   rZ   z%ConvBertForQuestionAnswering.__init__  se        +%f--)F$68IJJ 	rr   Nrs   r   rV   rT   r   rt   start_positionsend_positionsr   r=  r>  ru   c                    ||n| j         j        }|                     |||||||	|
|	  	        }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d }||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|dd          z   }||f|z   n|S t          ||||j        |j                  S )	Nr  r   r   r"   r   )ignore_indexr!   )r  start_logits
end_logitsr   rD  )r:   r  r   r  splitrm  r   r  rm   clampr	   r   r   rD  )rp   rs   r   rV   rT   r   rt   r  r  r   r=  r>  r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rL   r   z$ConvBertForQuestionAnswering.forward  s    &1%<kk$+B]--))%'/!5#   

 

 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
rr   )NNNNNNNNNNN)r   r   r   rZ   r   r   r2   r   r   r   r   r   r   r   r   r   s   @rL   r  r    sM             156:593715596:48,0/3&*>
 >
E,->
 !!23>
 !!12	>

 u/0>
 E-.>
   12>
 "%"23>
   01>
 $D>>
 'tn>
 d^>
 
u22	3>
 >
 >
 ^>
 >
 >
 >
 >
rr   r  )	r  r  r  r  r  r!  ro  r   rM   )Ar   r   r'   operatorr   typingr   r   r   r2   r   torch.nnr   r	   r
   activationsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_convbertr   
get_loggerr   r%   rM   ModulerO   r   r   r   r   r   r   r  r  r!  r5  rO  rT  ro  r  r  r  r  r  r  r  __all__r   rr   rL   <module>r     s      				       , , , , , , , , , ,        A A A A A A A A A A 1 1 1 1 1 1 1 1 9 9 9 9 9 9                . - - - - - l l l l l l l l l l        3 2 2 2 2 2 
	H	%	%y y yx9 9 9 9 9 9 9 9x % % % % %o % % %8    bi   4~ ~ ~ ~ ~BI ~ ~ ~B       * * * * *	 * * *Z       ,    29   (    RY   &: : : : :. : : :z7
 7
 7
 7
 7
bi 7
 7
 7
t    bi   $` ` ` ` `bi ` ` `F X X X X X+ X X Xv    29   $ H
 H
 H
 H
 H
1 H
 H
 H
V       0   P
 P
 P
 P
 P
(? P
 P
 P
f e
 e
 e
 e
 e
 7 e
 e
 e
P B
 B
 B
 B
 B
%< B
 B
 B
J J
 J
 J
 J
 J
#: J
 J
 J
Z
 
 
rr   