
     `iR                        d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ  ej        e          Z G d dej                  Z  G d dej                  Z! G d dej"                  Z# G d dej                  Z$ G d dej                  Z% G d dej                  Z& G d dej                  Z' G d dej                  Z( G d dej                  Z) G d d ej                  Z* G d! d"ej                  Z+ G d# d$ej                  Z,e G d% d&e                      Z-e G d' d(e-                      Z.e G d) d*e-                      Z/ ed+,           G d- d.e-                      Z0e G d/ d0e-                      Z1e G d1 d2e-                      Z2e G d3 d4e-                      Z3g d5Z4dS )6zPyTorch SqueezeBert model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )SqueezeBertConfigc                   *     e Zd ZdZ fdZddZ xZS )SqueezeBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 8   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        |                     dt'          j        |j                                      d          d           d S )N)padding_idxepsposition_ids)r   F)
persistent)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormhidden_sizelayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandselfconfig	__class__s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/squeezebert/modeling_squeezebert.pyr!   zSqueezeBertEmbeddings.__init__0   s    !|F,=v?Tbhbuvvv#%<0NPVPe#f#f %'\&2H&J_%`%`" f&8f>STTTz&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
    Nc                    ||                                 }n|                                 d d         }|d         }|| j        d d d |f         }|+t          j        |t          j        | j        j                  }||                     |          }|                     |          }|                     |          }||z   |z   }	| 	                    |	          }	| 
                    |	          }	|	S )Nr   r   dtypedevice)sizer   r2   zeroslongr>   r&   r(   r*   r+   r0   )
r6   	input_idstoken_type_idsr   inputs_embedsinput_shape
seq_lengthr(   r*   
embeddingss
             r9   forwardzSqueezeBertEmbeddings.forward@   s     #..**KK',,..ss3K ^
,QQQ^<L!"[EJtO`OghhhN  00;;M"66|DD $ : :> J J"%88;PP
^^J//
\\*--
r:   )NNNN__name__
__module____qualname____doc__r!   rH   __classcell__r8   s   @r9   r   r   -   sR        QQ
 
 
 
 
        r:   r   c                   (     e Zd ZdZ fdZd Z xZS )MatMulWrapperz
    Wrapper for torch.matmul(). This makes flop-counting easier to implement. Note that if you directly call
    torch.matmul() in your code, the flop counter will typically ignore the flops of the matmul.
    c                 H    t                                                       d S N)r    r!   )r6   r8   s    r9   r!   zMatMulWrapper.__init___   s    r:   c                 ,    t          j        ||          S )a0  

        :param inputs: two torch tensors :return: matmul of these tensors

        Here are the typical dimensions found in BERT (the B is optional) mat1.shape: [B, <optional extra dims>, M, K]
        mat2.shape: [B, <optional extra dims>, K, N] output shape: [B, <optional extra dims>, M, N]
        )r2   matmul)r6   mat1mat2s      r9   rH   zMatMulWrapper.forwardb   s     |D$'''r:   rI   rO   s   @r9   rQ   rQ   Y   sQ         
    ( ( ( ( ( ( (r:   rQ   c                        e Zd ZdZddZd ZdS )SqueezeBertLayerNormz
    This is a nn.LayerNorm subclass that accepts NCW data layout and performs normalization in the C dimension.

    N = batch C = channels W = sequence length
    -q=c                 J    t           j                            | ||           d S )N)normalized_shaper   )r   r+   r!   )r6   r,   r   s      r9   r!   zSqueezeBertLayerNorm.__init__t   s%    
d[cJJJJJr:   c                     |                     ddd          }t          j                            | |          }|                     ddd          S )Nr      r   )permuter   r+   rH   )r6   xs     r9   rH   zSqueezeBertLayerNorm.forwardw   sD    IIaAL  q))yyAq!!!r:   N)rZ   )rJ   rK   rL   rM   r!   rH    r:   r9   rY   rY   m   sE         K K K K" " " " "r:   rY   c                   (     e Zd ZdZ fdZd Z xZS )ConvDropoutLayerNormz8
    ConvDropoutLayerNorm: Conv, Dropout, LayerNorm
    c                     t                                                       t          j        ||d|          | _        t          |          | _        t          j        |          | _        d S Nr   in_channelsout_channelskernel_sizegroups)	r    r!   r   Conv1dconv1drY   	layernormr.   r0   )r6   cincoutrj   dropout_probr8   s        r9   r!   zConvDropoutLayerNorm.__init__   sY    iCdPQZ`aaa-d33z,//r:   c                     |                      |          }|                     |          }||z   }|                     |          }|S rS   )rl   r0   rm   )r6   hidden_statesinput_tensorr`   s       r9   rH   zConvDropoutLayerNorm.forward   sB    KK&&LLOONN1r:   rI   rO   s   @r9   rc   rc   }   sQ         0 0 0 0 0      r:   rc   c                   (     e Zd ZdZ fdZd Z xZS )ConvActivationz*
    ConvActivation: Conv, Activation
    c                     t                                                       t          j        ||d|          | _        t
          |         | _        d S re   )r    r!   r   rk   rl   r
   act)r6   rn   ro   rj   rw   r8   s        r9   r!   zConvActivation.__init__   sD    iCdPQZ`aaa#;r:   c                 V    |                      |          }|                     |          S rS   )rl   rw   )r6   r`   outputs      r9   rH   zConvActivation.forward   s#    Qxxr:   rI   rO   s   @r9   ru   ru      sQ             
             r:   ru   c                   8     e Zd Zd fd	Zd Zd Zd Zd Z xZS )SqueezeBertSelfAttentionr   c                 |   t                                                       ||j        z  dk    rt          d| d|j         d          |j        | _        t	          ||j        z            | _        | j        | j        z  | _        t          j        ||d|          | _	        t          j        ||d|          | _
        t          j        ||d|          | _        t          j        |j                  | _        t          j        d          | _        t#                      | _        t#                      | _        d	S )
z
        config = used for some things; ignored for others (work in progress...) cin = input channels = output channels
        groups = number of groups to use in conv1d layers
        r   zcin (z6) is not a multiple of the number of attention heads ()r   rf   r   dimN)r    r!   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   rk   querykeyvaluer.   attention_probs_dropout_probr0   SoftmaxsoftmaxrQ   	matmul_qk
matmul_qkv)r6   r7   rn   q_groupsk_groupsv_groupsr8   s         r9   r!   z!SqueezeBertSelfAttention.__init__   s%   
 	++q00pppSYSmppp   $*#= #&sV-G'G#H#H !58PPY3SaX`aaa
93AV^___Y3SaX`aaa
z&"EFFzb)))&'//r:   c                     |                                 d         | j        | j        |                                 d         f} |j        | }|                    dddd          S )z
        - input: [N, C, W]
        - output: [N, C1, W, C2] where C1 is the head index, and C2 is one head's contents
        r   r   r   r	   r^   )r?   r   r   viewr_   r6   r`   new_x_shapes      r9   transpose_for_scoresz-SqueezeBertSelfAttention.transpose_for_scores   s]    
 vvxx{D$<d>VXYX^X^X`X`acXdeAFK yyAq!$$$r:   c                     |                                 d         | j        | j        |                                 d         f} |j        | }|S )z
        - input: [N, C, W]
        - output: [N, C1, C2, W] where C1 is the head index, and C2 is one head's contents
        r   r   )r?   r   r   r   r   s      r9   transpose_key_for_scoresz1SqueezeBertSelfAttention.transpose_key_for_scores   sJ    
 vvxx{D$<d>VXYX^X^X`X`acXdeAFK r:   c                     |                     dddd                                          }|                                d         | j        |                                d         f} |j        | }|S )zE
        - input: [N, C1, W, C2]
        - output: [N, C, W]
        r   r   r	   r^   )r_   
contiguousr?   r   r   r   s      r9   transpose_outputz)SqueezeBertSelfAttention.transpose_output   sa    
 IIaAq!!,,..vvxx{D$6DAFK r:   c                 0   |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|                     |          }	|                     ||          }
|
t          j        | j                  z  }
|
|z   }
| 	                    |
          }| 
                    |          }|                     ||	          }|                     |          }d|i}|r|
|d<   |S )z
        expects hidden_states in [N, C, W] data layout.

        The attention_mask data layout is [N, W], and it does not need to be transposed.
        context_layerattention_score)r   r   r   r   r   r   mathsqrtr   r   r0   r   r   )r6   rr   attention_maskoutput_attentionsmixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerr   attention_probsr   results                 r9   rH   z SqueezeBertSelfAttention.forward   s    !JJ}55((=11 JJ}55//0ABB11/BB	//0ABB ..i@@)DId6N,O,OO)N: ,,77 ,,77EE--m<<!=1 	8(7F$%r:   )r   r   r   )	rJ   rK   rL   r!   r   r   r   rH   rN   rO   s   @r9   r{   r{      sy        * * * * * *0% % %    ! ! ! ! ! ! !r:   r{   c                   $     e Zd Z fdZd Z xZS )SqueezeBertModulec                    t                                                       |j        }|j        }|j        }|j        }t	          |||j        |j        |j                  | _        t          |||j
        |j                  | _        t          |||j        |j                  | _        t          |||j        |j                  | _        dS )a  
        - hidden_size = input chans = output chans for Q, K, V (they are all the same ... for now) = output chans for
          the module
        - intermediate_size = output chans for intermediate layer
        - groups = number of groups for all layers in the BertModule. (eventually we could change the interface to
          allow different groups for different layers)
        )r7   rn   r   r   r   )rn   ro   rj   rp   )rn   ro   rj   rw   N)r    r!   r,   intermediate_sizer{   r   r   r   	attentionrc   post_attention_groupsr/   post_attentionru   intermediate_groups
hidden_actintermediateoutput_groupsry   )r6   r7   c0c1c2c3r8   s         r9   r!   zSqueezeBertModule.__init__   s     	%1rFOfo`f`o
 
 
 3F$@vOi
 
 
 +r6C]cictuuu*F$8vGa
 
 
r:   c                     |                      |||          }|d         }|                     ||          }|                     |          }|                     ||          }d|i}	|r|d         |	d<   |	S )Nr   feature_mapr   )r   r   r   ry   )
r6   rr   r   r   attattention_outputpost_attention_outputintermediate_outputlayer_outputoutput_dicts
             r9   rH   zSqueezeBertModule.forward  s    nn]N<MNN/ $ 3 34Dm T T"//0EFF{{#68MNN$l3 	D-01B-CK)*r:   rJ   rK   rL   r!   rH   rN   rO   s   @r9   r   r      sG        
 
 
 
 
4      r:   r   c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )SqueezeBertEncoderc                     t                                                       j        j        k    s
J d            t	          j        fdt          j                  D                       | _        d S )NzIf you want embedding_size != intermediate hidden_size, please insert a Conv1d layer to adjust the number of channels before the first SqueezeBertModule.c              3   6   K   | ]}t                    V  d S rS   )r   ).0_r7   s     r9   	<genexpr>z.SqueezeBertEncoder.__init__.<locals>.<genexpr>.  s,      #g#g!$5f$=$=#g#g#g#g#g#gr:   )	r    r!   r$   r,   r   
ModuleListrangenum_hidden_layerslayersr5   s    `r9   r!   zSqueezeBertEncoder.__init__%  su    $(::::2 ;:: m#g#g#g#guVMeGfGf#g#g#gggr:   NFTc                 <   |d}n+|                     d           t          |          k    rd}nd}|du s
J d            |                    ddd          }|rdnd }|rdnd }	| j        D ]e}
|r4|                    ddd          }||fz  }|                    ddd          }|
                    |||          }|d         }|r|	|d	         fz  }	f|                    ddd          }|r||fz  }|st          d
 |||	fD                       S t          |||	          S )NTFzAhead_mask is not yet supported in the SqueezeBert implementation.r   r^   r   ra   r   r   c              3      K   | ]}||V  	d S rS   ra   )r   vs     r9   r   z-SqueezeBertEncoder.forward.<locals>.<genexpr>[  s(      hhqZ[ZgZgZgZgZghhr:   )last_hidden_staterr   
attentions)countlenr_   r   rH   tupler   )r6   rr   r   	head_maskr   output_hidden_statesreturn_dicthead_mask_is_all_noneall_hidden_statesall_attentionslayerr   s               r9   rH   zSqueezeBertEncoder.forward0  s    $(!!__T""c)nn44$(!!$)!$,,,.q,,, &--aA66"6@BBD0:d[ 	E 	EE# ? - 5 5aA > >!m%55! - 5 5aA > > ==HYZZL(7M  E<0A#B"DD &--aA66 	2-!11 	ihh]4E~$Vhhhhhh+;LYg
 
 
 	
r:   )NNFFTr   rO   s   @r9   r   r   $  sb        	h 	h 	h 	h 	h ".
 .
 .
 .
 .
 .
 .
 .
r:   r   c                   $     e Zd Z fdZd Z xZS )SqueezeBertPoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S rS   )r    r!   r   Linearr,   denseTanh
activationr5   s     r9   r!   zSqueezeBertPooler.__init__b  sC    Yv163EFF
'))r:   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   r   )r6   rr   first_token_tensorpooled_outputs       r9   rH   zSqueezeBertPooler.forwardg  s@     +111a40

#56666r:   r   rO   s   @r9   r   r   a  sG        $ $ $ $ $
      r:   r   c                   $     e Zd Z fdZd Z xZS )"SqueezeBertPredictionHeadTransformc                 V   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          j        |j        |j                  | _        d S )Nr   )r    r!   r   r   r,   r   
isinstancer   strr
   transform_act_fnr+   r-   r5   s     r9   r!   z+SqueezeBertPredictionHeadTransform.__init__q  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D!f&8f>STTTr:   c                     |                      |          }|                     |          }|                     |          }|S rS   )r   r   r+   r6   rr   s     r9   rH   z*SqueezeBertPredictionHeadTransform.forwardz  s=    

=11--m<<}55r:   r   rO   s   @r9   r   r   p  sL        U U U U U      r:   r   c                   ,     e Zd Z fdZddZd Z xZS )SqueezeBertLMPredictionHeadc                 >   t                                                       t          |          | _        t	          j        |j        |j        d          | _        t	          j	        t          j        |j                            | _        | j        | j        _        d S )NF)bias)r    r!   r   	transformr   r   r,   r#   decoder	Parameterr2   r@   r   r5   s     r9   r!   z$SqueezeBertLMPredictionHead.__init__  sz    ;FCC y!3V5FUSSSLV->!?!?@@	 !Ir:   returnNc                 (    | j         | j        _         d S rS   )r   r   r6   s    r9   _tie_weightsz(SqueezeBertLMPredictionHead._tie_weights  s     Ir:   c                 Z    |                      |          }|                     |          }|S rS   )r   r   r   s     r9   rH   z#SqueezeBertLMPredictionHead.forward  s*    }55]33r:   )r   N)rJ   rK   rL   r!   r   rH   rN   rO   s   @r9   r   r     s[        & & & & && & & &      r:   r   c                   $     e Zd Z fdZd Z xZS )SqueezeBertOnlyMLMHeadc                 p    t                                                       t          |          | _        d S rS   )r    r!   r   predictionsr5   s     r9   r!   zSqueezeBertOnlyMLMHead.__init__  s/    6v>>r:   c                 0    |                      |          }|S rS   )r   )r6   sequence_outputprediction_scoress      r9   rH   zSqueezeBertOnlyMLMHead.forward  s     ,,_==  r:   r   rO   s   @r9   r   r     sG        ? ? ? ? ?! ! ! ! ! ! !r:   r   c                   $    e Zd ZU eed<   dZd ZdS )SqueezeBertPreTrainedModelr7   transformerc                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS t          |t                    r |j	        j        
                                 dS dS )zInitialize the weightsg        )meanstdNg      ?)r   r   r   rk   weightdatanormal_r7   initializer_ranger   zero_r"   r   r+   fill_r   )r6   modules     r9   _init_weightsz(SqueezeBertPreTrainedModel._init_weights  s`   fry")455 	% M&&CT[5R&SSS{& &&((((( '&-- 	%M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	%K""$$$M$$S))))) ;<< 	%K""$$$$$	% 	%r:   N)rJ   rK   rL   r   __annotations__base_model_prefixr  ra   r:   r9   r  r    s7         %% % % % %r:   r  c                   <    e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 dee	j
                 d	ee	j
                 d
ee	j
                 dee	j                 dee         dee         dee         deeef         fd            Z xZS )SqueezeBertModelc                     t                                          |           t          |          | _        t	          |          | _        t          |          | _        |                                  d S rS   )	r    r!   r   rG   r   encoderr   pooler	post_initr5   s     r9   r!   zSqueezeBertModel.__init__  s`       /77)&11'// 	r:   c                     | j         j        S rS   rG   r&   r   s    r9   get_input_embeddingsz%SqueezeBertModel.get_input_embeddings  s    ..r:   c                     || j         _        d S rS   r  r6   new_embeddingss     r9   set_input_embeddingsz%SqueezeBertModel.set_input_embeddings  s    *8'''r:   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   prune_heads)r6   heads_to_pruner   headss       r9   _prune_headszSqueezeBertModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr:   NrB   r   rC   r   r   rD   r   r   r   r   c
                 >   ||n| j         j        }||n| j         j        }|	|	n| j         j        }	||t	          d          |+|                     ||           |                                }
n.||                                d d         }
nt	          d          ||j        n|j        }|t          j	        |
|          }|!t          j
        |
t          j        |          }|                     ||
          }|                     || j         j                  }|                     ||||          }|                     ||||||	          }|d         }|                     |          }|	s||f|d	d          z   S t%          |||j        |j        
          S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r>   r<   )rB   r   rC   rD   )rr   r   r   r   r   r   r   r   )r   pooler_outputrr   r   )r7   r   r   use_return_dictr   %warn_if_padding_and_no_attention_maskr?   r>   r2   onesr@   rA   get_extended_attention_maskget_head_maskr   rG   r  r  r   rr   r   )r6   rB   r   rC   r   r   rD   r   r   r   rE   r>   extended_attention_maskembedding_outputencoder_outputsr   r   s                    r9   rH   zSqueezeBertModel.forward  s    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU%.%:!!@T!"ZFCCCN!"[EJvVVVN"&"B"B>S^"_"_ &&y$+2OPP	??l>iv + 
 
 ,,*2/!5# ' 
 
 *!,O44 	J#]3oabb6III)-')7&1	
 
 
 	
r:   )	NNNNNNNNN)rJ   rK   rL   r!   r  r  r"  r   r   r2   TensorFloatTensorboolr   r   r   rH   rN   rO   s   @r9   r  r    sa           / / /9 9 9C C C  -11515/3,059,0/3&*A
 A
EL)A
 !.A
 !.	A

 u|,A
 EL)A
   12A
 $D>A
 'tnA
 d^A
 
u00	1A
 A
 A
 ^A
 A
 A
 A
 A
r:   r  c                   Z    e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 d	ee	j
                 d
ee	j
                 dee	j
                 dee	j
                 dee	j
                 dee         dee         dee         deeef         fd            Z xZS )SqueezeBertForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t                                          |           t          |          | _        t	          |          | _        |                                  d S rS   )r    r!   r  r  r   clsr  r5   s     r9   r!   zSqueezeBertForMaskedLM.__init__  sR       +F33)&11 	r:   c                 $    | j         j        j        S rS   )r3  r   r   r   s    r9   get_output_embeddingsz,SqueezeBertForMaskedLM.get_output_embeddings&  s    x#++r:   c                 T    || j         j        _        |j        | j         j        _        d S rS   )r3  r   r   r   r  s     r9   set_output_embeddingsz,SqueezeBertForMaskedLM.set_output_embeddings)  s%    '5$$2$7!!!r:   NrB   r   rC   r   r   rD   labelsr   r   r   r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }d}|Kt	                      } ||                    d| j         j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j        |j	                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   rC   r   r   rD   r   r   r   r   r   r^   losslogitsrr   r   )
r7   r%  r  r3  r   r   r#   r   rr   r   )r6   rB   r   rC   r   r   rD   r8  r   r   r   outputsr   r   masked_lm_lossloss_fctry   s                    r9   rH   zSqueezeBertForMaskedLM.forward-  s   ( &1%<kk$+B]""))%'/!5# # 

 

 "!* HH_55'))H%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
r:   
NNNNNNNNNN)rJ   rK   rL   _tied_weights_keysr!   r5  r7  r   r   r2   r-  r/  r   r   r   rH   rN   rO   s   @r9   r1  r1    sZ       :<Z[    , , ,8 8 8  -11515/3,004)-,0/3&*2
 2
EL)2
 !.2
 !.	2

 u|,2
 EL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
un$	%2
 2
 2
 ^2
 2
 2
 2
 2
r:   r1  z
    SqueezeBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   F    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee         dee         dee         de	e
ef         fd            Z xZS )$SqueezeBertForSequenceClassificationc                 N   t                                          |           |j        | _        || _        t	          |          | _        t          j        |j                  | _	        t          j
        |j        | j        j                  | _        |                                  d S rS   )r    r!   
num_labelsr7   r  r  r   r.   r/   r0   r   r,   
classifierr  r5   s     r9   r!   z-SqueezeBertForSequenceClassification.__init__j  s        ++F33z&"<==)F$68NOO 	r:   NrB   r   rC   r   r   rD   r8  r   r   r   r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j	        k    s|j        t          j
        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|
s|f|dd         z   }||f|z   n|S t!          |||j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr:  r   
regressionsingle_label_classificationmulti_label_classificationr   r^   r;  )r7   r%  r  r0   rH  problem_typerG  r=   r2   rA   r   r   squeezer   r   r   r   rr   r   )r6   rB   r   rC   r   r   rD   r8  r   r   r   r>  r   r=  r<  r@  ry   s                    r9   rH   z,SqueezeBertForSequenceClassification.forwardv  s   ( &1%<kk$+B]""))%'/!5# # 

 

  
]33//{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
r:   rA  )rJ   rK   rL   r!   r   r   r2   r-  r/  r   r   r   rH   rN   rO   s   @r9   rE  rE  c  sE       
 
 
 
 
  -11515/3,004)-,0/3&*F
 F
EL)F
 !.F
 !.	F

 u|,F
 EL)F
  -F
 &F
 $D>F
 'tnF
 d^F
 
u..	/F
 F
 F
 ^F
 F
 F
 F
 F
r:   rE  c                   F    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee         dee         dee         de	e
ef         fd            Z xZS )SqueezeBertForMultipleChoicec                    t                                          |           t          |          | _        t	          j        |j                  | _        t	          j        |j	        d          | _
        |                                  d S )Nr   )r    r!   r  r  r   r.   r/   r0   r   r,   rH  r  r5   s     r9   r!   z%SqueezeBertForMultipleChoice.__init__  sm       +F33z&"<==)F$6:: 	r:   NrB   r   rC   r   r   rD   r8  r   r   r   r   c                    |
|
n| j         j        }
||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }|                    d|          }d}|t                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        Nr   r   r:  r^   r;  )r7   r%  shaper   r?   r  r0   rH  r   r   rr   r   )r6   rB   r   rC   r   r   rD   r8  r   r   r   num_choicesr>  r   r=  reshaped_logitsr<  r@  ry   s                      r9   rH   z$SqueezeBertForMultipleChoice.forward  s-   X &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqM[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 ""))%'/!5# # 

 

  
]33// ++b+66'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
r:   rA  )rJ   rK   rL   r!   r   r   r2   r-  r/  r   r   r   rH   rN   rO   s   @r9   rP  rP    sE             -11515/3,004)-,0/3&*X
 X
EL)X
 !.X
 !.	X

 u|,X
 EL)X
  -X
 &X
 $D>X
 'tnX
 d^X
 
u//	0X
 X
 X
 ^X
 X
 X
 X
 X
r:   rP  c                   F    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee         dee         dee         de	e
ef         fd            Z xZS )!SqueezeBertForTokenClassificationc                 6   t                                          |           |j        | _        t          |          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S rS   )r    r!   rG  r  r  r   r.   r/   r0   r   r,   rH  r  r5   s     r9   r!   z*SqueezeBertForTokenClassification.__init__*  sz        ++F33z&"<==)F$68IJJ 	r:   NrB   r   rC   r   r   rD   r8  r   r   r   r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j	        |j
                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr:  r   r   r^   r;  )r7   r%  r  r0   rH  r   r   rG  r   rr   r   )r6   rB   r   rC   r   r   rD   r8  r   r   r   r>  r   r=  r<  r@  ry   s                    r9   rH   z)SqueezeBertForTokenClassification.forward5  s   $ &1%<kk$+B]""))%'/!5# # 

 

 "!*,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r:   rA  )rJ   rK   rL   r!   r   r   r2   r-  r/  r   r   r   rH   rN   rO   s   @r9   rX  rX  (  s1       	 	 	 	 	  -11515/3,004)-,0/3&*2
 2
EL)2
 !.2
 !.	2

 u|,2
 EL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
u++	,2
 2
 2
 ^2
 2
 2
 2
 2
r:   rX  c                   b    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee         dee         dee         de	e
ef         fd            Z xZS )SqueezeBertForQuestionAnsweringc                     t                                          |           |j        | _        t          |          | _        t          j        |j        |j                  | _        | 	                                 d S rS   )
r    r!   rG  r  r  r   r   r,   
qa_outputsr  r5   s     r9   r!   z(SqueezeBertForQuestionAnswering.__init__m  sf        ++F33)F$68IJJ 	r:   NrB   r   rC   r   r   rD   start_positionsend_positionsr   r   r   r   c                    ||n| j         j        }|                     |||||||	|
|	  	        }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d }||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|dd          z   }||f|z   n|S t          ||||j        |j                  S )	Nr:  r   r   r   r~   )ignore_indexr^   )r<  start_logits
end_logitsrr   r   )r7   r%  r  r^  splitrN  r   r   r?   clampr   r   rr   r   )r6   rB   r   rC   r   r   rD   r_  r`  r   r   r   r>  r   r=  rc  rd  
total_lossignored_indexr@  
start_lossend_lossry   s                          r9   rH   z'SqueezeBertForQuestionAnswering.forwardw  s    &1%<kk$+B]""))%'/!5# # 

 

 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r:   )NNNNNNNNNNN)rJ   rK   rL   r!   r   r   r2   r-  r/  r   r   r   rH   rN   rO   s   @r9   r\  r\  k  sE             -11515/3,0042604,0/3&*>
 >
EL)>
 !.>
 !.	>

 u|,>
 EL)>
  ->
 "%,/>
  ->
 $D>>
 'tn>
 d^>
 
u22	3>
 >
 >
 ^>
 >
 >
 >
 >
r:   r\  )r1  rP  r\  rE  rX  r  r   r  )5rM   r   typingr   r   r2   r   torch.nnr   r   r   activationsr
   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   configuration_squeezebertr   
get_loggerrJ   loggerModuler   rQ   r+   rY   rc   ru   r{   r   r   r   r   r   r   r  r  r1  rE  rP  rX  r\  __all__ra   r:   r9   <module>rv     s/   !    " " " " " " " "        A A A A A A A A A A ! ! ! ! ! !                  . - - - - -        9 8 8 8 8 8 
	H	%	%) ) ) ) )BI ) ) )X( ( ( ( (BI ( ( ((" " " " "2< " " "     29   (         RY      W W W W Wry W W Wt' ' ' ' '	 ' ' 'T:
 :
 :
 :
 :
 :
 :
 :
z    	          "    ")   .! ! ! ! !RY ! ! ! % % % % % % % %. [
 [
 [
 [
 [
1 [
 [
 [
| F
 F
 F
 F
 F
7 F
 F
 F
R   T
 T
 T
 T
 T
+E T
 T
 T
n d
 d
 d
 d
 d
#= d
 d
 d
N ?
 ?
 ?
 ?
 ?
(B ?
 ?
 ?
D J
 J
 J
 J
 J
&@ J
 J
 J
Z	 	 	r:   