
     `i(                        d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z%  ej&        e'          Z( G d dej)                  Z* G d dej)                  Z+ G d dej)                  Z, G d dej)                  Z- G d dej)                  Z. G d dej)                  Z/ G d dej)                  Z0 G d dej)                  Z1 G d d ej)                  Z2e G d! d"e                      Z3e G d# d$e3                      Z4e G d% d&e3                      Z5 G d' d(ej)                  Z6 ed)*           G d+ d,e3                      Z7e G d- d.e3                      Z8e G d/ d0e3                      Z9 G d1 d2ej)                  Z:e G d3 d4e3                      Z;d7d5Z<g d6Z=dS )8zPyTorch I-BERT model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )gelu))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )IBertConfig)IntGELUIntLayerNorm
IntSoftmaxQuantActQuantEmbeddingQuantLinearc                   2     e Zd ZdZ fdZ	 ddZd Z xZS )IBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t                                                       |j        | _        d| _        d| _        d| _        d| _        d| _        t          |j	        |j
        |j        | j        | j                  | _        t          |j        |j
        | j        | j                  | _        |                     dt!          j        |j                                      d          d	
           t)          |dd          | _        |j        | _        t          |j        |j
        | j        | j        | j                  | _        t1          | j        | j                  | _        t1          | j        | j                  | _        t7          |j
        |j        | j        | j        |j                  | _        t1          | j        | j                  | _        tA          j!        |j"                  | _#        d S )N             )padding_idx
weight_bit
quant_mode)r'   r(   position_ids)r   F)
persistentposition_embedding_typeabsoluter(   eps
output_bitr(   force_dequant)$super__init__r(   embedding_bitembedding_act_bitact_bitln_input_bitln_output_bitr   
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsregister_buffertorcharangemax_position_embeddingsexpandgetattrr,   r&   position_embeddingsr   embeddings_act1embeddings_act2r   layer_norm_epsr2   	LayerNormoutput_activationr   Dropouthidden_dropout_probdropoutselfconfig	__class__s     |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/ibert/modeling_ibert.pyr4   zIBertEmbeddings.__init__4   s    +!#-+) 
  
  
 &4"F$64CUbfbq&
 &
 &
"
 	EL)GHHOOPWXXej 	 	
 	
 	
 (/v7PR\']']$ ".#1*()$
 $
 $
   ((>4?[[['(>4?[[[ &%) .
 
 
 "*$,4?!S!S!Sz&"<==    Nr   c                    |F|/t          || j        |                              |j                  }n|                     |          }||                                }n|                                d d         }|+t          j        |t          j        | j	        j                  }|| 
                    |          \  }}nd }|                     |          \  }}	|                     ||||	          \  }
}| j        dk    r4|                     |          \  }}|                     |
|||          \  }
}|                     |
|          \  }
}|                     |
          }
|                     |
|          \  }
}|
|fS )Nr*   dtypedeviceidentityidentity_scaling_factorr-   )"create_position_ids_from_input_idsr&   torX   &create_position_ids_from_inputs_embedssizerA   zeroslongr)   r=   r?   rG   r,   rF   rJ   rN   rK   )rP   	input_idstoken_type_idsr)   inputs_embedspast_key_values_lengthinput_shapeinputs_embeds_scaling_factorr?   $token_type_embeddings_scaling_factor
embeddingsembeddings_scaling_factorrF   "position_embeddings_scaling_factors                 rS   forwardzIBertEmbeddings.forwardh   s    $At/1G   "Y%&&   $JJ=YY #..**KK',,..ss3K!"[EJtO`OghhhN :>:N:Ny:Y:Y7M77+/(FJF`F`aoFpFpCC040D0D(*$H	 1E 1
 1
-
- ':55FJF^F^_kFlFlC!C484H4H),(J	 5I 5 51J1 15zKd0e0e-
-\\*--
040F0FzSl0m0m-
-444rT   c                    |                                 dd         }|d         }t          j        | j        dz   || j        z   dz   t          j        |j                  }|                    d                              |          S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr*   r   rV   r   )r_   rA   rB   r&   ra   rX   	unsqueezerD   )rP   rd   rf   sequence_lengthr)   s        rS   r^   z6IBertEmbeddings.create_position_ids_from_inputs_embeds   s     $((**3B3/%a.|q /D4D"Dq"HPUPZcpcw
 
 
 %%a((//<<<rT   )NNNNr   )__name__
__module____qualname____doc__r4   rl   r^   __classcell__rR   s   @rS   r    r    /   sn         2> 2> 2> 2> 2>j rs-5 -5 -5 -5^= = = = = = =rT   r    c                   ,     e Zd Z fdZ	 	 	 ddZ xZS )IBertSelfAttentionc           	         t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        d| _        d| _        d| _	        |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          |j        | j        d| j        | j        | j        d	          | _        t          |j        | j        d| j        | j        | j        d	          | _        t          |j        | j        d| j        | j        | j        d	          | _        t#          | j	        | j        
          | _        t#          | j	        | j        
          | _        t#          | j	        | j        
          | _        t#          | j	        | j        
          | _        t-          j        |j                  | _        t5          |dd          | _        | j        dk    rt          d          t9          | j	        | j        |j                  | _        d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r"   r%   Tbiasr'   bias_bitr(   per_channelr.   r,   r-   zDI-BERT only supports 'absolute' for `config.position_embedding_type`r(   r2   )r3   r4   r;   num_attention_headshasattr
ValueErrorr(   r'   r}   r7   intattention_head_sizeall_head_sizer   querykeyvaluer   query_activationkey_activationvalue_activationrK   r   rL   attention_probs_dropout_probrN   rE   r,   r   r2   softmaxrO   s     rS   r4   zIBertSelfAttention.__init__   sO    ::a??PVXhHiHi?8F$6 8 8 48 8 8   !+#)#= #&v'9F<V'V#W#W !58PP !]
 
 

 ]
 
 
 !]
 
 

 !)$/ R R R&t|PPP ($/ R R R!)$,4?!S!S!Sz&"EFF'.v7PR\']']$':55cddd!$,4?Z`ZnooorT   NFc                     |                      ||          \  }}|                     ||          \  }}	|                     ||          \  }
}|                     ||          \  }}|                     ||	          \  }}|                     |
|          \  }}|j        \  }}}|                    |d| j        | j	                  
                    dd          }|                    |d| j        | j	                  
                    dd          }|                    |d| j        | j	                  
                    dd          }t          j        ||
                    dd                    }t          j        | j	                  }||z  }| j        r	||z  |z  }nd }|||z   }|                     ||          \  }}|                     |          }|||z  }t          j        ||          }|||z  }nd }|                    dddd                                          }|                                d d         | j        fz   } |j        | }|                     ||          \  }}|r||fn|f}|r||fn|f}||fS )Nr*   r      r   r	   )r   r   r   r   r   r   shapeviewr   r   	transposerA   matmulmathsqrtr(   r   rN   permute
contiguousr_   r   rK   )rP   hidden_stateshidden_states_scaling_factorattention_mask	head_maskoutput_attentionsmixed_query_layer mixed_query_layer_scaling_factormixed_key_layermixed_key_layer_scaling_factormixed_value_layer mixed_value_layer_scaling_factorquery_layerquery_layer_scaling_factor	key_layerkey_layer_scaling_factorvalue_layervalue_layer_scaling_factor
batch_size
seq_length_attention_scoresscaleattention_scores_scaling_factorattention_probsattention_probs_scaling_factorcontext_layercontext_layer_scaling_factornew_context_layer_shapeoutputsoutput_scaling_factors                                  rS   rl   zIBertSelfAttention.forward   s    ?CjjXt>u>u;;:>((=Rn:o:o77>BjjXt>u>u;; 372G2G?3
 3
// /3.A.A/Sq.r.r+	+262G2G?3
 3
//
 %2$7!
J!&&z2t7OQUQijjttq
 
 NN:r43KTMeffppqrtuvv	!&&z2t7OQUQijjttq
 

 !<Y5H5HR5P5PQQ	$233+e3? 	3.HKc.cfk.k++.2+%/.@ ;?,,=;
 ;
77 ,,77  -	9O_kBB)5+ILf+f((+/(%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CD 7;6L6L77
 7
33 7H]=/22mM] !1)+IJJ.0 	 ---rT   NNFrp   rq   rr   r4   rl   rt   ru   s   @rS   rw   rw      se        8p 8p 8p 8p 8p| P. P. P. P. P. P. P. P.rT   rw   c                   $     e Zd Z fdZd Z xZS )IBertSelfOutputc           	      .   t                                                       |j        | _        d| _        d| _        d| _        d| _        d| _        t          |j	        |j	        d| j        | j        | j        d          | _
        t          | j        | j                  | _        t          |j	        |j        | j        | j        |j                  | _        t          | j        | j                  | _        t%          j        |j                  | _        d S Nr"   r%   r$   Tr{   r.   r/   )r3   r4   r(   r7   r'   r}   r8   r9   r   r;   denser   ln_input_actr   rI   r2   rJ   rK   r   rL   rM   rN   rO   s     rS   r4   zIBertSelfOutput.__init__8  s    + ]
 
 

 %T%64?SSS%%) .
 
 
 "*$,4?!S!S!Sz&"<==rT   c                    |                      ||          \  }}|                     |          }|                     ||||          \  }}|                     ||          \  }}|                     ||          \  }}||fS NrY   r   rN   r   rJ   rK   rP   r   r   input_tensorinput_tensor_scaling_factors        rS   rl   zIBertSelfOutput.forwardU      6:jjPl6m6m33]336:6G6G(!$?	 7H 7
 7
33 7;nn]Tp6q6q336:6L6L77
 7
33 :::rT   r   ru   s   @rS   r   r   7  G        > > > > >:; ; ; ; ; ; ;rT   r   c                   2     e Zd Z fdZd Z	 	 	 ddZ xZS )IBertAttentionc                     t                                                       |j        | _        t          |          | _        t          |          | _        t                      | _        d S N)	r3   r4   r(   rw   rP   r   outputsetpruned_headsrO   s     rS   r4   zIBertAttention.__init__g  sS     +&v..	%f--EErT   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   dim)lenr   rP   r   r   r   r   r   r   r   r   r   r   union)rP   headsindexs      rS   prune_headszIBertAttention.prune_headsn  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::rT   NFc                     |                      |||||          \  }}|                     |d         |d         ||          \  }}	|f|dd          z   }
|	f|dd          z   }|
|fS )Nr   r   )rP   r   )rP   r   r   r   r   r   self_outputsself_outputs_scaling_factorattention_outputattention_output_scaling_factorr   outputs_scaling_factors               rS   rl   zIBertAttention.forward  s     59II(5
 5
11 =AKKO8;]Lh=
 =
99 $%QRR(88"A!CFabcbdbdFe!e...rT   r   )rp   rq   rr   r4   r   rl   rt   ru   s   @rS   r   r   f  sf        " " " " "; ; ;, / / / / / / / /rT   r   c                   $     e Zd Z fdZd Z xZS )IBertIntermediatec           	         t                                                       |j        | _        d| _        d| _        d| _        t          |j        |j        d| j        | j        | j        d          | _	        |j
        dk    rt          d          t          | j        |j                  | _        t          | j        | j                  | _        d S )	Nr"   r%   Tr{   r
   z3I-BERT only supports 'gelu' for `config.hidden_act`r   r.   )r3   r4   r(   r7   r'   r}   r   r;   intermediate_sizer   
hidden_actr   r   r2   intermediate_act_fnr   rK   rO   s     rS   r4   zIBertIntermediate.__init__  s     + $]
 
 

 &&RSSS#*doU[Ui#j#j#j !)$,4?!S!S!SrT   c                     |                      ||          \  }}|                     ||          \  }}|                     ||          \  }}||fS r   )r   r   rK   )rP   r   r   s      rS   rl   zIBertIntermediate.forward  sn    6:jjPl6m6m336:6N6N77
 7
33
 7;6L6L77
 7
33 :::rT   r   ru   s   @rS   r   r     sL        T T T T T(
; 
; 
; 
; 
; 
; 
;rT   r   c                   $     e Zd Z fdZd Z xZS )IBertOutputc           	      .   t                                                       |j        | _        d| _        d| _        d| _        d| _        d| _        t          |j	        |j
        d| j        | j        | j        d          | _        t          | j        | j                  | _        t          |j
        |j        | j        | j        |j                  | _        t          | j        | j                  | _        t'          j        |j                  | _        d S r   )r3   r4   r(   r7   r'   r}   r8   r9   r   r   r;   r   r   r   r   rI   r2   rJ   rK   r   rL   rM   rN   rO   s     rS   r4   zIBertOutput.__init__  s    + $]
 
 

 %T%64?SSS%%) .
 
 
 "*$,4?!S!S!Sz&"<==rT   c                    |                      ||          \  }}|                     |          }|                     ||||          \  }}|                     ||          \  }}|                     ||          \  }}||fS r   r   r   s        rS   rl   zIBertOutput.forward  r   rT   r   ru   s   @rS   r   r     r   rT   r   c                   2     e Zd Z fdZ	 	 	 ddZd Z xZS )
IBertLayerc                 t   t                                                       |j        | _        d| _        d| _        t          |          | _        t          |          | _        t          |          | _
        t          | j        | j                  | _        t          | j        | j                  | _        d S )Nr"   r   r.   )r3   r4   r(   r7   seq_len_dimr   	attentionr   intermediater   r   r   pre_intermediate_actpre_output_actrO   s     rS   r4   zIBertLayer.__init__  s     +'//-f55!&))$,T\do$V$V$V!&t|PPPrT   NFc                     |                      |||||          \  }}|d         }|d         }	|dd          }
|                     ||	          \  }}|f|
z   }
|
S )N)r   r   r   )r   feed_forward_chunk)rP   r   r   r   r   r   self_attention_outputs%self_attention_outputs_scaling_factorr   r   r   layer_outputlayer_output_scaling_factors                rS   rl   zIBertLayer.forward  s     IM(/ IW I
 I
E E 2!4*OPQ*R'(,484K4K=5
 5
11  /G+rT   c                     |                      ||          \  }}|                     ||          \  }}|                     ||          \  }}|                     ||||          \  }}||fS r   )r   r   r   r   )rP   r   r   intermediate_output"intermediate_output_scaling_factorr   r   s          rS   r   zIBertLayer.feed_forward_chunk  s    <@<U<U==
 =
99 CGBSBS=C
 C
?? CGBUBU!CC
 C
?? 59KK!CEUWv5
 5
11 888rT   r   )rp   rq   rr   r4   rl   r   rt   ru   s   @rS   r   r     sk        Q Q Q Q Q"    69 9 9 9 9 9 9rT   r   c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )IBertEncoderc                     t                                                       | _        j        | _        t	          j        fdt          j                  D                       | _        d S )Nc                 .    g | ]}t                    S  )r   ).0r   rQ   s     rS   
<listcomp>z)IBertEncoder.__init__.<locals>.<listcomp>'  s!    #`#`#`1Jv$6$6#`#`#`rT   )	r3   r4   rQ   r(   r   
ModuleListrangenum_hidden_layerslayerrO   s    `rS   r4   zIBertEncoder.__init__#  s`     +]#`#`#`#`fF^@_@_#`#`#`aa


rT   NFTc                 8   |rdnd }|rdnd }	d }
t          | j                  D ]>\  }}|r||fz   }|||         nd } ||||||          }|d         }|r|	|d         fz   }	?|r||fz   }|st          d |||	|
fD                       S t          |||	|
          S )Nr   r   r   c              3      K   | ]}||V  	d S r   r   )r   vs     rS   	<genexpr>z'IBertEncoder.forward.<locals>.<genexpr>M  s4       	 	 =  !===	 	rT   )last_hidden_stater   
attentionscross_attentions)	enumerater  tupler   )rP   r   r   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_head_masklayer_outputss                  rS   rl   zIBertEncoder.forward)  s;    #7@BBD$5?bb4#(44 	P 	POA|# I$58H$H!.7.CillO(L,! M *!,M  P&9]1=M<O&O# 	E 1]4D D 
	 	 	 "%'(		 	 	 	 	 	 9++*1	
 
 
 	
rT   )NNFFTr   ru   s   @rS   r   r   "  sb        b b b b b "3
 3
 3
 3
 3
 3
 3
 3
rT   r   c                   $     e Zd Z fdZd Z xZS )IBertPoolerc                     t                                                       |j        | _        t          j        |j        |j                  | _        t          j                    | _        d S r   )	r3   r4   r(   r   Linearr;   r   Tanh
activationrO   s     rS   r4   zIBertPooler.__init__`  sM     +Yv163EFF
'))rT   c                 r    |d d df         }|                      |          }|                     |          }|S Nr   )r   r  )rP   r   first_token_tensorpooled_outputs       rS   rl   zIBertPooler.forwardf  s@     +111a40

#56666rT   r   ru   s   @rS   r  r  _  sG        $ $ $ $ $      rT   r  c                   ,    e Zd ZU eed<   dZd ZddZdS )IBertPreTrainedModelrQ   ibertc                    t          |t          t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          t          j        f          r_|j        j                            d| j        j                   |j        +|j        j        |j                 
                                 dS dS t          |t          t          j        f          r?|j	        j        
                                 |j        j                            d           dS t          |t"                    r |j	        j        
                                 dS dS )zInitialize the weightsg        )meanstdNg      ?)
isinstancer   r   r  weightdatanormal_rQ   initializer_ranger|   zero_r   	Embeddingr&   r   rJ   fill_IBertLMHead)rP   modules     rS   _init_weightsz"IBertPreTrainedModel._init_weightst  sg   f{BI677 	% M&&CT[5R&SSS{& &&((((( '& >?? 	%M&&CT[5R&SSS!-"6#56<<>>>>> .-r| <== 	%K""$$$M$$S))))),, 	%K""$$$$$	% 	%rT   Nc                      t          d          )Nz6`resize_token_embeddings` is not supported for I-BERT.)NotImplementedError)rP   new_num_tokenss     rS   resize_token_embeddingsz,IBertPreTrainedModel.resize_token_embeddings  s    !"Z[[[rT   r   )rp   rq   rr   r   __annotations__base_model_prefixr/  r3  r   rT   rS   r   r   o  sQ         % % %$\ \ \ \ \ \rT   r   c                   X    e Zd ZdZd fd	Zd Zd Zd Ze	 	 	 	 	 	 	 	 	 dde	e
j                 d	e	e
j                 d
e	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e         de	e         de	e         deeee
j                 f         fd            Z xZS )
IBertModela  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    Tc                    t                                          |           || _        |j        | _        t	          |          | _        t          |          | _        |rt          |          nd| _	        | 
                                 dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r3   r4   rQ   r(   r    ri   r   encoderr  pooler	post_init)rP   rQ   add_pooling_layerrR   s      rS   r4   zIBertModel.__init__  s}    
 	    +)&11#F++->Hk&)))D 	rT   c                     | j         j        S r   ri   r=   rP   s    rS   get_input_embeddingszIBertModel.get_input_embeddings  s    ..rT   c                     || j         _        d S r   r>  )rP   r   s     rS   set_input_embeddingszIBertModel.set_input_embeddings  s    */'''rT   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr9  r  r   r   )rP   heads_to_pruner  r   s       rS   _prune_headszIBertModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	CrT   Nrb   r   rc   r)   r   rd   r   r  r  returnc
           	      r   ||n| j         j        }||n| j         j        }|	|	n| j         j        }	||t	          d          |+|                     ||           |                                }
n.||                                d d         }
nt	          d          |
\  }}||j        n|j        }|t          j	        ||f|          }|!t          j
        |
t          j        |          }|                     ||
          }|                     || j         j                  }|                     ||||          \  }}|                     |||||||	          }|d         }| j        |                     |          nd }|	s||f|d	d          z   S t%          |||j        |j        |j        
          S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer*   z5You have to specify either input_ids or inputs_embeds)rX   rV   )rb   r)   rc   rd   )r   r   r   r  r  r   r   )r  pooler_outputr   r  r	  )rQ   r   r  use_return_dictr   %warn_if_padding_and_no_attention_maskr_   rX   rA   onesr`   ra   get_extended_attention_maskget_head_maskr  ri   r9  r:  r   r   r  r	  )rP   rb   r   rc   r)   r   rd   r   r  r  rf   r   r   rX   extended_attention_maskembedding_outputembedding_output_scaling_factorencoder_outputssequence_outputr  s                       rS   rl   zIBertModel.forward  s*    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"Z*j)A6RRRN!"[EJvVVVN 150P0PQ_al0m0m &&y$+2OPP	<@OO%)'	 =L =
 =
99 ,,+2/!5# ' 
 
 *!,8<8OO444UY 	J#]3oabb6III;-')7&1,=
 
 
 	
rT   )T)	NNNNNNNNN)rp   rq   rr   rs   r4   r@  rB  rF  r   r   rA   
LongTensorFloatTensorboolr   r   r  rl   rt   ru   s   @rS   r7  r7    s}             "/ / /0 0 0C C C  156:59371559,0/3&*J
 J
E,-J
 !!23J
 !!12	J

 u/0J
 E-.J
   12J
 $D>J
 'tnJ
 d^J
 
;U5CT=UU	VJ
 J
 J
 ^J
 J
 J
 J
 J
rT   r7  c                   p    e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee	j
                 dee	j                 dee	j                 dee	j
                 dee         dee         dee         deeee	j                 f         fd            Z xZS )IBertForMaskedLMzlm_head.decoder.biaszlm_head.decoder.weightc                     t                                          |           t          |d          | _        t	          |          | _        |                                  d S NF)r<  )r3   r4   r7  r!  r-  lm_headr;  rO   s     rS   r4   zIBertForMaskedLM.__init__  sV       %@@@
"6** 	rT   c                     | j         j        S r   )r[  decoderr?  s    rS   get_output_embeddingsz&IBertForMaskedLM.get_output_embeddings  s    |##rT   c                 @    || j         _        |j        | j         _        d S r   )r[  r]  r|   )rP   new_embeddingss     rS   set_output_embeddingsz&IBertForMaskedLM.set_output_embeddings  s    -*/rT   Nrb   r   rc   r)   r   rd   labelsr   r  r  rG  c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }d}|Kt	                      } ||                    d| j         j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j        |j	                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   rc   r)   r   rd   r   r  r  r   r*   r   losslogitsr   r  )
rQ   rJ  r!  r[  r   r   r:   r   r   r  )rP   rb   r   rc   r)   r   rd   rb  r   r  r  r   rS  prediction_scoresmasked_lm_lossloss_fctr   s                    rS   rl   zIBertForMaskedLM.forward  s   ( &1%<kk$+B]**))%'/!5#  

 

 "!* LL99'))H%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
rT   
NNNNNNNNNN)rp   rq   rr   _tied_weights_keysr4   r^  ra  r   r   rA   rT  rU  rV  r   r   r  rl   rt   ru   s   @rS   rX  rX    si       02JK    $ $ $0 0 0  156:59371559-1,0/3&*1
 1
E,-1
 !!231
 !!12	1

 u/01
 E-.1
   121
 )*1
 $D>1
 'tn1
 d^1
 
~uU%677	81
 1
 1
 ^1
 1
 1
 1
 1
rT   rX  c                   0     e Zd ZdZ fdZd ZddZ xZS )r-  z)I-BERT Head for masked language modeling.c                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j	                  | _
        t          j        t          j        |j	                            | _        | j        | j
        _        d S )N)r0   )r3   r4   r   r  r;   r   rJ   rI   
layer_normr:   r]  	ParameterrA   r`   r|   rO   s     rS   r4   zIBertLMHead.__init__N  s    Yv163EFF
,v'9v?TUUUy!3V5FGGLV->!?!?@@	 IrT   c                     |                      |          }t          |          }|                     |          }|                     |          }|S r   )r   r
   ro  r]  )rP   featureskwargsxs       rS   rl   zIBertLMHead.forwardW  sE    JJx  GGOOA LLOOrT   rG  Nc                     | j         j        j        j        dk    r| j        | j         _        d S | j         j        | _        d S )Nmeta)r]  r|   rX   typer?  s    rS   _tie_weightszIBertLMHead._tie_weightsa  s<    <#(F22 $	DL )DIIIrT   )rG  N)rp   rq   rr   rs   r4   rl   rx  rt   ru   s   @rS   r-  r-  K  sa        33& & & & &  * * * * * * * *rT   r-  z
    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                   \    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee	         dee	         dee	         de
eeej                 f         fd            Z xZS )IBertForSequenceClassificationc                     t                                          |           |j        | _        t          |d          | _        t          |          | _        |                                  d S rZ  )r3   r4   
num_labelsr7  r!  IBertClassificationHead
classifierr;  rO   s     rS   r4   z'IBertForSequenceClassification.__init__q  s`        +%@@@
1&99 	rT   Nrb   r   rc   r)   r   rd   rb  r   r  r  rG  c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j	        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|
s|f|d	d         z   }||f|z   n|S t          |||j        |j        
          S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nrd  r   r   
regressionsingle_label_classificationmulti_label_classificationr*   r   re  )rQ   rJ  r!  r  problem_typer}  rW   rA   ra   r   r   squeezer   r   r   r   r   r  rP   rb   r   rc   r)   r   rd   rb  r   r  r  r   rS  rg  rf  rj  r   s                    rS   rl   z&IBertForSequenceClassification.forward{  s   ( &1%<kk$+B]**))%'/!5#  

 

 "!*11{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
rT   rk  )rp   rq   rr   r4   r   r   rA   rT  rU  rV  r   r   r  rl   rt   ru   s   @rS   r{  r{  j  sT             156:59371559-1,0/3&*B
 B
E,-B
 !!23B
 !!12	B

 u/0B
 E-.B
   12B
 )*B
 $D>B
 'tnB
 d^B
 
'u/@)AA	BB
 B
 B
 ^B
 B
 B
 B
 B
rT   r{  c                   \    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee	         dee	         dee	         de
eeej                 f         fd            Z xZS )IBertForMultipleChoicec                    t                                          |           t          |          | _        t	          j        |j                  | _        t	          j        |j	        d          | _
        |                                  d S )Nr   )r3   r4   r7  r!  r   rL   rM   rN   r  r;   r  r;  rO   s     rS   r4   zIBertForMultipleChoice.__init__  sl       ''
z&"<==)F$6:: 	rT   Nrb   rc   r   rb  r)   r   rd   r   r  r  rG  c                    |
|
n| j         j        }
||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }|                    d|          }d}|t                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr   r*   r   )r)   rc   r   r   rd   r   r  r  r   re  )rQ   rJ  r   r   r_   r!  rN   r  r   r   r   r  )rP   rb   rc   r   rb  r)   r   rd   r   r  r  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r  rg  reshaped_logitsrf  rj  r   s                           rS   rl   zIBertForMultipleChoice.forward  s.   X &1%<kk$+B],5,Aioa((}GZ[\G]CLCXINN2,>,>???^bLXLdL--b,2C2CB2G2GHHHjnR`Rln11"n6I6I"6M6MNNNrvR`Rln11"n6I6I"6M6MNNNrv ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 ***..,/!5#  

 

  
]33// ++b+66'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
rT   rk  )rp   rq   rr   r4   r   r   rA   rT  rU  rV  r   r   r  rl   rt   ru   s   @rS   r  r    sT             15596:-1371559,0/3&*W
 W
E,-W
 !!12W
 !!23	W

 )*W
 u/0W
 E-.W
   12W
 $D>W
 'tnW
 d^W
 
(%0A*BB	CW
 W
 W
 ^W
 W
 W
 W
 W
rT   r  c                   \    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
ee	         dee	         dee	         de
eeej                 f         fd            Z xZS )IBertForTokenClassificationc                 :   t                                          |           |j        | _        t          |d          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S rZ  )r3   r4   r}  r7  r!  r   rL   rM   rN   r  r;   r  r;  rO   s     rS   r4   z$IBertForTokenClassification.__init__*  s~        +%@@@
z&"<==)F$68IJJ 	rT   Nrb   r   rc   r)   r   rd   rb  r   r  r  rG  c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j	        |j
                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nrd  r   r*   r   re  )rQ   rJ  r!  rN   r  r   r   r}  r   r   r  r  s                    rS   rl   z#IBertForTokenClassification.forward5  s   $ &1%<kk$+B]**))%'/!5#  

 

 "!*,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
rT   rk  )rp   rq   rr   r4   r   r   rA   rT  rU  rV  r   r   r  rl   rt   ru   s   @rS   r  r  (  s@       	 	 	 	 	  156:59371559-1,0/3&*2
 2
E,-2
 !!232
 !!12	2

 u/02
 E-.2
   122
 )*2
 $D>2
 'tn2
 d^2
 
$eE,=&>>	?2
 2
 2
 ^2
 2
 2
 2
 2
rT   r  c                   (     e Zd ZdZ fdZd Z xZS )r~  z-Head for sentence-level classification tasks.c                    t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        t          j        |j        |j	                  | _
        d S r   )r3   r4   r   r  r;   r   rL   rM   rN   r}  out_projrO   s     rS   r4   z IBertClassificationHead.__init__n  sc    Yv163EFF
z&"<==	&"4f6GHHrT   c                     |d d dd d f         }|                      |          }|                     |          }t          j        |          }|                      |          }|                     |          }|S r  )rN   r   rA   tanhr  )rP   rr  rs  r   s       rS   rl   zIBertClassificationHead.forwardt  ss     Aqqq)]33

=11
=11]33m44rT   )rp   rq   rr   rs   r4   rl   rt   ru   s   @rS   r~  r~  k  sR        77I I I I I      rT   r~  c                   x    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee	         dee	         dee	         de
eeej                 f         fd            Z xZS )IBertForQuestionAnsweringc                     t                                          |           |j        | _        t          |d          | _        t          j        |j        |j                  | _        | 	                                 d S rZ  )
r3   r4   r}  r7  r!  r   r  r;   
qa_outputsr;  rO   s     rS   r4   z"IBertForQuestionAnswering.__init__  sj        +%@@@
)F$68IJJ 	rT   Nrb   r   rc   r)   r   rd   start_positionsend_positionsr   r  r  rG  c                    ||n| j         j        }|                     |||||||	|
|	  	        }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d }||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|dd          z   }||f|z   n|S t          ||||j        |j                  S )	Nrd  r   r   r*   r   )ignore_indexr   )rf  start_logits
end_logitsr   r  )rQ   rJ  r!  r  splitr  r   r   r_   clampr   r   r   r  )rP   rb   r   rc   r)   r   rd   r  r  r   r  r  r   rS  rg  r  r  
total_lossignored_indexrj  
start_lossend_lossr   s                          rS   rl   z!IBertForQuestionAnswering.forward  s    &1%<kk$+B]**))%'/!5#  

 

 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
rT   )NNNNNNNNNNN)rp   rq   rr   r4   r   r   rA   rT  rU  rV  r   r   r  rl   rt   ru   s   @rS   r  r  ~  sU             156:593715596:48,0/3&*>
 >
E,->
 !!23>
 !!12	>

 u/0>
 E-.>
   12>
 "%"23>
   01>
 $D>>
 'tn>
 d^>
 
+U53D-EE	F>
 >
 >
 ^>
 >
 >
 >
 >
rT   r  c                     |                      |                                          }t          j        |d                              |          |z   |z  }|                                |z   S )aM  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's *utils.make_positions*.

    Args:
    input_ids (`torch.LongTensor`):
           Indices of input sequence tokens in the vocabulary.

    Returns: torch.Tensor
    r   r   )ner   rA   cumsumtype_asra   )rb   r&   re   maskincremental_indicess        rS   r\   r\     sg     <<$$((**D <!444<<TBBE[[_cc##%%33rT   )rX  r  r  r{  r  r7  r   )r   )>rs   r   typingr   r   rA   r   torch.nnr   r   r   activationsr
   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   configuration_ibertr   quant_modulesr   r   r   r   r   r   
get_loggerrp   loggerModuler    rw   r   r   r   r   r   r   r  r   r7  rX  r-  r{  r  r  r~  r  r\   __all__r   rT   rS   <module>r     sa  $    " " " " " " " "        A A A A A A A A A A                        . - - - - - Q Q Q Q Q Q Q Q , , , , , , , , , , , , , , c c c c c c c c c c c c c c c c 
	H	%	%w= w= w= w= w=bi w= w= w=tK. K. K. K. K. K. K. K.\,; ,; ,; ,; ,;bi ,; ,; ,;^./ ./ ./ ./ ./RY ./ ./ ./b; ; ; ; ;	 ; ; ;D,; ,; ,; ,; ,;") ,; ,; ,;^79 79 79 79 79 79 79 79t:
 :
 :
 :
 :
29 :
 :
 :
z    ")     \ \ \ \ \? \ \ \4 t
 t
 t
 t
 t
% t
 t
 t
n E
 E
 E
 E
 E
+ E
 E
 E
P* * * * *") * * *>   N
 N
 N
 N
 N
%9 N
 N
 N
b c
 c
 c
 c
 c
1 c
 c
 c
L ?
 ?
 ?
 ?
 ?
"6 ?
 ?
 ?
D    bi   & J
 J
 J
 J
 J
 4 J
 J
 J
Z4 4 4 4"  rT   