
     `ib                        d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%  e!j&        e'          Z(d*dZ) G d dej*                  Z+ G d dej*                  Z, G d de          Z-e  G d de                      Z.e  G d de.                      Z/ e d           G d  d!e.e                      Z0 e d"           G d# d$e.                      Z1e  G d% d&e.                      Z2e  G d' d(e.                      Z3g d)Z4dS )+zPyTorch MPT model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )CacheDynamicCache)GenerationMixin)!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringlogging)deprecate_kwarg   )	MptConfig   c                 j   t          j        d|z
  dt           j        |                              ddd|          }dt	          j        t	          j        |                     z  }t          j        d|dz   t           j        |                                          }|||z  z  }dt          j	        d|          z  }|                    d|dd          }|| k    rAt          j
        |ddddddf         |ddddddf         gd          ddd| df         }||z  }|                    d          S )	a  
    Link to paper: https://huggingface.co/papers/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    r   )dtypedevice         ?N.dimr   )torcharangeint32viewmathceillog2int64floatpowconcatsqueeze)	num_headssequence_lengthalibi_bias_maxr   alibinum_heads_power_of_2baseslopess           x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mpt/modeling_mpt.pybuild_mpt_alibi_tensorr8   -   sH    L_,au{6RRRWWXY[\^_apqqE	$)I*>*> ? ??</!35;vVVV\\^^D>$889D59Q%%%F[[0!Q77Fy((vaaaAsl3VAAAsssCK5HIqQQQRSRSRSU_V_U_adRdeFNE==    c                        e Zd ZdZddedee         f fdZ eddd	          	 	 	 dd
e	j
        de	j
        dee         dee	j
                 dee	j
                 f
d            Z xZS )MptAttentionzzMulti-head self attention.
    Using torch or triton attention implementation enables user to also use additive bias.
    Nconfig	layer_idxc                 *   t                                                       |j        | _        |j        | _        |j        | _        | j        | j        z  | _        |j        j        | _        | j        )dt          j
        | j        | j        z            z  | _        |j        j        | _        |j        j        | _        t          j        | j        d| j        z  d          | _        t          j        | j        | j        d          | _        || _        d S )Nr   r   Fbias)super__init__hidden_sizen_headsmax_seq_lenmax_seq_lengthhead_dimattn_configsoftmax_scaler(   sqrt
attn_pdropattn_dropout_pclip_qkvr   LinearWqkvout_projr=   )selfr<   r=   	__class__s      r7   rB   zMptAttention.__init__I   s    !-~$0(DL8#/=%!"TYt/?$,/N%O%O!OD$0;*3Id.D4D0D5QQQ		$"2D4D5QQQ"r9   past_key_valuepast_key_values4.58new_nameversionhidden_statesposition_biasattention_maskcache_positionc                    |j         d d         \  }}|                     |          }| j        r"|                    | j         | j                  }|                    dd          \  }	}
}|	                    ||| j        | j                                      dd          }	|
                    ||| j        | j                                      dd          }
|                    ||| j        | j                                      dd          }|$d|i}|	                    |
|| j
        |          \  }
}t          j        |	|
                    dd                    | j        z  }||n||                                z   }|t          |j                   dk    r$t!          d	t          |j                              |
j         d         }t#          d
|                    d          |z
            }t#          d
|                    d          |z
            }|d d |d |d f         }||z   }|2|                    |t          j        |	j                  j                  }t.          j                            |                                d                              |j                  }t.          j                            || j        | j                  }t          j        ||          }|                    d
ddd                                           !                    ||d          }| "                    |          }||fS )Nr    )minmaxr   r"   r   r\   z6Expecting position_bias shape to be 3 dimensions, got r   ptraining)#shaperO   rM   clampchunkreshaperD   rG   	transposeupdater=   r$   matmulrI   get_seq_lengthlen
ValueErrorr_   sizemasked_fillfinfor   r^   r   r
   softmaxr,   todropoutrL   rd   permute
contiguousr'   rP   )rQ   rY   rZ   rT   r[   r\   
batch_size
seq_length	mixed_qkvquery_states
key_statesvalue_statescache_kwargsattention_scoresquery_length
key_lengthposition_bias_query_indexposition_bias_key_indexattn_weightscontext_statesattn_outputs                        r7   forwardzMptAttention.forwardY   s-    "/!4RaR!8
JIIm,,	= 	O!T]NNNI1:1J1J.j,#++J
DLRVR_``jjklnopp''
Jdm\\ffghjkll
#++J
DLRVR_``jjklnopp&,n=L'6'='=j,X\Xfht'u'u$J <j6J6J2r6R6RSSVZVhh%4%<zz*OmOmOoOoBo$=&''1,, !tZ]^k^qZrZr!t!tuuu#)"-J(+A}/A/A!/D/D|/S(T(T%&)!]-?-?-B-BZ-O&P&P#)!!!-F-G-GI`IaIa*abM/-?%/;;NEKXdXjLkLkLopp },,-=-C-C-E-E2,NNQQR^Rdee},,\T=P[_[h,iilLAA'//1a;;FFHHMMjZdfhiimmN33L((r9   N)NNN)__name__
__module____qualname____doc__r   r   intrB   r   r$   Tensorr   r   __classcell__rR   s   @r7   r;   r;   D   s         # #y #Xc] # # # # # #  _%0A6RRR
 ,015151) 1)|1) |1) "%	1)
 !.1) !.1) 1) 1) SR1) 1) 1) 1) 1)r9   r;   c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZS )MptMLPr<   c                 (   t                                                       |j        }t          j        |d|z  d          | _        t          j        d          | _        t          j        d|z  |d          | _        |j	        j
        | _        d S )N   Fr?   none)approximate)rA   rB   rC   r   rN   up_projGELUact	down_projrH   rK   hidden_dropout)rQ   r<   rC   rR   s      r7   rB   zMptMLP.__init__   s    (ya+oEJJJ7v...1{?KeLLL$0;r9   rY   residualreturnc                     |                      |                     |                    }|                     |          }t          j        || j        | j                  }||z   }|S )Nrb   )r   r   r   Frt   r   rd   )rQ   rY   r   intermediate_outputoutputs        r7   r   zMptMLP.forward   s^    m!<!<=="nn];;.$2EPTP]^^^("r9   )	r   r   r   r   rB   r$   r   r   r   r   s   @r7   r   r      ss        <y < < < < < <U\ U\ el        r9   r   c                        e Zd Zddedee         f fdZ	 	 	 	 ddej        dej        dej        d	ee	         d
e
de
deej                 fdZ xZS )MptBlockNr<   r=   c                    t                                                       |j        }t          ||j                  | _        d | j        _        |j        | _        t          ||          | _
        t          ||j                  | _        d | j        _        t          |          | _        |j        j        | _        t#          j        | j                  | _        d S )Neps)rA   rB   rC   r   layer_norm_epsilonnorm_1r@   rD   r0   r;   attnnorm_2r   ffnrH   rK   dropout_rater   Dropoutresid_attn_dropout)rQ   r<   r=   rC   rR   s       r7   rB   zMptBlock.__init__   s    (1JKKK 33	1JKKK&>>".9"$*T->"?"?r9   FrY   rZ   r[   
layer_past	use_cacheoutput_attentionsr\   c                     |                      |          }|}	|                     |||||          \  }
}|                     |
          |	z   }|                     |          }|}	|                     ||	          }||fS )N)rZ   r[   rT   r\   )r   r   r   r   r   )rQ   rY   rZ   r[   r   r   r   r\   layernorm_outputr   attn_outputsr   r   s                r7   r   zMptBlock.forward   s      ;;}55  &*YY')&) &/ &
 &
"l //==H;;}55 ! *H55|##r9   r   )NFFN)r   r   r   r   r   r   rB   r$   r   r   boolr   r   r   s   @r7   r   r      s        @ @y @Xc] @ @ @ @ @ @2 '+"'15"$ "$|"$ |"$ 	"$
 UO"$ "$  "$ !."$ "$ "$ "$ "$ "$ "$ "$r9   r   c            	            e Zd ZU eed<   dZdZdgZdgZ fdZ	de
j        fdZe ed	d
d          d
eeej        ej        f                  deeej        ej        f                  fd                        Z xZS )MptPreTrainedModelr<   transformerTr   z
lm_head.*.c                 :     t                      j        |i | d S r   )rA   rB   )rQ   inputskwargsrR   s      r7   rB   zMptPreTrainedModel.__init__   s%    &+F+++++r9   modulec                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t                    rF|j        |j        j        	                                 |j        j                            d           dS dS )zInitialize the weights.g        )meanstdNr!   )
isinstancer   rN   weightdatanormal_r<   initializer_ranger@   zero_	Embeddingpadding_idxr   fill_)rQ   r   s     r7   _init_weightsz MptPreTrainedModel._init_weights   s/   fbi(( 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .-	** 	*{& &&(((M$$S)))))	* 	*r9   rS   rT   rU   rV   r   c                 |    | d         d         j         \  }}||z  t          fd| D                       S )zw
        Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
        r   c              3      K   | ]>}|d                                         |d                                        fV  ?dS )r   r   N)rh   ).0r   batch_size_times_num_headsrG   rx   s     r7   	<genexpr>z;MptPreTrainedModel._convert_to_mpt_cache.<locals>.<genexpr>  so       
 

  1%%&@(JWW1%%&@*hWW
 
 
 
 
 
r9   )re   tuple)rT   rw   r0   r   rG   rx   s      @@@r7   _convert_to_mpt_cachez(MptPreTrainedModel._convert_to_mpt_cache   st     7Fa6H6K6Q3
Ix%/)%;"  
 
 
 
 
 

 .
 
 
 
 
 	
r9   )r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_keys_to_ignore_on_load_missingrB   r   Moduler   staticmethodr   r   r$   r   r   r   r   s   @r7   r   r      s         %&*##'4o#, , , , ,*BI * * * *" _%0A6RRR
uU\5<%?@A
	uU\5</0	1
 
 
 SR \
 
 
 
 
r9   r   c                   Z    e Zd Zdef fdZd ZddZdej        fdZ	e
	 	 	 	 	 	 	 	 	 dd	eej                 d
ee         deej                 deej                 dee         dee         dee         dee         deej                 deeej        df         ef         fd            Z xZS )MptModelr<   c                    t                                                     j        | _        j        | _        t          j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          | j        j                  | _        d | j        _        d| _        |                                  d S )Nc                 2    g | ]}t          |           S ))r=   )r   )r   ir<   s     r7   
<listcomp>z%MptModel.__init__.<locals>.<listcomp>  s&    $c$c$cqXf%B%B%B$c$c$cr9   r   F)rA   rB   rC   rD   r0   r   r   
vocab_sizewte
ModuleListrangen_layersblocksr   r   norm_fr@   gradient_checkpointing	post_initrQ   r<   rR   s    `r7   rB   zMptModel.__init__  s       !- < 143CDD m$c$c$c$cERXRaLbLb$c$c$cdd   0f6OPPP&+# 	r9   c                     | j         S r   r   )rQ   s    r7   get_input_embeddingszMptModel.get_input_embeddings&  s	    xr9   r   Nc                 &    t          ||||          S r   )r8   )rQ   r0   r1   r2   r   s        r7   r8   zMptModel.build_mpt_alibi_tensor)  s    %i.RXYYYr9   new_embeddingsc                     || _         d S r   r   rQ   r   s     r7   set_input_embeddingszMptModel.set_input_embeddings,  s    !r9   	input_idsrT   r[   inputs_embedsr   r   output_hidden_statesreturn_dictr\   r   .c
           
         ||n| j         j        }||n| j         j        }||n| j         j        }||n| j         j        }||t          d          ||j        \  }}n||j        \  }}}nt          d          | j        r%| j        r|rt          
                    d           d}||                     |          }|r|t          | j                   }|rCt          |t                    r.t          
                    d           t          j        |          }|}|rdnd}|rdnd}||                                nd	}||z   }|t#          j        ||f|j        
          }n|                    |j                  }|                     | j        | j         j        |j        
          }t1          |||f||          }|                                }| j        D ]2}|r||fz   } ||||||||	          }|d	         }|r||d         fz   }3|                     |          }|r||fz   }|st          d ||||fD                       S t9          ||||          S )  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r<   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`. r   r   )r   r[   r   r   rZ   r\   r   c              3      K   | ]}||V  	d S r   r   )r   vs     r7   r   z#MptModel.forward.<locals>.<genexpr>  s1        ghgtgtgtgtgt r9   )last_hidden_staterT   rY   
attentions)r<   r   r   r   use_return_dictrn   re   r   rd   loggerwarning_oncer   r   r   r   from_legacy_cacherl   r$   onesr   rs   r8   r0   rE   r   r   r   r   r   )rQ   r   rT   r[   r   r   r   r   r   r\   r   rw   rx   _rY   all_self_attentionsall_hidden_statespast_key_values_lengthseq_length_with_pastr3   causal_maskblockoutputss                          r7   r   zMptModel.forward/  s]   6 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B] ]%>cddd"%._"J

&(5(;%J
AATUUU& 	"4= 	" "##p   "	  HHY//M 	?0*$+>>>O 	NOU;; 	NU  
 +<_MMO%$5?bb4"6@BBD FUE`!?!?!A!A!Afg),BB!"Z5I(JS`SghhhNN+..}/CDDN++DNDK<S\i\p+qq7Z4mE[
 
 "&&(([ 	J 	JE# I$58H$H!e**#"3#-  G $AJM  J&9WQZM&I# M22 	E 1]4D D 	  )?<MObc      9+++*	
 
 
 	
r9   r   N	NNNNNNNNN)r   r   r   r   rB   r   r8   r$   r   r   r   r   
LongTensorr   r   r   r   r   r   r   r   s   @r7   r   r     s       y      ,  Z Z Z Z"5< " " " "  15+/1548$(,0/3&*15t
 t
E,-t
 "%t
 !.	t

   01t
 D>t
 $D>t
 'tnt
 d^t
 !.t
 
uU\3&')RR	St
 t
 t
 ^t
 t
 t
 t
 t
r9   r   z
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                   j    e Zd ZdgZdef fdZdej        fdZe		 	 	 	 	 	 	 	 	 	 dde
ej                 de
e         d	e
ej                 d
e
ej                 de
ej                 de
e         de
e         de
e         de
e         de
ej                 deeej                 ef         fd            Z xZS )MptForCausalLMzlm_head.weightr<   c                     t                                          |           t          |          | _        t	          j        |j        |j        d          | _        | 	                                 d S NFr?   )
rA   rB   r   r   r   rN   rC   r   lm_headr   r   s     r7   rB   zMptForCausalLM.__init__  sa       #F++y!3V5FUSSS 	r9   r   c                     || _         d S r   )r  r   s     r7   set_output_embeddingsz$MptForCausalLM.set_output_embeddings  s    %r9   Nr   rT   r[   r   labelsr   r   r   r   r\   r   c                    |	|	n| j         j        }	|                     ||||||||	|
	  	        }|d         }|                     |          }d}|5|                    |j                  } | j        ||fd| j         j        i|}|	s|f|dd         z   }||f|z   n|S t          |||j	        |j
        |j                  S )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)rT   r[   r   r   r   r   r   r\   r   r   r   losslogitsrT   rY   r   )r<   r   r   r  rs   r   loss_functionr   r   rT   rY   r   )rQ   r   rT   r[   r   r  r   r   r   r   r\   r   transformer_outputsrY   	lm_logitsr  r   s                    r7   r   zMptForCausalLM.forward  s)   @ &1%<kk$+B]"..+)'/!5#) / 

 

 ,A.LL//	YYy/00F%4%   ;1 	 D  	F\$7$;;F)-)9TGf$$vE0/?-;*5
 
 
 	
r9   )
NNNNNNNNNN)r   r   r   _tied_weights_keysr   rB   r$   r   r  r   r   r
  r   r   r   r   r   r   r   r   s   @r7   r  r    s{        ++y      &EL & & & &  15+/1504)-$(,0/3&*15F
 F
E,-F
 "%F
 !.	F

  -F
 &F
 D>F
 $D>F
 'tnF
 d^F
 !.F
 
uU\"$EE	FF
 F
 F
 ^F
 F
 F
 F
 F
r9   r  a  
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   2    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 ddeej                 dee	         deej
                 deej
                 deej
                 d	ee         d
ee         dee         dee         deeej
                 ef         fd            Z xZS )MptForSequenceClassificationr<   c                     t                                          |           |j        | _        t          |          | _        t          j        |j        |j        d          | _        | 	                                 d S r  )
rA   rB   
num_labelsr   r   r   rN   rC   scorer   r   s     r7   rB   z%MptForSequenceClassification.__init__  sk        +#F++Yv163D5QQQ
 	r9   Nr   rT   r[   r   r  r   r   r   r   r   c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }||j        d         }n|j        d         }| j         j        |dk    rt          d          | j         j        d}n|}|| j         j        k                        |j        t          j
                  }t          j        |j        d         |j        t          j
                  }||z                      d          }n)d}t                              | j        j         d           |t          j        ||j        	          |f         }d}|.| j         j        f| j        dk    rd
| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        d
k    rWt-                      }| j        dk    r1 ||                                |                                          }nb |||          }nU| j         j        dk    rt1                      } |||          }n*| j         j        dk    rt3                      } |||          }|	s|f|
dd         z   }||f|z   n|S t5          |||
j        |
j        |
j                  S )6  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrT   r[   r   r   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r`   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr  )r<   r   r   r   re   pad_token_idrn   rs   r   r$   r&   r%   argmaxr   r   rR   r   problem_typer  r   longr   r	   r/   r   r   r   rT   rY   r   )rQ   r   rT   r[   r   r  r   r   r   r   r  rY   r  rw   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr   s                        r7   r   z$MptForSequenceClassification.forward  s   < &1%<kk$+B]"..+)'/!5# / 	
 	
 ,A.M** "+JJ&,Q/J;#+
a\]]];#+!#"%)AAEEfmUZU`aaL!L)<V]Z_ZefffM"/,">!F!Fr!J!J!#>* Z Z Z  
 u|Jv}MMMOaab{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8M$9$9$;$;V^^=M=MNNDD#8M6::DD)-JJJ+--xv66)-III,..xv66 	F#%(;ABB(??F)-)9TGf$$vE/ /?-;*5
 
 
 	
r9   r	  )r   r   r   r   rB   r   r   r$   r
  r   r   r   r   r   r   r   r   r   s   @r7   r  r    s>       y        15+/1504)-$(,0/3&*d
 d
E,-d
 "%d
 !.	d

  -d
 &d
 D>d
 $D>d
 'tnd
 d^d
 
uU\"$DD	Ed
 d
 d
 ^d
 d
 d
 d
 d
r9   r  c                   2    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 ddeej                 dee	         deej
                 deej
                 deej
                 d	ee         d
ee         dee         dee         deeej
                 ef         fd            Z xZS )MptForTokenClassificationr<   c                    t                                          |           |j        | _        t          |          | _        t          |d          r|j        |j        }n!t          |d          r|j        |j        }nd}t          j	        |          | _
        t          j        |j        |j                  | _        |                                  d S )Nclassifier_dropoutr   g?)rA   rB   r  r   r   hasattrr3  r   r   r   rt   rN   rC   
classifierr   )rQ   r<   r3  rR   s      r7   rB   z"MptForTokenClassification.__init__  s        +#F++6/00 	%V5N5Z!'!:V-.. 	%63H3T!'!6!$z"455)F$68IJJ 	r9   Nr   rT   r[   r   r  r   r   r   r   r   c
           
         |	|	n| j         j        }	|                     ||||||||	          }|d         }|                     |          }|                     |          }d}|p|                    |j                  }|j        \  }}t                      } ||	                    ||z  | j
                  |	                    ||z                      }|	s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )r"  Nr#  r   r    )r  r  rY   r   )r<   r   r   rt   r5  rs   r   re   r   r'   r  r   rY   r   )rQ   r   rT   r[   r   r  r   r   r   r   deprecated_argumentsr  rY   r  r  rw   rx   r/  r   s                      r7   r   z!MptForTokenClassification.forward  sL   > &1%<kk$+B]"..+)'/!5# / 	
 	
 ,A.]33//YYv}--F%+\"J
'))H8J3T_EEv{{S]`jSjGkGk D  	FY!4QRR!88F)-)9TGf$$vE$-;*5	
 
 
 	
r9   r	  )r   r   r   r   rB   r   r   r$   r
  r   r   r   r   r   r   r   r   r   s   @r7   r1  r1    s>       y      "  15+/1504)-$(,0/3&*B
 B
E,-B
 "%B
 !.	B

  -B
 &B
 D>B
 $D>B
 'tnB
 d^B
 
uU\"$99	:B
 B
 B
 ^B
 B
 B
 B
 B
r9   r1  c                       e Zd Z fdZe	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 dee	         d	ee	         d
ee	         de
eef         fd            Z xZS )MptForQuestionAnsweringc                     t                                          |           t          |          | _        t	          j        |j        d          | _        |                                  d S )Nr    )	rA   rB   r   r   r   rN   rC   
qa_outputsr   r   s     r7   rB   z MptForQuestionAnswering.__init__  sY       #F++)F$6:: 	r9   Nr   r[   r   start_positionsend_positionsr   r   r   r   c	                    ||n| j         j        }|                     ||||||          }	|	d         }
|                     |
          }|                    dd          \  }}|                    d                                          }|                    d                                          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|	dd         z   }||f|z   n|S t          ||||	j        |	j        	          S )
r   N)r[   r   r   r   r   r   r   r`   r"   )ignore_indexr    )r  start_logits
end_logitsrY   r   )r<   r   r   r;  splitr/   rv   rm   ro   rf   r   r   rY   r   )rQ   r   r[   r   r<  r=  r   r   r   r  sequence_outputr  r@  rA  
total_lossignored_indexr/  
start_lossend_lossr   s                       r7   r   zMptForQuestionAnswering.forward  s   2 &1%<kk$+B]"")'/!5# # 
 
 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r9   )NNNNNNNN)r   r   r   rB   r   r   r$   r
  FloatTensorr   r   r   r   r   r   r   s   @r7   r9  r9    s              156:596:48,0/3&*E
 E
E,-E
 !!23E
   12	E

 "%"23E
   01E
 $D>E
 'tnE
 d^E
 
u22	3E
 E
 E
 ^E
 E
 E
 E
 E
r9   r9  )r  r   r   r  r1  r9  r  )5r   r(   typingr   r   r$   r   torch.nnr   r   r   r	   r
   r   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   utils.deprecationr   configuration_mptr   
get_loggerr   r   r8   r   r;   r   r   r   r   r  r  r1  r9  __all__r   r9   r7   <module>rV     s      " " " " " " " "        L L L L L L L L L L L L $ $ $ $ $ $ . . . . . . . . ) ) ) ) ) ) I I I I I I 9 9 9 9 9 9              . - - - - - , , , , , , , , 0 0 0 0 0 0 ( ( ( ( ( ( 
	H	%	%   .G) G) G) G) G)29 G) G) G)T    RY   *7$ 7$ 7$ 7$ 7$) 7$ 7$ 7$t -
 -
 -
 -
 -
 -
 -
 -
` U
 U
 U
 U
 U
! U
 U
 U
p   U
 U
 U
 U
 U
' U
 U
 U
p   o
 o
 o
 o
 o
#5 o
 o
 o
d U
 U
 U
 U
 U
 2 U
 U
 U
p O
 O
 O
 O
 O
0 O
 O
 O
d  r9   