
     `iA                    x   d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(  e$j)        e*          Z+dej,        de-de-fdZ. G d dej/                  Z0 G d dej1                  Z2 G d de          Z3 G d de          Z4 G d dej1                  Z5 G d  d!ej1                  Z6e# G d" d#e!                      Z7 G d$ d%e7          Z8 G d& d'e7          Z9e# G d( d)e7                      Z: e#d*+           G d, d-e7e                      Z; e#d.+           G d/ d0e7                      Z<e# G d1 d2e7                      Z= G d3 d4e7          Z> G d5 d6e7e          Z?g d7Z@dS )8zPyTorch MVP model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging)deprecate_kwarg   )	MvpConfig	input_idspad_token_iddecoder_start_token_idc                     |                      | j                  }| ddddf                                         |ddddf<   ||dddf<   |t          d          |                    |dk    |           |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r    r!   shifted_input_idss       x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mvp/modeling_mvp.pyshift_tokens_rightr+   3   s     "++IO<<(CRC06688aaae4aaadLMMM""#4#<lKKK    c                   h     e Zd ZdZdedef fdZ	 ddej        ded	eej                 f fd
Z	 xZ
S )MvpLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 j    d| _         t                                          || j         z   |           d S N   )offsetsuper__init__)selfr/   r0   	__class__s      r*   r6   z&MvpLearnedPositionalEmbedding.__init__I   s3     $+5}EEEEEr,   r   Nr   past_key_values_lengthposition_idsc                 0   |V|j         dd         \  }}t          j        |||z   t          j        | j        j                                      |d          }n|                    d          }t                      	                    || j
        z             S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr3   )dtypedevicer#   r   )r%   torcharangelongweightr=   expand	unsqueezer5   forwardr4   )r7   r   r9   r:   bszseq_lenr8   s         r*   rD   z%MvpLearnedPositionalEmbedding.forwardO   s    
 $?2A2.LC <&(>(HPUPZcgcncu  fS"oo L (11!44Lww|dk9:::r,   )r   N)__name__
__module____qualname____doc__intr6   r>   Tensorr   rD   __classcell__r8   s   @r*   r.   r.   D   s         Fs F3 F F F F F F pt; ;;?B;V^_d_kVl; ; ; ; ; ; ; ; ; ;r,   r.   c                       e Zd ZdZ	 	 	 	 ddededee         d	ee         d
ee         dee         f fdZ e	ddd          	 	 	 	 	 	 	 dde
j        dee
j                 dee         dee
j                 dee
j                 dee
j                 dedee
j                 dee
j        ee
j                 eee
j                          f         fd            Z xZS )MvpAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FTN	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                    t                                                       || _        || _        || _        ||z  | _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _        || _	        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rV   )r5   r6   rR   rS   rT   head_dimr'   scalingrU   rW   r   Lineark_projv_projq_projout_proj)r7   rR   rS   rT   rU   rV   rW   r8   s          r*   r6   zMvpAttention.__init__b   s	    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBBr,   past_key_valuepast_key_values4.58new_nameversionhidden_stateskey_value_statesattention_masklayer_head_maskattn_promptoutput_attentionscache_positionreturnc	                    |du}	|                                 \  }
}}|                     |          | j        z  }d}|Ht          |t                    r1|j                            | j                  }|	r|j        }n
|j	        }n|}|	r|n|}|	r3|1|r/|j
        | j                 j        }|j
        | j                 j        }n|                     |          }|                     |          }|                    |
d| j        | j                                      dd          }|                    |
d| j        | j                                      dd          }|N|	s|nd}|                    ||| j        d|i          \  }}|	r$t          |t                    rd|j        | j        <   |t)          j        |d                             |
ddd          |gd	          }t)          j        |d                             |
ddd          |gd	          }|`t)          j        |
d||d                              d                                        |j                  }t)          j        ||gd	          }|
| j        z  d| j        f}|                    |
|| j        | j                                      dd          } |j        | } |j        | } |j        | }|                     d          }t)          j        ||                    dd                    }|                                 |
| j        z  ||fk    r2t9          d
|
| j        z  ||f d|                                            ||                                 |
d||fk    r+t9          d|
d||f d|                                            |                    |
| j        ||          |z   }|                    |
| j        z  ||          }t:          j                            |d	          }||                                 | j        fk    r-t9          d| j        f d|                                            |                    dddd          |                    |
| j        ||          z  }|                    |
| j        z  ||          }|r=|                    |
| j        ||          }|                    |
| j        z  ||          }nd}t:          j                             || j         | j!                  }t)          j        ||          }|                                 |
| j        z  || j        fk    r5t9          d|
| j        || j        f d|                                            |                    |
| j        || j                  }|                    dd          }|                    |
|| j"                  }| #                    |          }||fS )z#Input shape: Batch x Time x ChannelNFr#   r   r3   rm   Tr   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )$sizer_   r[   
isinstancer   
is_updatedgetrW   cross_attention_cacheself_attention_cachelayerskeysvaluesr]   r^   viewrS   rZ   	transposeupdater>   catrB   zerostor=   reshapebmmr'   r   
functionalsoftmaxrT   rt   rR   r`   )r7   rg   rh   rb   ri   rj   rk   rl   rm   is_cross_attentionrE   tgt_len_query_statesrw   curr_past_key_valuecurrent_states
key_statesvalue_statesprompt_mask
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                             r*   rD   zMvpAttention.forward   s     .T9',,..Wa {{=11DL@
&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#-?R))] 	F/"=*"=,3DNCHJ.5dnELLL^44J;;~66L#b$.$-PPZZ[\^_``J',,S"dndmTT^^_`bcddL*7I!St+>+E+Edn?OQ_>`, ,(
L & F*_FY*Z*Z FAEO.t~>"KN$9$9#r2r$J$JJ#W]^___J 9k!n&;&;CR&L&Ll%[abcccL)#k#q';q>;N;Nq;Q;QRRUUVdVkll!&K+Hr!S!S!SDN*B>
#((gt~t}UU__`acdee+|+Z8'Z'4
+|+Z8//!$$yz/C/CAq/I/IJJ3#7'"JJJ*dn8LgW^7_ * * %%''* *  
 %""$$a'(BBB ta'8Rtt]k]p]p]r]rtt   (,,S$.'7SSVddL',,S4>-A7GTTL},,\r,BB&##%%$.)::: 1t~FW 1 1',,..1 1   +//2q!<<|?P?PQTVZVdfmov?w?wwL',,S4>-A7GTTL 	)
 %1$5$5c4>7T[$\$\!055cDN6JGU\]]LL$(!]**<4<RVR_*``
i
L99#"6!OOO)CRVR_3` ) )$$&&) )  
 "&&sDNGT]SS!++Aq11 "))#wGGmmK00111r,   )rQ   FTN)NNNNNFN)rG   rH   rI   rJ   rK   r   floatboolr6   r   r>   rL   r   tuplerD   rM   rN   s   @r*   rP   rP   _   s       GG $'%*#$(C CC C %	C
 TNC tnC D>C C C C C C: _%0A6RRR 48+/1526.2"'15}2 }2|}2 #5<0}2 "%	}2
 !.}2 "%,/}2 el+}2  }2 !.}2 
u|Xel3XeEL>Q5RR	S}2 }2 }2 SR}2 }2 }2 }2 }2r,   rP   c                        e Zd Zdef fdZ	 ddej        dej        dej        dej        dee         d	e	ej        eej                 f         fd
Z
 xZS )MvpEncoderLayerconfigc                    t                                                       |j        | _        t	          | j        |j        |j                  | _        t          j	        | j                  | _
        |j        | _        t          |j                 | _        |j        | _        t          j        | j        |j                  | _        t          j        |j        | j                  | _        t          j	        | j                  | _        d S )N)rR   rS   rT   )r5   r6   d_modelrR   rP   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrT   r
   activation_functionactivation_fnactivation_dropoutr\   encoder_ffn_dimfc1fc2final_layer_normr7   r   r8   s     r*   r6   zMvpEncoderLayer.__init__  s    %n4,
 
 

 %'L$@$@!~#F$>?"(";9T^V-CDD9V3T^DD "T^ < <r,   Frg   ri   rj   self_attn_promptrl   rn   c                 j   |}|                      |||||          \  }}t          j                            || j        | j                  }||z   }|                     |          }|}|                     |                     |                    }t          j                            || j        | j                  }| 	                    |          }t          j                            || j        | j                  }||z   }| 
                    |          }|j        t          j        k    rt          j        |                                          s&t          j        |                                          r9t          j        |j                  j        dz
  }t          j        || |          }||fS )a@  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rg   ri   rj   rk   rl   rr   i  )minmax)r   r   r   rT   rt   r   r   r   r   r   r   r<   r>   float16isinfanyisnanfinfor   clamp)	r7   rg   ri   rj   r   rl   residualr   clamp_values	            r*   rD   zMvpEncoderLayer.forward  s   * !&*nn')+(/ '5 '
 '
#| --mt|VZVc-dd =011-@@ **488M+B+BCC--mt?Vaean-oo//--mt|VZVc-dd =0--m<<%-//K&&**,, 005M0J0J0N0N0P0P 0  +m&9::>EK!KK<[YYYMl**r,   )F)rG   rH   rI   r   r6   r>   FloatTensorr   r   r   rD   rM   rN   s   @r*   r   r      s        =y = = = = = =, -2/+ /+(/+ )/+ *	/+
  +/+ $D>/+ 
u (5+<"==	>/+ /+ /+ /+ /+ /+ /+ /+r,   r   c            !           e Zd Zddef fdZ eddd          	 	 	 	 	 	 	 	 	 	 	 dd
ej        deej                 deej                 deej                 deej                 deej                 deej                 deej                 dee	         dee
         dee
         deej                 deej        eeej        ej        f                  f         fd            Z xZS )MvpDecoderLayerNr   c                    t                                                       |j        | _        t	          | j        |j        |j        d|          | _        |j        | _        t          |j
                 | _        |j        | _        t          j        | j                  | _        t	          | j        |j        |j        d|          | _        t          j        | j                  | _        t          j        | j        |j                  | _        t          j        |j        | j                  | _        t          j        | j                  | _        d S )NT)rR   rS   rT   rU   rW   )rT   rU   rW   )r5   r6   r   rR   rP   decoder_attention_headsr   r   rT   r
   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr\   decoder_ffn_dimr   r   r   )r7   r   rW   r8   s      r*   r6   zMvpDecoderLayer.__init__D  s   %n4,
 
 
 ~#F$>?"(";$&L$@$@!(N*,
 
 
 (*|DN'C'C$9T^V-CDD9V3T^DD "T^ < <r,   ra   rb   rc   rd   FTrg   ri   encoder_hidden_statesencoder_attention_maskrj   cross_attn_layer_head_maskr   cross_attn_promptrl   	use_cacherm   rn   c           	      $   |}|                      ||	||||
|          \  }}t          j                            || j        | j                  }||z   }|                     |          }d}|g|}|                     ||||||	|
          \  }}t          j                            || j        | j                  }||z   }|                     |          }|}|                     | 	                    |                    }t          j                            || j
        | j                  }|                     |          }t          j                            || j        | j                  }||z   }|                     |          }|f}|
r|||fz  }|S )a1  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rg   rb   ri   rj   rk   rl   rm   rr   N)rg   rh   ri   rj   rk   rb   rl   )r   r   r   rT   rt   r   r   r   r   r   r   r   r   )r7   rg   ri   r   r   rj   r   r   r   rb   rl   r   rm   r   self_attn_weightscross_attn_weightsoutputss                    r*   rD   zMvpDecoderLayer.forward`  s   L ! ,0>>'+)+(/) ,: ,
 ,
(( --mt|VZVc-dd =011-@@ " ,$H040A0A+!65 :- /"3 1B 1 1-M- M11-4<Z^Zg1hhM$}4M 88GGM !**488M+B+BCC--mt?Vaean-oo//--mt|VZVc-dd =0--m<< " 	?)+=>>Gr,   N)NNNNNNNNFTN)rG   rH   rI   r   r6   r   r>   rL   r   r   r   r   r   rD   rM   rN   s   @r*   r   r   C  s       = =y = = = = = =8 _%0A6RRR 268<9=26=A3748+/,1$(15U U|U !.U  (5	U
 !) 6U "%,/U %-U\$:U #5<0U $EL1U "%U $D>U D>U !.U 
u (51BEDU1U+V"WW	XU U U SRU U U U Ur,   r   c                   X     e Zd ZdZdedededef fdZdej        dej        fd	Z	 xZ
S )
MvpClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                     t                                                       t          j        ||          | _        t          j        |          | _        t          j        ||          | _        d S )Nrs   )r5   r6   r   r\   denseDropoutrT   r`   )r7   r   r   r   r   r8   s        r*   r6   zMvpClassificationHead.__init__  sY     	Yy)44
zN333	)[99r,   rg   rn   c                     |                      |          }|                     |          }t          j        |          }|                      |          }|                     |          }|S r   )rT   r   r>   tanhr`   )r7   rg   s     r*   rD   zMvpClassificationHead.forward  s[    ]33

=11
=11]33m44r,   )rG   rH   rI   rJ   rK   r   r6   r>   rL   rD   rM   rN   s   @r*   r   r     s        77
:
: 
: 	
:
 
: 
: 
: 
: 
: 
:U\ el        r,   r   c                   R     e Zd ZdZ fdZdej        deej                 fdZ xZ	S )	MvpPromptz)Layer-wise prompt for encoder or decoder.c           	         t                                                       |j        | _        || _        || _        |j        |z  | _        t          j        |j	                  | _	        t          j
        |j        |j                  | _        t          j        t          j        |j        |j                  t          j                    t          j        |j        |dz  |j        z                      | _        d S )Nr   r3   )r5   r6   prompt_length
num_layersrS   r   rZ   r   r   rT   	Embeddingprompt_embedding
Sequentialr\   prompt_mid_dimGELUprompt_trans)r7   r   r   rS   r8   s       r*   r6   zMvpPrompt.__init__  s    #1$")3zFN333 "V-A6> R RMIfnf&;<<GIIIf+Z!^fn-LMM
 
r,   
prompt_idsrn   c                 2   |                      |                     |                    }|                    | j        | j        dz  | j        | j                  }|                     |          }|                    g d          	                    d          }|S )Nr3   )r   r3   r   r	   )
r   r   r~   r   r   rS   rZ   rT   permutesplit)r7   r   prompts      r*   rD   zMvpPrompt.forward  s    ""4#8#8#D#DEET/11DdnVZVcddf%%--33A66r,   )
rG   rH   rI   rJ   r6   r>   rL   r   rD   rM   rN   s   @r*   r   r     si        33
 
 
 
 
%, 53F        r,   r   c                   >    e Zd ZU eed<   dZdZd Zed             Z	dS )MvpPreTrainedModelr   modelTc                    | j         j        }t          |t          j                  rJ|j        j                            d|           |j         |j        j        	                                 d S d S t          |t          j
                  rS|j        j                            d|           |j        -|j        j        |j                 	                                 d S d S d S )NrQ   )meanstd)r   init_stdrv   r   r\   rA   datanormal_rV   zero_r   padding_idx)r7   moduler   s      r*   _init_weightsz MvpPreTrainedModel._init_weights  s    k"fbi(( 	?M&&CS&999{& &&((((( '&-- 	?M&&CS&999!-"6#56<<>>>>>	? 	?--r,   c                     | j         j        }t          j        g ddddd|gg| j                  }|                    |          |d}|S )N)r      
      r3   r         r3   r=   )ri   r   )r   r    r>   tensorr=   ne)r7   	pad_tokenr   dummy_inputss       r*   r   zMvpPreTrainedModel.dummy_inputs  sa    K,	L"2"2"2Q2q)4L!MVZVabbb	'll955"
 
 r,   N)
rG   rH   rI   r   __annotations__base_model_prefixsupports_gradient_checkpointingr   propertyr    r,   r*   r   r     sX         &*#	? 	? 	?   X  r,   r   c                       e Zd ZdZ	 ddedeej                 dee         f fdZ		 	 	 	 	 	 	 ddee
j                 d	ee
j                 d
ee
j                 dee
j                 dee         dee         dee         deeef         fdZ xZS )
MvpEncodera  
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MvpEncoderLayer`].

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    NFr   embed_tokens
use_promptc                    t                                                     j        | _        j        | _        j        }j        | _        j        | _	        j
        rt          j        |          nd| _        ||| _        n%t          j        j        || j                  | _        t%          j        |          | _        t          j        fdt+          j                  D                       | _        t          j        |          | _        || _        |r,j        | _        t9          j        j                  | _        d| _        |                                   d S )N      ?c                 .    g | ]}t                    S r  )r   ).0r   r   s     r*   
<listcomp>z'MvpEncoder.__init__.<locals>.<listcomp>(  s!    $c$c$c_V%<%<$c$c$cr,   F)!r5   r6   rT   encoder_layerdrop	layerdropr   r    r   max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler  r   r   
vocab_sizer.   embed_positions
ModuleListrangeencoder_layersr{   r   layernorm_embeddingr  r   r   r   r   gradient_checkpointing	post_init)r7   r   r  r  rR   r8   s    `   r*   r6   zMvpEncoder.__init__  sU    	   ~1N	!.$*$B!393IR49Y///s# ,D "V->	4K[ \ \D<* 
  
 m$c$c$c$ceFLaFbFb$c$c$cdd#%<	#:#: $ 	!'!5D$-%.% %D! ',#r,   r   ri   	head_maskinputs_embedsrl   output_hidden_statesreturn_dictrn   c                    ||n| j         j        }||n| j         j        }||n| j         j        }||t	          d          |&|}|j        }	|                    d|	d                   }n=|,|                                dd         }	|dddddf         }nt	          d          ||                     |          | j	        z  }| 
                    |          }
||
z   }|                     |          }t          j                            || j        | j                  }| j        rFt#          j        | j                                      | j                  }|                     |          }|t/          ||j                  }|rdnd}|rdnd}|p|                                d         t3          | j                  k    r@t	          dt3          | j                   d	|                                d          d
          t7          | j                  D ]{\  }}|r||fz   }d}| j        r!t#          j        g           }|| j        k     rd}|rd}n1 ||||||         nd| j        r||         nd|          }|d         }|r||d         fz   }||r||fz   }|st=          d |||fD                       S t?          |||          S )a~  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer#   z5You have to specify either input_ids or inputs_embedsrr   r  r   z&The head_mask should be specified for  layers, but it is for .FT)NN)rj   r   rl   r   c              3      K   | ]}||V  	d S r   r  r
  vs     r*   	<genexpr>z%MvpEncoder.forward.<locals>.<genexpr>  s(      eeqWXWdWdWdWdWdeer,   last_hidden_staterg   
attentions) r   rl   r  use_return_dictr'   r%   r~   ru   r  r  r  r  r   r   rT   rt   r  r>   r?   r   r   r=   r   r   r<   lenr{   	enumeraterandr  r   r   )r7   r   ri   r  r  rl   r  r  inputinput_shape	embed_posrg   r   r   encoder_statesall_attentionsidxencoder_layerto_dropdropout_probabilitylayer_outputss                        r*   rD   zMvpEncoder.forward8  s   \ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]  ]%>cddd"E+K!r;r?;;II&',,..ss3K!!!!QQQ(+EETUUU  --i884;KKM((//	%	100??--mt|VZVc-dd ? 	Ad&899<<T[IIJ#44Z@@ %7H[\\N3=0:d  ~~"s4;'7'788 /S=M=M / /!((+/ / /  
 #,DK"8"8 	F 	FC# C!/=2B!BG} #&+jnn#&77"G 1 , -!"7@7LYs^^RV?C&X&6s&;&;TX&7! ! ! !.a 0  F!/=3C2E!E 	?+}.>>N 	fee]NN$Seeeeee+>Vd
 
 
 	
r,   NF)NNNNNNN)rG   rH   rI   rJ   r   r   r   r   r   r6   r>   
LongTensorrL   r   r   r   r   rD   rM   rN   s   @r*   r  r    s<         lq$ $$/7/E$ZbcgZh$ $ $ $ $ $P 1515,059,0/3&*@
 @
E,-@
 !.@
 EL)	@

   12@
 $D>@
 'tn@
 d^@
 
uo%	&@
 @
 @
 @
 @
 @
 @
 @
r,   r  c                       e Zd ZdZ	 ddedeej                 dee         f fdZ		 	 	 	 	 	 	 	 	 	 	 	 	 ddee
j                 d	ee
j                 d
ee
j                 dee
j                 dee
j                 dee
j                 dee         dee
j                 dee         dee         dee         dee         dee
j                 deeef         fdZ xZS )
MvpDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    NFr   r  r  c                 *   t                                                     j        | _        j        | _        j        | _        j        | _        j	        rt          j        j                  nd| _        ||| _        n*t          j        j        j        | j                  | _        t%          j        j                  | _        t          j        fdt+          j                  D                       | _        t          j        j                  | _        || _        |rLj        | _        t9          j        j                  | _        t9          j        j                  | _        d| _         | !                                 d S )Nr  c                 2    g | ]}t          |           S ))rW   )r   )r
  ir   s     r*   r  z'MvpDecoder.__init__.<locals>.<listcomp>  s&    $p$p$pa_Vq%I%I%I$p$p$pr,   F)"r5   r6   rT   decoder_layerdropr  r    r   r  max_target_positionsr  r  r  r   r  r  r   r   r  r.   r  r  r  decoder_layersr{   r   r  r  r   r   r   r   r   r  r  )r7   r   r  r  r8   s    `  r*   r6   zMvpDecoder.__init__  s}    	   ~1!.$*$B!8>8NW49V^444TW# ,D "V->PTP` a aD<*N 
  
 m$p$p$p$pSXY_YnSoSo$p$p$pqq#%<#?#? $ 	!'!5D$-%.% %D!
 &/%.& &D" ',#r,   r   ri   r   r   r  cross_attn_head_maskrb   r  r   rl   r  r  rm   rn   c                 p   |
|
n| j         j        }
||n| j         j        }|	|	n| j         j        }	||n| j         j        }||t          d          |&|}|j        }|                    d|d                   }n=|,|                                dd         }|dddddf         }nt          d          || 	                    |          | j
        z  }| j        r%| j        r|	rt                              d           d}	|	rO|M|6t          t!          | j                   t!          | j                             nt!          | j                   }|	rCt#          |t$                    r.t                              d           t          j        |          }||                                nd	}t+          ||||          }||t-          ||j        |d         
          }|                     ||          }||z   }|                     |          }t4          j                            || j        | j                  }| j        r[t=          j        | j                   !                    | j"                  }| #                    |          }| $                    |          }|rdnd}|
rdnd}|
r|dnd}tK          ||gddg          D ]z\  }}|s|                                d	         tM          | j'                  k    rCt          d| dtM          | j'                   d|                                d	          d          {tQ          | j'                  D ]\  }}|r||fz  }| j        r t=          j)        g           }|| j*        k     r4 ||||||||         nd|||         nd| j        r||         nd| j        r||         nd||
|	|          }|d	         }|
r||d         fz  }|||d         fz  }|r||fz  }|st%          d |||||fD                       S tW          |||||          S )aE  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer#   zEYou have to specify either decoder_input_ids or decoder_inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r   rr   r  r  rB  zThe `z` should be specified for r!  r"  )	r   rj   r   r   r   rb   rl   r   rm   r   r3   c              3      K   | ]}||V  	d S r   r  r$  s     r*   r&  z%MvpDecoder.forward.<locals>.<genexpr>  s0        =  === r,   )r(  rb   rg   r)  cross_attentions),r   rl   r  r   r*  r'   r%   r~   ru   r  r  r  rt   loggerwarning_oncer   r   rv   r   from_legacy_cacheget_seq_lengthr   r   r<   r  r  r   r   rT   r  r>   r?   r   r   r=   r   r   zipr+  r{   r,  r-  r  r   )r7   r   ri   r   r   r  rB  rb   r  r   rl   r  r  rm   r.  r/  r9   	positionsrg   r   r   r   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_namer3  decoder_layerr6  r7  s                                  r*   rD   zMvpDecoder.forward  s   ^ 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]  ]%>sttt"E#/K!r;r?;;II&',,..ss3K!!!!QQQ(+EEdeee  --i884;KKM& 	"4= 	" "##p   "	 	0 )4 $L$D$D$DlZ^ZeFfFfFfggg!555 
  	UOU;; 	U\  
 2COTTOETE`!?!?!A!A!Afg:K8N
 

 !,1G1S%?&(;[QS_& & &"
 ((0FGG	%	100??--mt|VZVc-dd ? 	Cd&899<<T[IIJ#44Z@@ $ 6 6z B B #7@BBD0:d&7h<Q<]rrdh %(4H(IKYoKp$q$q 	 	 Iy$>>##A&3t{+;+;<<$3	 3 3SEUEU 3 3%NN,,Q/3 3 3  
 #,DK"8"8 	@ 	@C# 6!m%55!} &+jnn#&77)M%'=3<3H3dI]Ii,@,E,Eos;??"T"23"7"7PT=A_#V#4S#9#9RV /"3#-  M *!,M  @=#3"55(4(]1-=,??(   	2-!11 	  ':K^]qr     
 9+++%1
 
 
 	
r,   r8  )NNNNNNNNNNNNN)rG   rH   rI   rJ   r   r   r   r   r   r6   r>   r9  rL   r   r   r   r   r   rD   rM   rN   s   @r*   r;  r;    s         lq& &&/7/E&ZbcgZh& & & & & &T 1515=A=A,07;+/59$(,0/3&*15Q
 Q
E,-Q
 !.Q
  ((9:	Q

 !))9 :Q
 EL)Q
 'u|4Q
 "%Q
   12Q
 D>Q
 $D>Q
 'tnQ
 d^Q
 !.Q
 
u??	@Q
 Q
 Q
 Q
 Q
 Q
 Q
 Q
r,   r;  c            &           e Zd ZdgZddgZdef fdZd Zd Zd Z	d	 Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deej                 deeej                          dee         deej                 deej                 dee         dee         dee         dee         deej                 deeef         f"d            Z xZS )MvpModelfinal_logits_biasencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                 f   t                                          |           |j        |j        }}|j        | _        t          j        ||j        |          | _        t          || j        |j                  | _
        t          || j        |j                  | _        |                                  d S r   )r5   r6   r    r  r  r   r   r   sharedr  encoderr;  decoderr  )r7   r   r   r  r8   s       r*   r6   zMvpModel.__init__  s       "("5v7HZ +l:v~{KK!&$+v7HII!&$+v7HII 	r,   c                     | j         S r   )rX  r7   s    r*   get_input_embeddingszMvpModel.get_input_embeddings  s
    {r,   c                 X    || _         | j         | j        _        | j         | j        _        d S r   )rX  rY  r  rZ  r7   values     r*   set_input_embeddingszMvpModel.set_input_embeddings  s'    $(K!$(K!!!r,   c                     | j         S r   )rY  r\  s    r*   get_encoderzMvpModel.get_encoder  s
    |r,   c                    | j         s
J d            |                     d           | j        j                            d           | j        j                            d           | j        j                            d           d S )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)r  requires_grad_rY  r   rZ  r   r\  s    r*   set_lightweight_tuningzMvpModel.set_lightweight_tuning  s}    jj jjjE"""%44T:::%44T:::&55d;;;;;r,   Nr   ri   decoder_input_idsdecoder_attention_maskr  decoder_head_maskrB  encoder_outputsrb   r  decoder_inputs_embedsr   rl   r  r  rm   rn   c                    |8|6|t          d          t          || j        j        | j        j                  }||n| j        j        }||n| j        j        }||n| j        j        }||n| j        j        }|| 	                    ||||
|||          }ne|rct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|                     |||d         ||||	||||||          }|s||z   S t          |j        |j        |j        |j        |j        |j        |j        |j        	          S )
a*  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   ri   r  r  rl   r  r  r   r   r3   r'  )r   ri   r   r   r  rB  rb   r  r   rl   r  r  rm   )r(  rb   decoder_hidden_statesdecoder_attentionsrE  encoder_last_hidden_stater   encoder_attentions)r'   r+   r   r    r!   rl   r  r   r*  rY  rv   r   r+  rZ  r   r(  rb   rg   r)  rE  )r7   r   ri   rg  rh  r  ri  rB  rj  rb   r  rk  r   rl   r  r  rm   decoder_outputss                     r*   rD   zMvpModel.forward  s   f $)>)F  U   !34;3T[5W! ! 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]""ll#-#+"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O ,,'1"1!"4#1'!5+//!5#) ' 
 
   	5"_44!-?+;"1"?.9,=&5&G"1"?.9	
 	
 	
 		
r,   NNNNNNNNNNNNNNNN)rG   rH   rI   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r6   r]  ra  rc  rf  r   r   r>   r9  rL   listr   r   r   r   r   r   rD   rM   rN   s   @r*   rS  rS    s%       *=)>&79VWy        0 0 0
  < < <  15158<=A,0487;=A+/59=A$(,0/3&*15#t
 t
E,-t
 !.t
 $E$45	t

 !))9 :t
 EL)t
 $EL1t
 'u|4t
 "$u'8"9:t
 "%t
   12t
  ((9:t
 D>t
 $D>t
 'tnt
  d^!t
" !.#t
$ 
u((	)%t
 t
 t
 ^t
 t
 t
 t
 t
r,   rS  ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )custom_introc            (       ~    e Zd Zg dZdef fdZd Zd Z	 d"ded	e	e         d
e
dej        f fdZdeddfdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#de	ej                 de	ej                 de	ej                 de	ej                 de	ej                 de	ej                 de	ej                 de	eej                          de	e         de	ej                 de	ej                 de	ej                 de	e
         de	e
         de	e
         de	e
         de	ej                 deeef         f$d             Zdej        fd!Z xZS )$MvpForConditionalGeneration)rU  rV  lm_head.weightr   c                 l   t                                          |           t          |          | _        |                     dt          j        d| j        j        j        f                     t          j
        |j        | j        j        j        d          | _        |                                  d S )NrT  r   FrY   )r5   r6   rS  r   register_bufferr>   r   rX  r/   r   r\   r   lm_headr  r   s     r*   r6   z$MvpForConditionalGeneration.__init__f  s       f%%
0%+q$*BSBb>c2d2deeey1B1QX]^^^ 	r,   c                 4    | j                                         S r   )r   rc  r\  s    r*   rc  z'MvpForConditionalGeneration.get_encodero      z%%'''r,   c                 4    | j                                         S r   )r   get_decoderr\  s    r*   r  z'MvpForConditionalGeneration.get_decoderr  r~  r,   NTnew_num_tokenspad_to_multiple_ofmean_resizingrn   c                 x    t                                          |||          }|                     |           |S r   )r5   resize_token_embeddings_resize_final_logits_bias)r7   r  r  r  new_embeddingsr8   s        r*   r  z3MvpForConditionalGeneration.resize_token_embeddingsu  s<     88I[]jkk&&~666r,   c                    | j         j        d         }||k    r| j         d d d |f         }nBt          j        d||z
  f| j         j                  }t          j        | j         |gd          }|                     d|           d S )Nr#   r   r   rp   rT  )rT  r%   r>   r   r=   r   r{  )r7   r  old_num_tokensnew_bias
extra_biass        r*   r  z5MvpForConditionalGeneration._resize_final_logits_bias|  s    /5b9^++-aaa..@AHHa.)H%IRVRhRopppJy$"8*!E1MMMH0(;;;;;r,   c                 l    | j                                          | j                            d           d S r8  r   rf  r|  re  r\  s    r*   rf  z2MvpForConditionalGeneration.set_lightweight_tuning  2    
))+++##E*****r,   r   ri   rg  rh  r  ri  rB  rj  rb   r  rk  labelsr   rl   r  r  rm   c                    ||n| j         j        }|G|rt                              d           d}|'|%t	          || j         j        | j         j                  }|                     |||||||||	|
||||||          }|                     |d                   | j	        z   }d}|Kt                      } ||                    d| j         j                  |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j        |j        |j        |j        |j        	  	        S )	a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example of summarization:

        Fine-tuning a model
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
        ...     return_tensors="pt",
        ... )
        >>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     generated_ids = model.generate(**inputs)

        >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)ri   rg  rj  rh  r  ri  rB  rb   r  rk  r   rl   r  r  rm   r   r#   r   	losslogitsrb   rm  rn  rE  ro  r   rp  )r   r*  rF  warningr+   r    r!   r   r|  rT  r   r~   r  r   rb   rm  rn  rE  ro  r   rp  )r7   r   ri   rg  rh  r  ri  rB  rj  rb   r  rk  r  r   rl   r  r  rm   r   	lm_logitsmasked_lm_lossloss_fctoutputs                          r*   rD   z#MvpForConditionalGeneration.forward  s   d &1%<kk$+B] mklllI (-B-J$6DK4dk6X% %! **)/+#9/!5+'"7/!5#)!  
 
$ LL,,t/EE	'))H%XinnR9O&P&PRXR]R]^`RaRabbN 	Z\GABBK/F3A3M^%..SYY#3")"?&9$5&-&G")"?&9

 

 

 
	
r,   c                 L    t          || j        j        | j        j                  S r   )r+   r   r    r!   )r7   r  s     r*   %prepare_decoder_input_ids_from_labelszAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labels  s    !&$+*BDKDfgggr,   )NT)NNNNNNNNNNNNNNNNN)rG   rH   rI   rt  r   r6   rc  r  rK   r   r   r   r   r  r  rf  r   r>   r9  rL   ru  r   r   r   r   r   rD   r  rM   rN   s   @r*   rx  rx  ^  s        jiiy      ( ( (( ( ( dh !7?}\`	     < < < < < <+ + +  15158<=A,0487;=A+/59=A-1$(,0/3&*15%C
 C
E,-C
 !.C
 $E$45	C

 !))9 :C
 EL)C
 $EL1C
 'u|4C
 "$u'8"9:C
 "%C
   12C
  ((9:C
 )*C
 D>C
 $D>C
  'tn!C
" d^#C
$ !.%C
& 
uo%	&'C
 C
 C
 ^C
JhEL h h h h h h h hr,   rx  z
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c            $           e Zd ZddgZdef fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee	j
                 dee	j                 dee	j                 dee	j                 deee	j                          dee	j                 dee	j                 dee	j
                 dee         dee         dee         dee         deeef         f d            Z xZS )MvpForSequenceClassificationrU  rV  r   c                      t                      j        |fi | t          |          | _        t	          |j        |j        |j        |j                  | _        | 	                                 d S r   )
r5   r6   rS  r   r   r   
num_labelsclassifier_dropoutclassification_headr  )r7   r   kwargsr8   s      r*   r6   z%MvpForSequenceClassification.__init__  sq    **6***f%%
#8NN%	$
 $
  	r,   c                 l    | j                                          | j                            d           d S r8  )r   rf  r  re  r\  s    r*   rf  z3MvpForSequenceClassification.set_lightweight_tuning)  s3    
))+++ //66666r,   Nr   ri   rg  rh  r  ri  rB  rj  r  rk  r  r   rl   r  r  rn   c                    ||n| j         j        }|d}||	t          d| j        j                   |                     |||||||||	|
||||          }|d         }|                    | j         j                                      |j	                  }t          t          j        |                    d                              dk    rt          d          ||ddf                             |                    d          d|                    d                    dddddf         }|                     |          }d}|n| j         j        p| j         j        dk    rd	| j         _        nS| j         j        dk    r7|j        t          j        k    s|j        t          j        k    rd
| j         _        nd| j         _        | j         j        d	k    r\t/                      }| j         j        dk    r1 ||                                |                                          }n |||          }n| j         j        d
k    rLt3                      } ||                    d| j         j                  |                    d                    }n*| j         j        dk    rt5                      } |||          }|s|f|dd         z   }||f|z   n|S t7          |||j        |j        |j        |j        |j         |j!        |j"        	  	        S )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        Fine-tuning a model on `num_labels` classes
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForSequenceClassification

        >>> num_labels = 2  # for example, this is a binary classification task
        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

        >>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor(1)  # the real label for inputs

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax()
        ```
        NFz8Passing input embeddings is currently not supported for ri   rg  rh  r  ri  rB  rj  r  rk  r   rl   r  r  r   r   z7All examples must have the same number of <eos> tokens.r#   
regressionsingle_label_classificationmulti_label_classificationr  )#r   r*  NotImplementedErrorr8   rG   r   eqeos_token_idr   r=   r+  r>   unique_consecutivesumr'   r~   ru   r  problem_typer  r<   r@   rK   r   squeezer   r   r   rb   rm  rn  rE  ro  r   rp  )r7   r   ri   rg  rh  r  ri  rB  rj  r  rk  r  r   rl   r  r  r   rg   eos_masksentence_representationr  r  r  r  s                           r*   rD   z$MvpForSequenceClassification.forward-  s@   Z &1%<kk$+B]I!:%d4>Kbdd   **)/#9/!5+'"7/!5#  
 
   
<< 899<<]=QRRu'Q8899A==VWWW"/!!!"<"A"A-BTBTUVBWBWY[]j]o]opr]s]s"t"tAAr111H#
 ))*ABB{'/;)Q../;DK,,[+a//V\UZ5O5OSYS_chclSlSl/LDK,,/KDK,{'<77"99;)Q..#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB0F G GUWYY)-III,..x// 	FY,F)-)9TGf$$vE.#3")"?&9$5&-&G")"?&9

 

 

 
	
r,   )NNNNNNNNNNNNNNN)rG   rH   rI   rt  r   r6   rf  r   r   r>   r9  rL   ru  r   r   r   r   r   rD   rM   rN   s   @r*   r  r    s        89VWy      7 7 7  15158<=A,0487;=A59=A-1$(,0/3&*!T
 T
E,-T
 !.T
 $E$45	T

 !))9 :T
 EL)T
 $EL1T
 'u|4T
 "$u'8"9:T
   12T
  ((9:T
 )*T
 D>T
 $D>T
 'tnT
  d^!T
" 
u55	6#T
 T
 T
 ^T
 T
 T
 T
 T
r,   r  c            &           e Zd ZddgZ fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej	                 deej	                 deej
                 d	eej
                 d
eej	                 deej	                 deej	                 deeej                          deej
                 deej
                 deej                 deej                 dee         dee         dee         dee         deeef         f"d            Z xZS )MvpForQuestionAnsweringrU  rV  c                    t                                          |           d|_        |j        | _        t          |          | _        t          j        |j        |j                  | _        | 	                                 d S r2   )
r5   r6   r  rS  r   r   r\   hidden_size
qa_outputsr  r   s     r*   r6   z MvpForQuestionAnswering.__init__  sm        +f%%
)F$68IJJ 	r,   c                 l    | j                                          | j                            d           d S r8  )r   rf  r  re  r\  s    r*   rf  z.MvpForQuestionAnswering.set_lightweight_tuning  s2    
))+++&&u-----r,   Nr   ri   rg  rh  r  ri  rB  rj  start_positionsend_positionsr  rk  r   rl   r  r  rn   c                    ||n| j         j        }|	|
d}|                     ||||||||||||||          }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d}|	|
t          |	                                          dk    r|	                    d          }	t          |
                                          dk    r|
                    d          }
|                    d          }|		                    d|          }	|
	                    d|          }
t          |          } |||	          } |||
          }||z   d	z  }|s||f|dd         z   }||f|z   n|S t          ||||j        |j        |j        |j        |j        |j        |j        

  
        S )a`  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        Fine-tuning a model for extrative question answering, and our model also supports generative question answering
        using `BartForConditionalGeneration`
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
        ...     return_tensors="pt",
        ... )
        >>> target_start_index = torch.tensor([18])
        >>> target_end_index = torch.tensor([19])

        >>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        >>> predict_answer = tokenizer.decode(predict_answer_tokens)
        ```
        NFr  r   r   r#   rp   )ignore_indexr3   )
r  start_logits
end_logitsrb   rm  rn  rE  ro  r   rp  )r   r*  r   r  r   r  
contiguousr+  ru   r   r   r   rb   rm  rn  rE  ro  r   rp  )r7   r   ri   rg  rh  r  ri  rB  rj  r  r  r  rk  r   rl   r  r  r   sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                               r*   rD   zMvpForQuestionAnswering.forward  s[   f &1%<kk$+B]&=+DI**)/#9/!5+'"7/!5#  
 
" "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R F 0:/EZMF**6Q2%!#3")"?&9$5&-&G")"?&9
 
 
 	
r,   rr  )rG   rH   rI   rt  r6   rf  r   r   r>   rL   r9  ru  r   r   r   r   r   rD   rM   rN   s   @r*   r  r    s       79VW
 
 
 
 
. . .  -1158<=A,0487;=A6:4859=A$(,0/3&*#Q
 Q
EL)Q
 !.Q
 $E$45	Q

 !))9 :Q
 EL)Q
 $EL1Q
 'u|4Q
 "$u'8"9:Q
 "%"23Q
   01Q
   12Q
  ((9:Q
 D>Q
 $D>Q
  'tn!Q
" d^#Q
$ 
u99	:%Q
 Q
 Q
 ^Q
 Q
 Q
 Q
 Q
r,   r  c                   (     e Zd ZdZ fdZd Z xZS )MvpDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                 r    t                                          |           t          |          | _        d S r   )r5   r6   r;  rZ  r   s     r*   r6   zMvpDecoderWrapper.__init__u  s.       !&))r,   c                      | j         |i |S r   )rZ  )r7   argsr  s      r*   rD   zMvpDecoderWrapper.forwardy  s    t|T,V,,,r,   )rG   rH   rI   rJ   r6   rD   rM   rN   s   @r*   r  r  o  sQ         
* * * * *- - - - - - -r,   r  c            "           e Zd ZdgZ fdZd Zd Zd Zd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	eej                 d
eej                 deej                 deej                 deej                 deej                 dee         deej                 deej                 dee         dee         dee         dee         deej                 deeef         fd            Z xZS )MvpForCausalLMry  c                    d|_         d|_        t                                          |           t	          |          | _        t          j        |j        |j	        d          | _
        |                                  d S )NTFrY   )rU   is_encoder_decoderr5   r6   r  r   r   r\   r  r  r|  r  r   s     r*   r6   zMvpForCausalLM.__init__  sp     $)!   &v..
y!3V5FUSSS 	r,   c                 $    | j         j        j        S r   r   rZ  r  r\  s    r*   r]  z#MvpForCausalLM.get_input_embeddings  s    z!..r,   c                 (    || j         j        _        d S r   r  r_  s     r*   ra  z#MvpForCausalLM.set_input_embeddings  s    */
'''r,   c                     || j         _        d S r   r   rZ  )r7   rZ  s     r*   set_decoderzMvpForCausalLM.set_decoder  s    $
r,   c                     | j         j        S r   r  r\  s    r*   r  zMvpForCausalLM.get_decoder  s    z!!r,   c                 l    | j                                          | j                            d           d S r8  r  r\  s    r*   rf  z%MvpForCausalLM.set_lightweight_tuning  r  r,   Nr   ri   r   r   r  rB  rb   r  r  r   rl   r  r  rm   rn   c                    ||n| j         j        }||n| j         j        }||n| j         j        }| j                            |||||||||
|||          }|                     |d                   }d}|	Kt                      } ||                    d| j         j	                  |	                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j        |j                  S )a  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MvpForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 8, 50267]
        ```N)r   ri   r   r   r  rB  rb   r  r   rl   r  r  r   r#   r   )r  r  rb   rg   r)  rE  )r   rl   r  r*  r   rZ  r|  r   r~   r  r   rb   rg   r)  rE  )r7   r   ri   r   r   r  rB  rb   r  r  r   rl   r  r  rm   r   r  r  r  r  s                       r*   rD   zMvpForCausalLM.forward  sR   Z 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] *$$)"7#9!5+'/!5# % 
 
 gaj))'))H8FKKDK,BCCV[[QS__UUD 	DY,F'+'7D7V##VC0#3!/)$5
 
 
 	
r,   )NNNNNNNNNNNNNN)rG   rH   rI   rt  r6   r]  ra  r  r  rf  r   r   r>   r9  rL   r   r   r   r   r   r   rD   rM   rN   s   @r*   r  r  }  s       *+	 	 	 	 	/ / /0 0 0% % %" " "+ + +  1515=A>B,07;+/59-1$(,0/3&*15T
 T
E,-T
 !.T
  ((9:	T

 !)): ;T
 EL)T
 'u|4T
 "%T
   12T
 )*T
 D>T
 $D>T
 'tnT
 d^T
 !.T
  
u77	8!T
 T
 T
 ^T
 T
 T
 T
 T
r,   r  )r  rx  r  r  rS  r   )ArJ   r  typingr   r   r>   r   torch.nnr   r   r   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   utils.deprecationr   configuration_mvpr   
get_loggerrG   rF  rL   rK   r+   r   r.   ModulerP   r   r   r   r   r   r  r;  rS  rx  r  r  r  r  __all__r  r,   r*   <module>r     s@      " " " " " " " "        A A A A A A A A A A ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) )        : 9 9 9 9 9                  . - - - - - , , , , , , , , 0 0 0 0 0 0 ( ( ( ( ( ( 
	H	%	%%, c [^    "; ; ; ; ;BL ; ; ;6^2 ^2 ^2 ^2 ^229 ^2 ^2 ^2B@+ @+ @+ @+ @+0 @+ @+ @+Fs s s s s0 s s sn    BI   0    	   2        6q
 q
 q
 q
 q
# q
 q
 q
hC
 C
 C
 C
 C
# C
 C
 C
L Y
 Y
 Y
 Y
 Y
! Y
 Y
 Y
x   
mh mh mh mh mh"4o mh mh 
mh`   i
 i
 i
 i
 i
#5 i
 i
 i
X e
 e
 e
 e
 e
0 e
 e
 e
R- - - - -* - - -s
 s
 s
 s
 s
' s
 s
 s
l  r,   