
     `i{                       d Z ddlZddlZddlZddlmZ ddlmZmZ ddl	Z	ddl	m
Z
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z! ddl"m#Z#  ej$        e%          Z&dBdZ'd Z(dBdZ)d Z*e ed           G d de                                  Z+e ed           G d de                                  Z,e ed           G d d e                                  Z-e ed           G d! d"e                                  Z.e G d# d$e                      Z/ G d% d&ej0                  Z1 G d' d(ej2                  Z3 G d) d*ej2                  Z4 G d+ d,ej2                  Z5 G d- d.e          Z6 G d/ d0e          Z7 ed1           G d2 d3e/                      Z8 ed4           G d5 d6e/                      Z9e G d7 d8e/                      Z: ed9           G d: d;e/e                      Z; ed<           G d= d>e/e                      Z< G d? d@e/          Z=g dAZ>dS )CzRPyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).    N)	dataclass)OptionalUnion)Tensornn)	LayerNorm   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer)BaseModelOutput)PreTrainedModel)ModelOutputauto_docstringlogging)deprecate_kwarg   )ProphetNetConfigFc                     |r3t           j                            |                                 |          S t           j                            | |t          j                  S )Ndimr   dtype)r   
functionalsoftmaxfloattorchfloat32)hidden_stater   
onnx_traces      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/prophetnet/modeling_prophetnet.pyr   r   )   sQ     Q}$$\%7%7%9%9s$CCC}$$\s%-$PPP    c                    t          j        || | f||          t          j        |          j        z  }|                                                                }t          |          D ]>}||                             dd           ||                             | dz              ?d|dddddf<   t          j	        ||gd          S )	z@
    This function computes the bias for the predict stream
    )devicer   r   F)wrapr   N   r   )
r    onesfinfomindetachclonerangefill_diagonal_triu_cat)sequence_lengthngramr'   r   
left_blockright_block
stream_idxs          r$   ngram_attention_biasr8   0   s    
 	
E?O<VSXYYY\a\ghm\n\n\rr  ##%%++--KEll 6 6
J..qu.===:$$j[1_5555Jqqq!!!Qw9j+.A6666r%   c                    | }d}|rY| dz  } |t          j        |t          j        |                                                    | z  z   }t          j        |          }n't          j        |t          j        |                    }| dz  }t          j        ||          }|t          j        |                                |z            t          j        ||z            z  | |z
  z  z   }t          j	        |t          j
        |          | dz
  z                                            }|t          j        ||                                |          z   }|S )zo
    This function computes individual parts of the relative position buckets. For more detail, see paper.
    r   r)   r   )r    lt
zeros_likeintabsmaxlogr   mathr,   	ones_likewhere)	num_bucketsmax_distancerelative_positionsis_bidirectionalinv_relative_positionsrel_positions_bucket	max_exactis_smallval_if_larges	            r$   compute_relative_bucketsrL   A   sg    10 m!Q& h-u/?@V/W/WXX\\^^allm 	 "'+A!B!B!&+A5CSTjCkCk!l!lq Ix.	::Huy)?)E)E)G)G))STTW[W_y X X  	y	  " "L 9\5?<+H+HKZ[O+\]]aaccL/%+hH^HbHbHdHdfr2s2ssr%   c                    |                     d                              d|                    d          d          }||                     d          z
  }t          j        |dz
  |fd                               d          }|                    d|                    d          d          }||                     d          z
  }t          | ||d          }t          | ||d          }||fS )zm
    This function computes both main and predict relative position buckets. For more detail, see paper.
    r   r   F)rF   )	unsqueezerepeatsizer    r2   rL   )rC   rD   position_idsmain_stream_relative_positions$predicting_stream_relative_positionsmain_relative_position_buckets!predict_relative_position_bucketss          r$   #compute_all_stream_relative_bucketsrW   \   s   
 &2%;%;A%>%>%E%EaIZIZ[]I^I^`a%b%b"%ClF\F\]_F`F`%`" ,19lQ6F5U[]+^+^+^+h+hij+k+k(+O+V+VWXZfZkZklnZoZoqr+s+s(+OR^RhRhikRlRl+l( &>\#ATY& & &" )A\#GZ_) ) )% *+LLLr%   zF
    Base class for sequence-to-sequence language models outputs.
    )custom_introc                   $   e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed	<   dZeeej                          ed
<   dZeeej                          ed<   dZeej                 ed<   dZeeej                          ed<   dZeeej                          ed<   ed             ZdS )ProphetNetSeq2SeqLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    Nlosslogitslogits_ngrampast_key_valuesdecoder_hidden_statesdecoder_ngram_hidden_statesdecoder_attentionsdecoder_ngram_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentionsc                 D    t          j        dt                     | j        S Nzi`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions` instead.warningswarnFutureWarningrc   selfs    r$   decoder_cross_attentionsz2ProphetNetSeq2SeqLMOutput.decoder_cross_attentions   )    	
 	
 	

 $$r%   )__name__
__module____qualname____doc__r[   r   r    FloatTensor__annotations__r\   r]   r^   r   r_   tupler`   ra   rb   rc   rd   re   rf   propertyro    r%   r$   rZ   rZ   s   s         < )-D(5$
%,,,*.FHU&'...04L(5,-444'+OXe_+++@D8E%*;$<=DDDFJ%0A*B!CJJJ=Au'8!9:AAACGhuU->'?@GGG;?huU%678???=Ax(9:AAA@D8E%*;$<=DDD=Au'8!9:AAA% % X% % %r%   rZ   z
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.
    c                      e Zd ZU dZej        ed<   dZeej                 ed<   dZ	ee
         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed	<   dZeeej                          ed
<   dZeej                 ed<   dZeeej                          ed<   dZeeej                          ed<   ed             ZdS )ProphetNetSeq2SeqModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    last_hidden_stateNlast_hidden_state_ngramr^   r_   r`   ra   rb   rc   rd   re   rf   c                 D    t          j        dt                     | j        S rh   ri   rm   s    r$   ro   z5ProphetNetSeq2SeqModelOutput.decoder_cross_attentions   rp   r%   )rq   rr   rs   rt   r    ru   rv   r}   r   r^   r   r_   rw   r`   ra   rb   rc   rd   re   rf   rx   ro   ry   r%   r$   r{   r{      sb         : ((((;?Xe&78???'+OXe_+++@D8E%*;$<=DDDFJ%0A*B!CJJJ=Au'8!9:AAACGhuU->'?@GGG;?huU%678???=Ax(9:AAA@D8E%*;$<=DDD=Au'8!9:AAA% % X% % %r%   r{   zs
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                   V   e Zd ZU dZej        ed<   dZeej                 ed<   dZ	ee
         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed	<   dZeeej                          ed
<   dS )ProphetNetDecoderModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    r|   Nr}   r^   hidden_stateshidden_states_ngram
attentionsngram_attentionsrc   )rq   rr   rs   rt   r    ru   rv   r}   r   r^   r   r   rw   r   r   r   rc   ry   r%   r$   r   r      s          6 ((((;?Xe&78???'+OXe_+++8<M8E%"345<<<>B%(9":;BBB59Ju012999;?huU%678???;?huU%678?????r%   r   c                      e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed	<   dZeeej                          ed
<   dZeeej                          ed<   dS )ProphetNetDecoderLMOutputa	  
    ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    Nr[   r\   r]   r^   r   r   r   r   rc   )rq   rr   rs   rt   r[   r   r    ru   rv   r\   r]   r^   r   r   rw   r   r   r   rc   ry   r%   r$   r   r     s           D )-D(5$
%,,,*.FHU&'...04L(5,-444'+OXe_+++8<M8E%"345<<<>B%(9":;BBB59Ju012999;?huU%678???;?huU%678?????r%   r   c                   .    e Zd ZU eed<   dZdZd Zd ZdS )ProphetNetPreTrainedModelconfig
prophetnetTc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 d S d S t          |t          j
                  r]|j        j                            d| j        j                   |j        -|j        j        |j                 	                                 d S d S d S )N        )meanstd)
isinstancer   Linearweightdatanormal_r   init_stdbiaszero_	Embeddingpadding_idx)rn   modules     r$   _init_weightsz'ProphetNetPreTrainedModel._init_weightsP  s    fbi(( 	?M&&CT[5I&JJJ{& &&((((( '&-- 	?M&&CT[5I&JJJ!-"6#56<<>>>>>	? 	?--r%   c                    | j         j        }| j         j        }|
J d            |                    |j                  }|dd df                                         |ddd f<   ||d<   |
J d            |                    |dk    |           t          j        |dk              	                                s
J d	            |S )
Nzself.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information.rN   r   ).r   z1self.model.config.pad_token_id has to be defined.r   z8Verify that `shifted_input_ids` has only positive values)
r   decoder_start_token_idpad_token_id	new_zerosshaper.   masked_fill_r    allitem)rn   	input_idsr   r   shifted_input_idss        r$   _shift_rightz&ProphetNetPreTrainedModel._shift_rightZ  s    !%!C{/%11F 211 &//	@@%.sCRCx%8%>%>%@%@#qrr'"$:&!'')\'''&&'8D'@,OOOy*a/005577ss9sss7  r%   N)	rq   rr   rs   r   rv   base_model_prefixsupports_gradient_checkpointingr   r   ry   r%   r$   r   r   J  sL         $&*#? ? ?! ! ! ! !r%   r   c                   B     e Zd ZdZdeddf fdZd fd	Z fdZ xZS )	ProphetNetPositionalEmbeddingsa  
    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
    the forward function.
    r   returnNc                     |j         | _        t                                          |j         |j        |j                   d S N)max_position_embeddings
max_lengthsuper__init__hidden_sizer   rn   r   	__class__s     r$   r   z'ProphetNetPositionalEmbeddings.__init__x  s8     879KVM`aaaaar%   c                 h   || j         
J d            ||q|                                dk    rY|                                }|d         |z   }t          j        dt          j        |          t          | j         |z             z  }n|!t          j        |t          j        |          }t          j        |d                              |          |z                                  | j         z   }|                    d| j	        dz
            }t                                          |          |fS )NzCIf position_ids is pre-computed then padding_idx should not be set.r   r   )r   r   r   r'   r   )r   get_seq_lengthr    r*   longr<   cumsumtype_asclampr   r   forward)	rn   inputs_shaper'   attention_maskr^   rR   prev_num_input_idsnum_input_idsr   s	           r$   r   z&ProphetNetPositionalEmbeddings.forward|  s4   $$*:*B*BQ +C*BC */M/M/O/OST/T/T &5%C%C%E%E" ,Q2D D$z&
6RRR(=899  ")%*ZEJW]%^%^%^N LQ777??OOR``$&&4+ ,
  ,11!T_q5HIIww|,,l::r%   c                 F    t                                          |          S r   )r   r   )rn   rR   r   s     r$   _forwardz'ProphetNetPositionalEmbeddings._forward  s    ww|,,,r%   )NNN)	rq   rr   rs   rt   r   r   r   r   __classcell__r   s   @r$   r   r   q  s         b/ bD b b b b b b; ; ; ; ; ;8- - - - - - - - -r%   r   c                       e Zd ZdZddededee         f fdZ eddd	
          	 	 	 	 	 	 ddee	         dee	         dee	         dee
         dee         deej	                 dee	ee	         f         fd            Z xZS )ProphetNetAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr   num_attn_heads	layer_idxc                    t                                                       |j        }|j        | _        |j        | _        || _        ||z  | _        || _        | j        |z  |k    s
J d            t          j	        ||          | _
        t          j	        ||          | _        t          j	        ||          | _        t          j	        ||          | _        d S )Nzw`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`)r   r   r   attention_dropoutdropoutr   head_dimr   r   r   key_proj
value_proj
query_projout_proj)rn   r   r   r   r   r   s        r$   r   zProphetNetAttention.__init__  s    (!'!9~,#~5"}~-<<<4 =<<
 	+{;;)K==)K==	+{;;r%   past_key_valuer^   4.58new_nameversionFkey_value_statesr   layer_head_maskoutput_attentionscache_positionr   c                 	   |                                 \  }}	}
|d u}t          |                                           ||	|
gk    s%J d||	|
f d|                                              |                     |          | j        dz  z  }d}|Ht	          |t
                    r1|j                            | j                  }|r|j	        }n
|j
        }n|}|r|n|}|r3|1|r/|j        | j                 j        }|j        | j                 j        }n|                     |          }|                     |          }|                    |d| j        | j                                      dd          }|                    |d| j        | j                                      dd          }|N|s|nd }|                    ||| j        d|i          \  }}|r$t	          |t
                    rd	|j        | j        <   |                    ||	| j        | j                                      dd          }|                     d          }t)          j        d
||                    dd                    }|| j        |	|f}|                                 |k    r't-          d| d|                                            ||                                dk    rd }|| j        d|f}|?|                                 |k    r't-          d| d|                                            |||z   }|r|}nd }t0          j                            |d          }||                                 | j        fk    s(J d| j        f d|                                              |                    dddd          |                    || j        |	|          z  }|                    dddd          |z  }t0          j                            || j        | j                  }t)          j        d
||          }|| j        |	| j        f}|                                 |k    r't-          d| d|                                            |                    dd                              ||	|
          }|                     |          }t0          j                            || j        | j                  }||fS )Nz Size of hidden states should be 	, but is       ?FrN   r   r)   r   Tzbsij,bsjk->bsikr	   z#Attention weights should have size r   z Attention mask should have size r   /Head mask for a single layer should be of size ptrainingz `attn_output` should have shape , but is of shape ) rQ   listr   r   r   r   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   viewr   	transposeupdater    einsum
ValueErrorr   r   r   r   r   r   r   reshaper   )rn   r   r   r   r   r^   r   r   
batch_sizetgt_lenr   is_cross_attentionquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statessrc_lenattn_weightsexpected_shapeattn_weights_reshaped
attn_probsattn_outputs                           r$   r   zProphetNetAttention.forward  s    ,9+=+=+?+?(
G[ .T9M&&(()).
 
 
 
 pj';-OooYfYkYkYmYmoo	
 
 
 }559KL
&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#-?R))] 	F/"=*"=,3DNCHJ.5dnELLL~66J??>::L#R9Ldm\\ffghjkllJ',,ZT=PRVR_``jjklnoppL*7I!St+>+E+Edn?OQ_>`, ,(
L & F*_FY*Z*Z FAEO.t~>#((Wd>QSWS`aakklmopqq//!$$|$5|ZEYEYZ[]^E_E_``$d&97GL.00q>qq\h\m\m\o\oqqrrr %.*<*<*>*>!*C*C!N$d&91gF%.*=*=*?*?>*Q*QpppYgYlYlYnYnppqqq%'.8L 	)$0!!$(!},,\r,BB&"''))d.A-CCCC-4CVBX - -#((**- - DCC +//2q!<<|?P?PD/'@ @ L
 %4$8$8B1$E$EH]$]!]**$] + 
 


 l#4j,OO$d&97DMR//vvvbmbrbrbtbtvvwww!++Aq1199*g{[[mmK00m++K4<RVR_+``111r%   r   )NNNNFN)rq   rr   rs   rt   r   r<   r   r   r   r   r   boolr    rw   r   r   r   s   @r$   r   r     s6       GG< </ < <QYZ]Q^ < < < < < <* _%0A6RRR .2+/,0+/,115k2 k2 #6*k2 !(	k2
 "&)k2 "%k2 $D>k2 !.k2 
vx''	(k2 k2 k2 SRk2 k2 k2 k2 k2r%   r   c                   2     e Zd ZdZdedef fdZd Z xZS )ProphetNetFeedForwardzm
    This is the residual two feed-forward layer block based on the original Transformer implementation.
    r   ffn_dimc                 "   t                                                       t          |j                 | _        t          j        |j        |          | _        t          j        ||j                  | _	        |j
        | _
        |j        | _        d S r   )r   r   r
   activation_functionactivation_fnr   r   r   intermediateoutputactivation_dropoutr   )rn   r   r  r   s      r$   r   zProphetNetFeedForward.__init__(  sn    #F$>?If&8'BBi);<<"(";~r%   c                 4   |                      |          }|                     |          }t          j                            || j        | j                  }|                     |          }t          j                            || j        | j                  }|S )Nr   )r  r  r   r   r   r  r   r  )rn   r   s     r$   r   zProphetNetFeedForward.forward0  s    ))-88**=99--mt?Vaean-ooM22--mt|VZVc-ddr%   )	rq   rr   rs   rt   r   r<   r   r   r   r   s   @r$   r   r   #  se         &/ &# & & & & & &      r%   r   c                        e Zd Zddef fdZd Zd Z eddd	          	 	 	 	 	 	 	 	 ddee	         fd
            Z
d Zd Z xZS )ProphetNetNgramSelfAttentionNr   c                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | j        z  | _	        |j
        | _
        || _        | j	        | j        z  |j        k    s
J d            t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j        | j        | j        z            | _        d| _        d S )Nz6config.hidden_size must be divisible by num_attn_headsF)r   r   r   rC   relative_max_distancenum_decoder_attention_headsr   r   r   r   r4   r   r   r   r   r   r   r   relative_pos_embeddingsr#   rn   r   r   r   s      r$   r   z%ProphetNetNgramSelfAttention.__init__;  s?   !-!-%+%A"$@~!'!9*d.AA\
"}t22f6HHHHD IHH 	&"4f6HII)F$68JKK)F$68JKK 	&"4f6HII (*y1CTEUX\XkEk'l'l$  r%   c                     |                     ||| j        | j                                      dd                                          S Nr   r)   )r   r   r   r   
contiguous)rn   tensorseq_lenr   s       r$   _shapez#ProphetNetNgramSelfAttention._shapeY  s=    {{:w0CT]SS]]^_abccnnpppr%   c                     d| _         d S )NT)r#   rm   s    r$   prepare_for_onnx_export_z5ProphetNetNgramSelfAttention.prepare_for_onnx_export_\  s    r%   r   r^   r   r   c
                   *+ |                                 \  }
}}t          |                                           |
||gk    sJ d|
||f d|j                     |                     |          }|                     |          }|                     |          }|| j        dz  z  }|                     |||
          }|                     |d|
          }|                     |d|
          }|
| j        d| j        f} |j	        | } |j	        | } |j	        | }|
                    d| j        z   d          }|
                    d| j        z   d          }|
                    d| j        z   d          }|
                    d| j        z   d          }|d         |dd          }}|d         |dd          }}|d         |dd          c*}|d         |dd          c+}|At          |t                    r|j        }n|}|                    *+| j        d	|	i          \  *+|d| j        z   z  }t#          j        d
|*                    dd                    }|                     ||||          }||z   }|||z   }t+          |d| j                                      |          }|||                                 | j        fk    s(J d| j        f d|                                              |                    dddd          |                    |
| j        d|          z  }t2          j                            || j        | j                  }t#          j        d
|+          } |                     dd          	                    |
d||          } |                     |           } t#          j        |d                              |
| j        | j        || j                  }!t#          j        *fd|D             d          }"t#          j        |d          }#t#          j         +fd|D             d          }$t#          j        d|!|"f          }%| !                    |#|%||          }&|%|&z   }%|8|"                    ddddd          }|#                    |%j$                  }|%|z   }%t+          |%d| j                                      |%          }'|b|                                 | j        fk    s(J d| j        f d|                                              |                    ddddd          |'z  }'t2          j                            |'| j        | j                  }'t#          j        d|'|$                    dd          f          }(|(                    dd          }(|(	                    |
| j        ||          }(|                     |(          }(t#          j         | |(gd                              |
d|          })|                    |
| j        |d          }t2          j                            |)| j        | j                  })|)||'fS )Nz#`hidden_states` should be of shape r   r   rN   r   r   r)   r   r   zbntc,bncs->bntsr	   )r   r#   r   r   r   c                 >    g | ]}t          j        |gd           S r)   )r    r2   ).0keymain_key_statess     r$   
<listcomp>z8ProphetNetNgramSelfAttention.forward.<locals>.<listcomp>  s+    )r)r)rSV%)_c4JA*N*N)r)r)rr%   c                 d    g | ],}t          j        |gd                               d           -S r  )r    r2   rO   )r  v_pmain_value_statess     r$   r  z8ProphetNetNgramSelfAttention.forward.<locals>.<listcomp>  s9    fffSUY)3/33==a@@fffr%   zbnhtc,bnhsc->bnhts   zbnhts,bnhsc->bnhtc)%rQ   r   r   r   r   r   r   r  r   r   chunkr4   r   r   r   r   r   r    r   r    get_main_relative_pos_embeddingsr   r#   r   r   r   r   r   r   r   r   stackr2   #get_predict_relative_pos_embeddingspermutetor   ),rn   r   r^   r   r   extended_predict_attention_maskrU   rV   rR   r   r   ngram_sequence_lengthr   r   r   r   
proj_shapehidden_states_listquery_states_listkey_states_listvalue_states_listmain_hidden_stateshidden_states_predict_listmain_query_statespredict_query_states_listpredict_key_states_listpredict_value_states_listr   r3   main_attn_weightsmain_relative_pos_embeddingsmain_attn_probsmain_attn_outputpredict_query_statespredict_key_statespredict_hidden_statespredict_value_statespredict_attn_weightspredict_relative_pos_embeddingspredict_attn_probspredict_attn_outputr   r  r!  s,                                             @@r$   r   z$ProphetNetNgramSelfAttention.forward_  s    :G9K9K9M9M6
);M&&(())j:OQ\-]]]]&*>SU`1a & &#& & ^]] }55]]=11
}55 $t}c'9: {{<1F
SS[[R<<
{{<Z@@ $"5r4=I
+|+Z8'Z'4
+|+Z8 +00TZQ0GG(..q4:~1.EE$**1tz>q*AA(..q4:~1.EE9KA9NPbcdcecePf67H7KM^_`_a_aMb43B13EWXWYWYGZ007H7KM^_`_a_aMb44 &/+>?? 6&5&J##&5#1D1K1K!2DNEUWeDf2 2.O.
 0A
NC "L):<MOhOhijlmOnOnoo (,'L'L 1<A_(
 (
$ .0LL% 1N B!
 
 
 '#
$
$	 	 &"''))d.A-CCCC-4CVBX - -#((**- - DCC .221b!Q??/BVBVD/_C C O -//4CYdhdq/rr
 !<(9?L]^^+55a;;CCJPQSbdopp==)9::  %{+DaHHMM
D$7$- 
  

 #[)r)r)r)rZq)r)r)rtuvv !&,FA N N N  %yffffLefffhi 
  
  %|,@CWYkBlmm +/*R*R!#7Gh+
 +
'
  46UU*6.M.U.UVWYZ\]_`bc.d.d+.M.P.PQeQk.l.l+#7:Y#Y $ 
 
 
 '&
'
'	 	 &"''))d.A-CCCC-4CVBX - -#((**- - DCC "1!5!5aB1!E!EHZ!Z]22$"84= 3 
 
 $l #57K7U7UVWYZ7[7["\
 
 2;;AqAA199*djRacnoo"mm,?@@ i!13F GKKPPQ[]_almm)..z4;NP_acddm++K4<RVR_+``O-???r%   c                 L   |j         \  }}}}|                    ||||          }||j         d d         \  }}	t          j        d|j         d         dz                                 d                              d                              ||	d                              |j                  }
|
|                    d                              ||	d          z
  }
t          | j	        | j
        |
d          }|                     |          }|                    |j         d d         | j	        | j        fz             }|                    dddd          }|                    |j         d d         dz             }|                    d| j        d          }|                    d|j         d                   }|                                }|                    d|                    d                    }t          j        |d|          }|                    |||d          }|S )	Nr)   r   rN   r   Fr	   )rN   r   index)r   r   r    arangerO   rP   r(  r'   rL   rC   r  r  r   r'  r   r   rQ   gather)rn   r   r   rR   rU   r   r   r   r   r3   rE   rel_pos_embeddingsr7  s                r$   r$  z=ProphetNetNgramSelfAttention.get_main_relative_pos_embeddings  s9    8D7I4
NGW#((^WgVV)1*7*=bqb*A'JQ 22 6 :;;11
OQ77L'((  "4l6L6LQ6O6O6V6VWacrtu6v6v!v-E $"<>PRW. .*
 "99-HH/44$RaR(D,<d>Q+RR
 
 0771aCC/778J2A28NQV8VWW)G)N)NqRVRegh)i)i&)G)L)L.4R8*
 *
& *H)L)L)N)N&/77<N<S<STV<W<WXX',|4FAUs't't't$'C'H'HUcelnp'q'q$++r%   c                 `   |j         dd         \  }}||j         d         }|d         d         |dz
  k    s
J d            t          j        d|                              d                              d                              ||d                              |j                  }||                    d                              ||d          z
  }t          | j        | j	        |d          }|
                    dd          }|                     |          }	|	                    |j         d d         | j        | j        fz             }	|	                    ddddd          }	|	                    d| j                  }	|                    d          }|                    | j        d| j        d          }|                    d|                    d                                                    }t          j        |	d|	          }
|
                    || j        | j        |d          }
|
S )
Nr   r)   rN   r   zb`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)Fr"  r	   rC  )r   r    rE  rO   rP   r(  r'   rL   rC   r  r   r  r   r   r'  r   r4   rQ   r   rF  )rn   r   r   rR   rV   r   r3   key_sequence_lengthrE   rG  r?  s              r$   r&  z@ProphetNetNgramSelfAttention.get_predict_relative_pos_embeddingsA  sU    '4&9!A#&>#
O,4"."4R"8?1%)<q)@@@@t A@@ Q 34411
OQ77L'((  "4l6L6LQ6O6O6V6VWacrtu6v6v!v0H $"<>PRW1 1-
 &//155!99-HH 044$(8$:M'NN
 
 0771aAFF/77D<LMM,M,W,WXY,Z,Z),M,T,TJ4.-
 -
) -N,R,R166r::-
 -

$&& 	* +0,A-N+
 +
 +
'
 +J*N*N
D$7"+
 +
' /.r%   r   )NNNNNNNN)rq   rr   rs   r   r   r  r  r   r   r   r   r$  r&  r   r   s   @r$   r
  r
  :  s           /            <q q q   _%0A6RRR ,0(,'+*.r@ r@ "%r@ r@ r@ SRr@h+, +, +,Z9/ 9/ 9/ 9/ 9/ 9/ 9/r%   r
  c                   8     e Zd ZdZdef fdZ	 ddefdZ xZS )ProphetNetEncoderLayerz&
    Encoder block for Prophetnet
    r   c                    t                                                       t          ||j                  | _        t          |j                  | _        t          ||j	                  | _
        t          |j                  | _        d S r   )r   r   r   num_encoder_attention_heads	self_attnr   r   self_attn_layer_normr   encoder_ffn_dimfeed_forwardfeed_forward_layer_normr   s     r$   r   zProphetNetEncoderLayer.__init__  sp    ,VV5WXX$-f.@$A$A! 2&&:PQQ'01C'D'D$$$r%   Fr   c                     |                      ||||          \  }}|                     ||z             }|                     |          }|                     ||z             }|f}|r||fz  }|S )N)r   r   r   r   )rN  rO  rQ  rR  )	rn   r   r   r   r   attention_outputr   feed_forward_outputoutputss	            r$   r   zProphetNetEncoderLayer.forward  s     *.')+/	 *8 *
 *
&, 112B]2RSS #//>>445H=5XYY " 	'&Gr%   F)	rq   rr   rs   rt   r   r   r   r   r   r   s   @r$   rK  rK  }  s~         E/ E E E E E E #( 
         r%   rK  c            	            e Zd ZdZddef fdZ eddd          	 	 	 	 	 	 	 	 	 	 	 	 	 ddee         dee         dee	j
                 fd            Z xZS )ProphetNetDecoderLayerz&
    Decoder block for Prophetnet
    Nr   c                    t                                                       t          ||          | _        t	          |j                  | _        |j        r5t          ||j	        |          | _
        t	          |j                  | _        t          ||j                  | _        t	          |j                  | _        d S )Nr   )r   r   r
  rN  r   r   rO  add_cross_attentionr   r  
cross_attncross_attn_layer_normr   decoder_ffn_dimrQ  rR  r  s      r$   r   zProphetNetDecoderLayer.__init__  s    5f	RRR$-f.@$A$A! % 	G1&&:\hqrrrDO)263E)F)FD& 2&&:PQQ'01C'D'D$$$r%   r   r^   r   r   TF	use_cacher   r   c           
      `   |                      |||||||	|
          \  }}}|                     ||z             }d }|6|                     ||||||          \  }}|                     ||z             }|                     |          }|                     ||z             }|f}|r||||fz  }|S )N)r   r^   r   r   r)  rU   rV   rR   )r   r   r   r   r^   r   )rN  rO  r]  r^  rQ  rR  )rn   r   r   re   encoder_attn_maskr   cross_attn_layer_head_maskr)  rU   rV   rR   r^   r`  r   r   ngram_attention_outputself_attn_weightsself_attn_weights_ngramcross_attn_weightsrT  rU  rV  s                         r$   r   zProphetNetDecoderLayer.forward  s   & NR^^'+)+,K+I.O% N\ 	N
 	N
J 13J 11-BX2XYY! ,37??+!60 : /"3 4C 4 400 !667G-7WXXM #//>>445H=5XYY " 	X)+BDVWWGr%   r   )NNNNNNNNNNTFN)rq   rr   rs   rt   r   r   r   r   r   r    r   r   r   r   s   @r$   rY  rY    s         E E/ E E E E E E _%0A6RRR "#'(,'+*.$(,1154 4 D>4 $D>4 !.4 4 4 SR4 4 4 4 4r%   rY  z=
    The standalone encoder part of the ProphetNetModel.
    c                        e Zd Zddedeej                 f fdZd Zd Z	e
	 	 	 	 	 	 	 ddeej                 deej                 d	eej                 d
eej                 dee         dee         dee         deeef         fd            Z xZS )ProphetNetEncoderNr   word_embeddingsc                    t                                                     ||n%t          j        j        j        j                  | _        t                    | _	        t          j                  | _        t          j        fdt          j                  D                       | _        d| _        |                                  dS )7  
        word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
            The word embedding parameters. This can be used to initialize [`ProphetNetEncoder`] with pre-defined word
            embeddings instead of randomly initialized word embeddings.
        Nr   c                 .    g | ]}t                    S ry   )rK  )r  _r   s     r$   r  z.ProphetNetEncoder.__init__.<locals>.<listcomp>
  s"    $n$n$n%;F%C%C$n$n$nr%   F)r   r   r   r   
vocab_sizer   r   rj  r   position_embeddingsr   embeddings_layer_norm
ModuleListr/   num_encoder_layersr   gradient_checkpointing	post_initrn   r   rj  r   s    ` r$   r   zProphetNetEncoder.__init__  s     	    * Of/1CQWQdeee 	
 $B&#I#I %.v/A%B%B"m$n$n$n$nUSYSlMmMm$n$n$noo&+#r%   c                     | j         S r   rj  rm   s    r$   get_input_embeddingsz&ProphetNetEncoder.get_input_embeddings      ##r%   c                     || _         d S r   ry  rn   values     r$   set_input_embeddingsz&ProphetNetEncoder.set_input_embeddings      $r%   r   r   	head_maskinputs_embedsr   output_hidden_statesreturn_dictr   c                    ||n| j         j        }||n| j         j        }||n| j         j        }||t	          d          ||t	          d          |||                     |          }|md|ddddddf                             d| j         j        dd          z
  t          j	        | j
                  j        z  }|                    |j
                  }nd}|                     |j        dd         |j                  \  }	}
||	z   }|                     |          }t"          j                            || j         j        | j                  }|rdnd}|rdnd}|k|                                d	         t-          | j                  k    s;J d
t-          | j                   d|                                d	          d            t1          | j                  D ]<\  }}|r||fz   } ||||||         nd|          }|d	         }|r||d         fz   }=|r||fz   }|st3          d |||fD                       S t5          |||          S )a	  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetEncoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```Nz3Either input_ids or inputs_embeds has to be passed.z2Make sure to only pass input_ids or inputs_embeds.      ?r   r)   r   ry   r   z&The head_mask should be specified for  layers, but it is for .)r   r   r   c              3      K   | ]}||V  	d S r   ry   r  vs     r$   	<genexpr>z,ProphetNetEncoder.forward.<locals>.<genexpr>h  s(      llq^_^k^k^k^k^kllr%   )r|   r   r   )r   r   r  use_return_dictr   rj  rP   rM  r    r+   r   r,   r(  rq  r   r'   rr  r   r   r   r   rQ   lenr   	enumeraterw   r   )rn   r   r   r  r  r   r  r  extended_attention_maskrq  rR   r   re   all_attentionsidxencoder_layerlayer_outputss                    r$   r   zProphetNetEncoder.forward  s	   4 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]!6RSSS"}'@QRRR"}'< 00;;M %nQQQdAAA%56==aAhjkmnoooDJ''+',# '>&@&@AT&U&U##&*#,0,D,D]EXY[Z[Y[E\^k^r,s,s)\%(;;22=AA--mt{?R]a]j-kk&: D0:d  >>##A&3t{+;+;<<<xT[9I9Ixxbkbpbpbrbrstbuxxx =<< #,DK"8"8 	F 	FC# Q(=@P(P%)M63<3H3d"3	  M *!,M  F!/=3C2E!E 	M$9]<L$L! 	mll]4I>$Zllllll+;P]k
 
 
 	
r%   r   )NNNNNNN)rq   rr   rs   r   r   r   r   r   rz  r  r   r    r   r   r   rw   r   r   r   r   s   @r$   ri  ri    sC        / (2<BX      ,$ $ $% % %  -115,004,0/3&*T
 T
EL)T
 !.T
 EL)	T

  -T
 $D>T
 'tnT
 d^T
 
uo%	&T
 T
 T
 ^T
 T
 T
 T
 T
r%   ri  z=
    The standalone decoder part of the ProphetNetModel.
    c                        e Zd Zddedeej                 f fdZd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 d	eej                 d
eej                 deej                 deej                 dee         deej                 dee         dee         dee         dee         deej                 deeef         fd            Zd Zd Zd Z xZS )ProphetNetDecoderNr   rj  c                 j   t                                                     j        | _        j        | _        j        | _        j        | _        j        | _        ||n%t          j	        j
        j        j                  | _        t                    | _        t          j	        | j        j        d          | _        t          j        fdt%          j                  D                       | _        t+          j                  | _        d| _        |                                  dS )rl  Nrm  c                 2    g | ]}t          |           S )r[  )rY  )r  ir   s     r$   r  z.ProphetNetDecoder.__init__.<locals>.<listcomp>  s'    cccQ#Fa888cccr%   F)r   r   r4   rC   r  r   r   max_target_positionsr   r   rp  r   r   rj  r   rq  ngram_embeddingsrs  r/   num_decoder_layersr   r   rr  ru  rv  rw  s    ` r$   r   zProphetNetDecoder.__init__t  s    	   \
!-%+%A"~$*$B! * Of/1CQWQdeee 	
 $B&#I#I  "TZ9KT R Rmcccc%HaBbBbccc
 
 &/v/A%B%B"&+#r%   c                     | j         S r   ry  rm   s    r$   rz  z&ProphetNetDecoder.get_input_embeddings  r{  r%   c                     || _         d S r   ry  r}  s     r$   r  z&ProphetNetDecoder.set_input_embeddings  r  r%   r   r   re   encoder_attention_maskr  cross_attn_head_maskr^   r  r`  r   r  r  r   r   c                 
  %&' |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }||t          d          ||t          d          |||                     |          }|j        dd         \  %}| j        r%| j	        r|	rt                              d           d}	|	rO|M|6t          t          | j                   t          | j                             nt          | j                   }|	rCt          |t                    r.t                              d           t          j        |          }||                                nd	}|                     %|f|j        |
          \  }}|d	k    rd\  }}n|                     |          \  }}| j                            |dz             '||z   }| j        j        &|d	k    rJ|                    d          dk    s
J d            %&'fdt3          | j                  D             }d}d}nM&'fdt3          | j                  D             }|                     ||          }|                     ||          }|md|ddddddf                             d| j         j        dd          z
  t?          j         | j!                  j"        z  }|#                    |j!                  }nd}t?          j$        |g|z   d          }| j%        r| %                    |          }tL          j'        (                    || j(        | j	                  }|rdnd}|r| j         j        d	k    rdnd}|
rdnd}|
rdnd}|
r| j         j)        rdnd}tU          ||gddg          D ]u\  }}|n|                                d	         tW          | j,                  k    s>J d| dtW          | j,                   d|                                d	          d            vt[          | j,                  D ]\  } }!|r4||ddd|f         fz  }| j         j        d	k    r||dd|df         fz  } |!|||||||          nd|||          nd||||||	|
|          }"|"d	         }|
r0||"d         fz  }||"d         fz  }| j         j)        r||"d         fz  }|r4||ddd|f         fz  }| j         j        d	k    r||dd|df         fz  }|ddd|f         }#| j         j        d	k    r|dd|df         nd}$|s!t          d |#|$||||||fD                       S t]          |#|$||||||          S )aY  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```NzGEither `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.zFMake sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.r)   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r'   r^   )NNr   zOAt the moment `use_cache` is only supported for `decoder_input_ids` of length 1c                 V    g | ]%}|d z
           z                        d d           &S r   )rP   )r  r4   r   r  predicting_stream_pos_embeds     r$   r  z-ProphetNetDecoder.forward.<locals>.<listcomp>  sM     # # # "%!),/JJRRS]_`bcdd# # #r%   c                 ,    g | ]}|d z
           z   S r  ry   )r  r4   r  r  s     r$   r  z-ProphetNetDecoder.forward.<locals>.<listcomp>  s6     # # #PU!%!),/JJ# # #r%   r  r   ry   r  r  zThe `z` should be specified for r  r  )rb  r   rc  r)  rU   rV   rR   r^   r`  r   r   r	   c              3      K   | ]}||V  	d S r   ry   r  s     r$   r  z,ProphetNetDecoder.forward.<locals>.<genexpr>U  s4         =  !=== r%   )r|   r}   r^   r   r   r   r   rc   )/r   r`  r   r  r  r   rj  r   ru  r   loggerwarning_oncer   r   r   rw   from_legacy_cacher   rq  r'   !compute_buffered_relative_bucketsr   r  r   rQ   r/   r4   prepare_attention_maskprepare_predict_attention_maskrP   r  r    r+   r   r,   r(  r2   rr  r   r   r   r\  zipr  r   r  r   )(rn   r   r   re   r  r  r  r^   r  r`  r   r  r  r   r3   past_key_values_lengthmain_stream_pos_embedrR   rU   rV   r   ngram_hidden_statesr  r)  extended_encoder_attention_maskall_main_stream_hidden_statesall_ngram_stream_hidden_statesall_main_stream_attnsall_ngram_stream_attnsall_cross_attns	attn_mask	mask_namer  decoder_layerr  r|   r}   r   r  r  s(                                        @@@r$   r   zProphetNetDecoder.forward  s   J "+!6IIDK<Q	1B1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]!6fggg"}'@efff"}'< 00;;M&3&9"1"&=#
O& 	"4= 	" "##p   "	 	0 )4 $L$D$D$DlZ^ZeFfFfFfggg!555 
  	UOU;; 	U\  
 2COTTOETE`!?!?!A!A!Afg.2.F.F) '+ /G /
 /
+| "Q&&PZM*,M,M
 66|DD.1&*&>&G&GWXHX&Y&Y# &(==07 "Q&& %%a((A---a .--# # # # # #"4:..# # # '+#.2++# # # # #Y^_c_iYjYj# # # '+&A&A-Q_&`&`#.2.Q.QR_ao.p.p+ "-,QQQdAAA-=>EEaIprsuvwwwDJ''+/,+ /N.P.PQ^Qd.e.e++.2+	=/4G"GKK% 	F 66}EEM--mt|VZVc-dd /C(L%/C)gHY\]H]H]cg&&7 AT'8!Bd 1_dk6U_""[_ %(4H(IKYoKp$q$q 	 	 Iy$ ~~''*s4;/?/?@@@/I / /T[AQAQ / /!((+/ / / A@@ #,DK"8"8 	; 	;C# \--CSOCS@S2T1VV-;$q((2}QQQHXHXEX7Y6[[2)M'%"A3<3H3dI]Ii,@,E,Eos0O/M2S) /#"3-  M" *!,M  ;%-*:)<<%&=+;*==&;2 ;#a(8'::O 	X)mAAA?O?O<O.P-RR){ 1$$.=ODTDTAT3U2WW. *!!!-=o-=*=>HLHY\]H]H]-?3C3C0C"D"Dcg 	   &+#12)*#	      ,/$;+7 >,3,	
 	
 	
 		
r%   c           	         |j         \  }}t          j        d| j                                      |j                                      dd          }t          | j        | j	        |          \  }}|d d d |d |f                             |dd          }t          j
        |d d d |d |f         |d d d || j        | j        |z   f         gd                              |dd          }||fS r  )r   r    rE  r  r(  r'   rP   rW   rC   r  r2   )rn   rR   r   r3   main_relative_bucketspredict_relative_bucketss         r$   r  z3ProphetNetDecoder.compute_buffered_relative_bucketsn  s-   &2&8#
O|At'@AADD\EXYY``abdeff:]d8,;
 ;
77
 !6aaa9I/9IK[OK[6[ \ c cdnpqst u u#(9(,<_,<>N>N)NO(AA'')BTE^apEp)pp $
 $
 &Q
"
" 	! %&>>>r%   c                    |j         d d         \  }}t          j        ||ft          j        |j                  j        |j        |j                  }t          j        |d          }|d |d |f         d d d d d d f                             || j	        j
        f|j         z             }|8d|d d d d d d f         z
  t          j        | j                  j        z  }||z   }n|}|                    |j                  S )Nr)   r   r   r  )r   r    fullr+   r   r,   r'   triuexpandr   r  r(  )rn   r   r   r   
seq_lengthcausal_maskextended_causal_maskr  s           r$   r  z(ProphetNetDecoder.prepare_attention_mask  s0   !.!4RaR!8
J j$K+,,0% '	
 
 
 ja00*;J;+CDT4QRQRQRTUTUTUEUV]]@AKDUU 
  

 %'*^AAAtT111<L-M'MQVQ\]a]gQhQhQl&l#&:=T&T##&:#&))-*=>>>r%   c           	         |j         d d         \  }}t          | j        | j        |j        |j                  }t          j        |d d d |d |f         |d d d || j        | j        |z   f         gd          }|d d d d d d d d f                             || j	        j
        f|j         z             }|d|d d d d d d d f         z
  t          j        | j                  j        z  }|                    || j	        j
        | j        ||f          }t          j        |t          j        |          gd          }||z   }n|}|                    |j                  S )Nr)   rN   r   r  )r   r8   r  r4   r'   r   r    r2   r  r   r  r+   r,   r;   r(  )	rn   r   r   r   r  predict_causal_maskextended_predict_causal_maskr  r)  s	            r$   r  z0ProphetNetDecoder.prepare_predict_attention_mask  s   !.!4RaR!8
J 3%tz=3GI\
 
 $i#AAA{
{KZK$?@#AA{
{D$=@Y\f@f$ff 
 
 
 (;4qqq!!!QQQ;N'O'V'V@ADWD]](
 (
$
 %'*^AAAtT4QRQRQR<R-S'SW\WbcgcmWnWnWr&r#&=&D&DT[DdjR\^hi' '# ',i(%*:;R*S*STZ\' ' '# /KMd.d++.J+.11-2EFFFr%   r   NNNNNNNNNNNNN)rq   rr   rs   r   r   r   r   r   rz  r  r   r    r   r   r   r   rw   r   r   r  r  r  r   r   s   @r$   r  r  n  s        / (2<BX      >$ $ $% % %  -1158<9=,07;+/04$(,0/3&*15R
 R
EL)R
 !.R
  (5	R

 !) 6R
 EL)R
 'u|4R
 "%R
  -R
 D>R
 $D>R
 'tnR
 d^R
 !.R
 
u22	3R
 R
 R
 ^R
h? ? ?,? ? ?0!G !G !G !G !G !G !Gr%   r  c            &           e Zd ZddgZdef fdZd Zd Zd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deej                 deej                 dee         dee         deej                 deej                 dee         dee         dee         dee         deej                 deeef         f"d            Z xZS )ProphetNetModelencoder.word_embeddings.weightdecoder.word_embeddings.weightr   c                    t                                          |           t          j        |j        |j        |j                  | _        t          j	        |          }d|_
        d|_        t          || j                  | _        t          j	        |          }d|_        d|_        t          || j                  | _        |                                  d S )Nrm  FT)r   r   r   r   rp  r   r   rj  copydeepcopyr`  tie_encoder_decoderri  encoder
is_decoderr  decoderrv  )rn   r   encoder_configdecoder_configr   s       r$   r   zProphetNetModel.__init__  s       !|F,=v?Q_e_rsssv..#( -2*(9MNNv..$(!-2*(9MNN 	r%   c                     | j         S r   ry  rm   s    r$   rz  z$ProphetNetModel.get_input_embeddings  r{  r%   c                 X    || _         | j         | j        _         | j         | j        _         d S r   )rj  r  r  r}  s     r$   r  z$ProphetNetModel.set_input_embeddings  s*    $'+';$'+';$$$r%   c                     | j         j        rL|                     | j        j        | j                   |                     | j        j        | j                   d S d S r   )r   tie_word_embeddings_tie_or_clone_weightsr  rj  r  rm   s    r$   _tie_weightszProphetNetModel._tie_weights  s^    ;* 	[&&t|'CTEYZZZ&&t|'CTEYZZZZZ	[ 	[r%   c                     | j         S r   )r  rm   s    r$   get_encoderzProphetNetModel.get_encoder  s
    |r%   Nr   r   decoder_input_idsdecoder_attention_maskr  decoder_head_maskr  encoder_outputsr^   r  decoder_inputs_embedsr`  r   r  r  r   r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }||n| j         j        }||                     ||||
|||          }|                     |||d         ||||	||||||          }|s||z   S t          |j        |j	        |j
        |j        |j        |j        |j        |j        |j        |j        |j                  S )a7  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetModel

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
        ```N)r   r   r  r  r   r  r  r   )r   r   re   r  r  r  r^   r  r   r  r`  r  r   )r|   r}   r^   r_   r`   ra   rb   rc   rd   re   rf   )r   r`  r   r  r  r  r  r{   r|   r}   r^   r   r   r   r   rc   )rn   r   r   r  r  r  r  r  r  r^   r  r  r`  r   r  r  r   decoder_outputss                     r$   r   zProphetNetModel.forward  sD   t "+!6IIDK<Q	1B1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]""ll#-#+"3%9' +  O ,,'1"1!"4#1'!5+//!5#) ' 
 
   	5"_44+-?$3$K+;"1"?(7(K.9%4%E,=&5&G"1"?.9
 
 
 	
r%   )NNNNNNNNNNNNNNNN)rq   rr   rs   _tied_weights_keysr   r   rz  r  r  r  r   r   r    r   
BoolTensorrw   r   r   r   r{   r   r   r   s   @r$   r  r    s       :<\]/      "$ $ $< < <
[ [ [
    -11548=A,0487;+/+/048<$(,0/3&*15#j
 j
EL)j
 !.j
 $EL1	j

 !))9 :j
 EL)j
 $EL1j
 'u|4j
 "%j
 "%j
  -j
  (5j
 D>j
 $D>j
 'tnj
  d^!j
" !.#j
$ 
u22	3%j
 j
 j
 ^j
 j
 j
 j
 j
r%   r  zh
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    c            (       :    e Zd Zg dZdef fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                 de	e
j                 d	e	e
j                 d
e	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e         de	e
j                 de	e
j                 de	e
j                 de	e         de	e         de	e         de	e         de	e
j                 deeef         f$d            Zd dZde
j        fdZd Zd Z xZS )!"ProphetNetForConditionalGeneration)r  r  lm_head.weightr   c                    t                                          |           t          |          | _        |j        | _        |j        | _        t          j        |j	        |j
        d          | _        |                                  d S )NFr   )r   r   r  r   r   r   disable_ngram_lossr   r   r   rp  lm_headrv  r   s     r$   r   z+ProphetNetForConditionalGeneration.__init__[  sv       )&11!."(";y!3V5FUSSS 	r%   c                 l    | j         j        r'|                     | j        j        | j                   d S d S r   )r   r  r  r   rj  r  rm   s    r$   r  z/ProphetNetForConditionalGeneration._tie_weightsf  s?    ;* 	V&&t'FUUUUU	V 	Vr%   c                     | j         j        S r   )r   rj  rm   s    r$   rz  z7ProphetNetForConditionalGeneration.get_input_embeddingsj  s    ..r%   Nr   r   r  r  r  r  r  r  r^   r  r  labelsr`  r   r  r  r   r   c                 8   ||n| j         j        }||||                     |          }|                     |||||||||	|
||||||          }||j        n|j        dd         \  }}|d                             || j         j        |d          }|                     |          }|dddf         }| j         j        dk    r|ddddf         nd}|                                s|	                                }d}|| 
                    ||          }|s;t          d ||fD                       }||f|z   |dd         z   n||dd         z   S t          ||||j        |j        |j        |j        |j        |j        |j        |j        |j                  S )	a	  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
        ```N)r   r   r  r  r  r  r  r  r^   r  r  r`  r   r  r  r   r)   r   rN   r   c              3      K   | ]}||V  	d S r   ry   r  s     r$   r  z=ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>  "      RRQAMqMMMMRRr%   )r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   )r   r  r   r   r   r   r4   r  is_contiguousr  _compute_lossrw   rZ   r^   r_   r`   ra   rb   rc   rd   re   rf   )rn   r   r   r  r  r  r  r  r  r^   r  r  r  r`  r   r  r  r   rV  r   r3   predicting_streamspredict_logitsr\   r]   r[   
all_logitss                              r$   r   z*ProphetNetForConditionalGeneration.forwardm  s    ~ &1%<kk$+B]"3";@U@] $ 1 1& 9 9//)/#9/!5++'"7/!5#)! " 
 
& (9'D##J_JefhghfhJi 	$
O %QZ__Z9JO]_``&8991%040AA0E0E~aaae,,4 ##%% 	)&&((F%%nf==D 	RR6<*@RRRRRJ9=9ID7Z''!""+55z\cdedfdf\gOgg,) ' 7&-&C,3,O#*#=)0)I!(!9*1*K&-&C#*#=   r%   r   c                    |                     | j        j        |                    d          |                    d                                        |          }t          | j        j                  D ]}|dk    r	| j        r n|||d d d d f<   |                    dd                                          }t          j
                            |                    d|                    d                    dt          j                  }t          j
                            ||                    d          d          }| j        j        dk    r|                    dd	           }|                    |                              d          }	||	         }|                                }| j        j        |                    d          z  }
d
| j        j        z
  |z  |
|z  z   }|S Nr   r   rN   r   r   )	reductionr   T)r   keepdimr  r   r   r4   rQ   fill_r/   r  r   r  r   r   log_softmaxr   r    r!   nll_lossepssumner   rn   r\   r  ignore_indexexpend_targetsr  lprobsr[   smooth_lossnon_masked_tokenseps_is              r$   r  z0ProphetNetForConditionalGeneration._compute_loss     ))$+*;V[[^^V[[YZ^^\\bbcoppt{()) 	- 	-A1uu0u&,N1aaa7##!!!Q''2244**KKFKKOO,,- + 
 
 }%%fn.A.A".E.EQW%XX;?S  !::"d:;;;K . 1 1, ? ? D DR H H%&78K%**,,KKOfkk"oo5E$+/)T1EK4GGDr%   c                 ,    |                      |          S r   )r   )rn   r  s     r$   %prepare_decoder_input_ids_from_labelszHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      (((r%   c                     | j         j        S r   )r   r  rm   s    r$   r  z.ProphetNetForConditionalGeneration.get_encoder      &&r%   c                     | j         j        S r   r   r  rm   s    r$   get_decoderz.ProphetNetForConditionalGeneration.get_decoder  r  r%   )NNNNNNNNNNNNNNNNNr   )rq   rr   rs   r  r   r   r  rz  r   r   r    r   r  r   r   r   rw   rZ   r   r  r  r  r  r   r   s   @r$   r  r  S  s\        poo	/ 	 	 	 	 	 	V V V/ / /  -11548=A,0487;26+/048<)-$(,0/3&*15%y yEL)y !.y $EL1	y
 !))9 :y EL)y $EL1y 'u|4y "%,/y "%y  -y  (5y &y D>y $D>y  'tn!y" d^#y$ !.%y& 
u//	0'y y y ^yv   8)EL ) ) ) )' ' '' ' ' ' ' ' 'r%   r  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                        e Zd Zg dZdef fdZd Zd Zd Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deej                 dee         deej                 deej                 dee         dee         dee         dee         deeef         fd            ZddZ	 	 	 	 ddZ xZS )ProphetNetForCausalLM)z!prophetnet.word_embeddings.weightz)prophetnet.decoder.word_embeddings.weightr  r   c                 Z   t          j        |          }d|_        d|_        t	                                          |           t          |          | _        |j        | _	        |j
        | _
        t          j        |j        |j        d          | _        |                                  d S )NTFr  )r  r  r  is_encoder_decoderr   r   ProphetNetDecoderWrapperr   r   r   r  r   r   r   rp  r  rv  r   s     r$   r   zProphetNetForCausalLM.__init__  s    v&& $)!   26::!."(";y!3V5FUSSS 	r%   c                 $    | j         j        j        S r   r   r  rj  rm   s    r$   rz  z*ProphetNetForCausalLM.get_input_embeddings+  s    &66r%   c                 (    || j         j        _        d S r   r  r}  s     r$   r  z*ProphetNetForCausalLM.set_input_embeddings.  s    27///r%   c                 v    | j         j        r,|                     | j        j        j        | j                   d S d S r   )r   r  r  r   r  rj  r  rm   s    r$   r  z"ProphetNetForCausalLM._tie_weights1  sD    ;* 	^&&t'>'NPTP\]]]]]	^ 	^r%   c                     || j         _        d S r   r  )rn   r  s     r$   set_decoderz!ProphetNetForCausalLM.set_decoder5  s    ")r%   c                     | j         j        S r   r  rm   s    r$   r  z!ProphetNetForCausalLM.get_decoder8  r  r%   Nr   r   re   r  r  r  r^   r  r  r`  r   r  r  r   c                    ||n| j         j        }| j                            |||||||||
|||          }||j        n|j        dd         \  }}|d                             || j         j        |d          }|                     |          }|dddf         }| j         j        dk    r|ddddf         nd}d}|	|                     ||	          }|s;t          d ||fD                       }||f|z   |dd         z   n||dd         z   S t          ||||j        |j        |j        |j        |j        |j        	  	        S )	a	  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForCausalLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits

        >>> # Model can also be used with EncoderDecoder framework
        >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
        >>> import torch

        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
        >>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
        ... )

        >>> ARTICLE = (
        ...     "the us state department said wednesday it had received no "
        ...     "formal word from bolivia that it was expelling the us ambassador there "
        ...     "but said the charges made against him are `` baseless ."
        ... )
        >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
        >>> labels = tokenizer_dec(
        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
        ... ).input_ids
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

        >>> loss = outputs.loss
        ```N)r   r   re   r  r  r  r^   r  r`  r   r  r  r)   r   rN   r   c              3      K   | ]}||V  	d S r   ry   r  s     r$   r  z0ProphetNetForCausalLM.forward.<locals>.<genexpr>  r  r%   )	r[   r\   r]   r^   r   r   r   r   rc   )r   r  r   r  r   r   r4   r  r  rw   r   r^   r   r   r   r   rc   )rn   r   r   re   r  r  r  r^   r  r  r`  r   r  r  rV  r   r3   r  r  r\   r]   r[   r  s                          r$   r   zProphetNetForCausalLM.forward;  s   B &1%<kk$+B] /)))"7#9!5+'/!5# * 
 
 :C9NiooTaTghjijhjTk#
O$QZ__Z9JO]_``&8991%040AA0E0E~aaae,,4%%nf==D 	RR6<*@RRRRRJ9=9ID7Z''!""+55z\cdedfdf\gOgg,) ' 7%3$+$?"-!(!9!(!9
 
 
 
r%   r   c                    |                     | j        j        |                    d          |                    d                                        |          }t          | j        j                  D ]}|dk    r	| j        r n|||d d d d f<   |                    dd                                          }t          j
                            |                    d|                    d                    dt          j                  }t          j
                            ||                    d          d          }| j        j        dk    r|                    dd	           }|                    |                              d          }	||	         }|                                }| j        j        |                    d          z  }
d
| j        j        z
  |z  |
|z  z   }|S r  r  r  s              r$   r  z#ProphetNetForCausalLM._compute_loss  r
  r%   c                    ||                     |j                  }|&|                                dk    r|d d dd f         }|||||d}|                    dd            |                                D ]\  }}	||vr|	||<   |S )Nr   rN   )r   r   r  r^   r`  r   )new_onesr   r   popitems)
rn   r   r^   r   r  r`  kwargsmodel_inputsr  r~  s
             r$   prepare_inputs_for_generationz3ProphetNetForCausalLM.prepare_inputs_for_generation  s     !&//	@@N&?+I+I+K+Ka+O+O!!!!RSS&)I #,"."
 
 	

#T*** !,,.. 	* 	*JC,&&$)S!r%   r  r  )NNNN)rq   rr   rs   r  r   r   rz  r  r  r  r  r   r   r    r   r   r   r   rw   r   r   r  r(  r   r   s   @r$   r  r    s         /       7 7 78 8 8^ ^ ^* * *' ' '  -1158<9=,07;+/04)-$(,0/3&*l lEL)l !.l  (5	l
 !) 6l EL)l 'u|4l "%l  -l &l D>l $D>l 'tnl d^l 
u//	0l l l ^l\   > " " " " " " " "r%   r  c                   4     e Zd ZdZdef fdZd Zd Z xZS )r  z
    This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
    classes.
    r   c                     t                                          |           t          j        |j        |j        |j                  | _        t          || j                  | _	        | 
                                 d S )Nrm  ry  )r   r   r   r   rp  r   r   rj  r  r  rv  r   s     r$   r   z!ProphetNetDecoderWrapper.__init__  sm       !|F,=v?Q_e_rsss(AUVVV 	r%   c                 j    |                      | j        | j                                                   d S r   )r  rj  r  rz  rm   s    r$   r  z%ProphetNetDecoderWrapper._tie_weights  s/    ""4#79Z9Z9\9\]]]]]r%   c                      | j         |i |S r   )r  )rn   argsr&  s      r$   r   z ProphetNetDecoderWrapper.forward  s    t|T,V,,,r%   )	rq   rr   rs   rt   r   r   r  r   r   r   s   @r$   r  r    sp         
/      ^ ^ ^- - - - - - -r%   r  )r  ri  r  r  r  r   rW  )?rt   r  r@   rj   dataclassesr   typingr   r   r    r   r   torch.nnr   activationsr
   cache_utilsr   r   r   
generationr   modeling_layersr   modeling_outputsr   modeling_utilsr   utilsr   r   r   utils.deprecationr   configuration_prophetnetr   
get_loggerrq   r  r   r8   rL   rW   rZ   r{   r   r   r   r   r   Moduler   r   r
  rK  rY  ri  r  r  r  r  r  __all__ry   r%   r$   <module>r=     sc   Y X    ! ! ! ! ! ! " " " " " " " "                ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) 9 9 9 9 9 9 / / / / / / - - - - - - 9 9 9 9 9 9 9 9 9 9 0 0 0 0 0 0 6 6 6 6 6 6 
	H	%	%Q Q Q Q7 7 7"       6M M M.   
3% 3% 3% 3% 3% 3% 3%  3%l   1% 1% 1% 1% 1%; 1% 1%  1%h   
#@ #@ #@ #@ #@; #@ #@  #@L   
+@ +@ +@ +@ +@ +@ +@  +@\ #! #! #! #! #! #! #! #!L(- (- (- (- (-R\ (- (- (-VD2 D2 D2 D2 D2") D2 D2 D2N    BI   .@/ @/ @/ @/ @/29 @/ @/ @/F
( ( ( ( (7 ( ( (VI I I I I7 I I IX   
r
 r
 r
 r
 r
1 r
 r
 
r
j   
JG JG JG JG JG1 JG JG 
JGZ
 O
 O
 O
 O
 O
/ O
 O
 O
d   
t' t' t' t' t')BO t' t' 
t'n   
T T T T T5 T T 
Tn- - - - -8 - - -,  r%   