
"""PyTorch PLBART model."""

import math
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_attention_mask_for_sdpa,
)
from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput, Seq2SeqModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available
from ..bart.modeling_bart import (
    BartClassificationHead,
    BartDecoder,
    BartEncoder,
    BartForCausalLM,
    BartScaledWordEmbedding,
)
from ..bigbird_pegasus.modeling_bigbird_pegasus import BigBirdPegasusForSequenceClassification
from ..mbart.modeling_mbart import shift_tokens_right
from .configuration_plbart import PLBartConfig


if is_torch_flex_attn_available():
    from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask


class PLBartScaledWordEmbedding(BartScaledWordEmbedding):
    pass


@auto_docstring
class PLBartPreTrainedModel(PreTrainedModel):
    config: PLBartConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PLBartDecoderLayer", "PLBartEncoderLayer"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    def _update_full_mask(
        self,
        attention_mask: Union[torch.Tensor, None],
        inputs_embeds: torch.Tensor,
    ):
        # Expand a 2D padding mask into the form expected by the configured attention implementation.
        if attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if 0 in attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(attention_mask, torch.Tensor):
                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

        return attention_mask

    def _update_causal_mask(
        self,
        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
    ):
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            elif attention_mask is None:
                attention_mask = make_flex_block_causal_mask(
                    torch.ones(
                        size=(input_tensor.shape[0], input_tensor.shape[1]),
                        device=input_tensor.device,
                    )
                )
            return attention_mask

        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When possible, rely on SDPA's `is_causal` argument instead of materializing a full 4D mask.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # If the provided `attention_mask` is 2D, build the full 4D causal mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows, as required by the SDPA memory-efficient attention path.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # The mask already comes in inverted 4D form and needs no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

    def _update_cross_attn_mask(
        self,
        encoder_hidden_states: Union[torch.Tensor, None],
        encoder_attention_mask: Union[torch.Tensor, None],
        input_shape: torch.Size,
        inputs_embeds: torch.Tensor,
    ):
        # Expand the encoder (cross-attention) padding mask for the configured attention implementation.
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask,
                    inputs_embeds.dtype,
                    tgt_len=input_shape[-1],
                )
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(encoder_attention_mask, torch.Tensor):
                    encoder_attention_mask = make_flex_block_causal_mask(
                        encoder_attention_mask,
                        query_length=input_shape[-1],
                        is_causal=False,
                    )
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                )

        return encoder_attention_mask


class PLBartEncoder(BartEncoder):
    pass


class PLBartDecoder(BartDecoder):
    pass


@auto_docstring
class PLBartModel(PLBartPreTrainedModel):
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: PLBartConfig):
        super().__init__(config)

        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
        self.shared = PLBartScaledWordEmbedding(vocab_size, config.d_model, padding_idx, embed_scale=embed_scale)
        self.encoder = PLBartEncoder(config, self.shared)
        self.decoder = PLBartDecoder(config, self.shared)

        self.init_weights()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, value):
        self.shared = value
        self.encoder.embed_tokens = self.shared
        self.decoder.embed_tokens = self.shared

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.LongTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Unlike other seq2seq models, PLBart builds `decoder_input_ids` from `input_ids` when none are provided.
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for `encoder_outputs`, wrap it in a `BaseModelOutput` when `return_dict=True`.
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # Decoder outputs consist of (dec_features, past_key_values, dec_hidden, dec_attn).
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.
    """
)
class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: PLBartConfig):
        super().__init__(config)
        self.model = PLBartModel(config)
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        self.init_weights()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def resize_token_embeddings(
        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
    ) -> nn.Embedding:
        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.LongTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example Mask-filling:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForConditionalGeneration

        >>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")

        >>> # en_XX is the language symbol id <LID> for English
        >>> TXT = "<s> Is 0 the <mask> Fibonacci number ? </s> en_XX"
        >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt").input_ids

        >>> logits = model(input_ids).logits
        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
        >>> probs = logits[0, masked_index].softmax(dim=0)
        >>> values, predictions = probs.topk(5)

        >>> tokenizer.decode(predictions).split()
        ['first', 'same', 'highest', 'result', 'number']
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        lm_logits = self.lm_head(outputs[0])
        lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id)


class PLBartClassificationHead(BartClassificationHead):
    pass


class PLBartForSequenceClassification(BigBirdPegasusForSequenceClassification):
    def forward(self, **super_kwargs):
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        super().forward(**super_kwargs)


class PLBartForCausalLM(BartForCausalLM):
    @auto_docstring
    def forward(self, **super_kwargs):
        r"""
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
        >>> model = PLBartForCausalLM.from_pretrained("uclanlp/plbart-base", add_cross_attention=False)
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```
        """
        super().forward(**super_kwargs)


__all__ = [
    "PLBartForCausalLM",
    "PLBartForConditionalGeneration",
    "PLBartForSequenceClassification",
    "PLBartModel",
    "PLBartPreTrainedModel",
]