
     `i7c                       d Z ddlZddlZddlmZmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-  e&            rddl.m/Z/ ddl0m1Z1  e)j2        e3          Z4 G d dej5                  Z6 G d dej5                  Z7 G d dej5                  Z8 G d dej5                  Z9 G d dej5                  Z: G d dej5                  Z; G d  d!ej5                  Z< G d" d#e          Z= G d$ d%ej5                  Z>e% G d& d'e!                      Z? G d( d)e?          Z@e% G d* d+e?                      ZA e%d,-           G d. d/e?e                      ZBe% G d0 d1e?                      ZC e%d2-           G d3 d4e?                      ZDe% G d5 d6e?                      ZEe% G d7 d8e?                      ZFg d9ZGdS ):zPyTorch UMT5 model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging)deprecate_kwarg   )
UMT5Config)	BlockMask)make_flex_block_causal_maskc                   &     e Zd Zd fd	Zd Z xZS )UMT5LayerNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )ze
        Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/umt5/modeling_umt5.pyr*   zUMT5LayerNorm.__init__?   sD     	l5:k#:#:;; #    c                 h   |                     t          j                                      d                              dd          }|t          j        || j        z             z  }| j        j        t          j	        t          j
        fv r|                     | j        j                  }| j        |z  S )N   T)keepdim)tor,   float32powmeanrsqrtr/   r.   dtypefloat16bfloat16)r0   hidden_statesvariances      r4   forwardzUMT5LayerNorm.forwardG   s     !##EM2266q99>>r4>PP%Ht?T4T(U(UU ; ???),,T[->??M{]**r5   )r'   )__name__
__module____qualname__r*   rD   __classcell__r3   s   @r4   r&   r&   >   sL        $ $ $ $ $ $+ + + + + + +r5   r&   c                   *     e Zd Zdef fdZd Z xZS )UMT5DenseActDenseconfigc                 J   t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        |j	                  | _
        t          |j                 | _        d S NFbias)r)   r*   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr
   dense_act_fnactr0   rL   r3   s     r4   r*   zUMT5DenseActDense.__init__Y   sx    )FNFKeDDD)FKeDDDz&"566&-.r5   c                    |                      |          }|                     |          }|                     |          }t          | j        j        t          j                  r]|j        | j        j        j        k    rC| j        j        j        t          j	        k    r$|
                    | j        j        j                  }|                     |          }|S N)rT   rZ   rX   
isinstancerU   r.   r,   Tensorr?   int8r:   r0   rB   s     r4   rD   zUMT5DenseActDense.forward`   s    ..//]33tw~u|44	C#tw~';;;$
22),,TW^-ABBM..r5   rE   rF   rG   r"   r*   rD   rH   rI   s   @r4   rK   rK   X   sS        /z / / / / / /      r5   rK   c                   *     e Zd Zdef fdZd Z xZS )UMT5DenseGatedActDenserL   c                    t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j	        |j
                  | _        t          |j                 | _        d S rN   )r)   r*   r   rQ   rR   rS   wi_0wi_1rU   rV   rW   rX   r
   rY   rZ   r[   s     r4   r*   zUMT5DenseGatedActDense.__init__p   s    IfnfkFFF	IfnfkFFF	)FKeDDDz&"566&-.r5   c                    |                      |                     |                    }|                     |          }||z  }|                     |          }t	          | j        j        t          j                  r]|j	        | j        j        j	        k    rC| j        j        j	        t          j
        k    r$|                    | j        j        j	                  }|                     |          }|S r]   )rZ   rf   rg   rX   r^   rU   r.   r,   r_   r?   r`   r:   )r0   rB   hidden_geluhidden_linears       r4   rD   zUMT5DenseGatedActDense.forwardx   s    hhtyy7788		-00#m3]33 tw~u|44	C#tw~';;;$
22),,TW^-ABBM..r5   rb   rI   s   @r4   rd   rd   o   sS        /z / / / / / /      r5   rd   c                   *     e Zd Zdef fdZd Z xZS )UMT5LayerFFrL   c                 $   t                                                       |j        rt          |          | _        nt          |          | _        t          |j        |j                  | _	        t          j        |j                  | _        d S )Nr2   )r)   r*   is_gated_actrd   DenseReluDenserK   r&   rR   layer_norm_epsilon
layer_normr   rV   rW   rX   r[   s     r4   r*   zUMT5LayerFF.__init__   sx     	<"8"@"@D"3F";";D'F<UVVVz&"566r5   c                     |                      |          }|                     |          }||                     |          z   }|S r]   )rr   rp   rX   )r0   rB   forwarded_statess      r4   rD   zUMT5LayerFF.forward   sF    ??=99../?@@%5E(F(FFr5   rb   rI   s   @r4   rl   rl      sS        7z 7 7 7 7 7 7      r5   rl   c                   *    e Zd ZdZddee         f fdZdej        dej        fdZ	d	 Z
dd
Z eddd          	 	 	 	 	 ddej        deej                 dee         deej                 deej                 deej                 fd            Z xZS )UMT5Attentionz7
    T5's attention using relative_attention_bias.
    FN	layer_idxc                 B   t                                                       |j        | _        || _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _
        |j        | _        | j
        | j        z  | _        || _        |/| j        r(t                              d| j        j         d           t'          j        | j        | j        d          | _        t'          j        | j        | j        d          | _        t'          j        | j        | j        d          | _        t'          j        | j        | j        d          | _        | j        r$t'          j        | j        | j
                  | _        t7                      | _        d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrO   )r)   r*   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerR   d_kvkey_value_proj_dim	num_headsn_headsrW   rX   	inner_dimrw   loggerwarning_oncer3   rE   r   rQ   qkvo	Embeddingrelative_attention_biassetpruned_heads)r0   rL   rz   rw   r3   s       r4   r*   zUMT5Attention.__init__   sk    ++F(.4.S+/5/U,~"(+'*(??",4>+B , , ,   4<eDDD4<eDDD4<eDDD4>4<eDDD+ 	k+-<8[]a]i+j+jD(EEr5   
projectionreturnc                     |                                 d d         | j        | j        fz   }|                    |                              dddd          }|S )Nr8   r   r7   r!   r	   )sizer   r~   viewpermute)r0   r   new_projection_shapenew_projections       r4   _shapezUMT5Attention._shape   sW    )00"5tG^8__#)=>>FFq!QPQRRr5   c                 ~   d}| j         }| j        }| j        sC|dz  }||dk                        t          j                  |z  z  }t	          j        |          }n(t	          j        |t	          j        |                     }|dz  }||k     }t	          j	        |
                                |z            t          j	        ||z            z  }|||z
  z  }||                    t          j                  z   }t	          j        |t	          j        ||dz
                      }|t	          j        |||          z  }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r7   r!   )r{   r|   ry   r:   r,   longabsmin
zeros_likelogfloatmath	full_likewhere)	r0   relative_positionrelative_bucketsnum_bucketsmax_distance	max_exactis_small	log_ratiorelative_position_if_larges	            r4   _relative_position_bucketz'UMT5Attention._relative_position_bucket   sP   * 9; 	cAK!2Q!6 : :5: F F TT %	*; < <!&+<e>NO`>a>a!b!b b  1$	$y0 I/5577)CDDtxP\_hPhGiGii	y!89	%.ej1I1I%I"%*Y&8RT_bcTc(d(d&
 &
" 	EK2CE_```r5   c                    || j         j        j        }|,t          j        |t          j        |          dddf         }n|dddf         }t          j        |t          j        |          dddf         }||z
  }|                     |          }|                      |          }	|	                    g d                              d          }	|	S )z%Compute binned relative position biasN)r?   device)r7   r   r!   r   )	r   r.   r   r,   aranger   r   r   	unsqueeze)
r0   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluess
             r4   compute_biaszUMT5Attention.compute_bias   s    >18?F!$|L
SYZZZ[\[\[\^b[bc-aaag6,zFSSSTXZ[Z[Z[T[\+.>>#'#A#ABS#T#T --.FGG			**44Q77r5   past_key_valuepast_key_values4.58new_nameversionrB   encoder_hidden_statesattention_masklayer_head_maskr   c                    |j         d d         \  }}|d u}	|                     |          }
|
                    |d| j        | j                                      dd          }
d}|Ft          |t                    r1|j        	                    | j
                  }|	r|j        }n
|j        }n|}|	r|n|}|	r3|1|r/|j        | j
                 j        }|j        | j
                 j        }n|                     |          }|                     |          }|                    |d| j        | j                                      dd          }|                    |d| j        | j                                      dd          }|N|	s|nd }|                    ||| j
        d|i          \  }}|	r$t          |t                    rd|j        | j
        <   t'          j        |
|                    dd                    }|||                                z   n|}|j         d         }| j        s+t'          j        d| j        ||f|j        |j        	          }n3|                     |||j        |
          }|d d d d | d d d f         }|$|d d d d d d d |j         d         f         }||z   }| j        rUt'          j        |j         d                   }d|t;          | j                  <   |d d |                                f         }n|}||z  }t>          j         !                    |"                                d          #                    |          }t>          j         $                    || j$        | j%                  }|||z  }t'          j        ||          }|                    dd          &                                }|                    ||d          }| '                    |          }||fS )Nr7   r8   r!   Fr   Tr	   )r   r?   )r   r   r   dim)ptraining)(shaper   r   r   r~   	transposer^   r   
is_updatedgetrw   cross_attention_cacheself_attention_cachelayerskeysr   r   r   updater,   matmulget_seq_lengthrz   zerosr   r?   r   r   r-   listboolr   
functionalsoftmaxr   type_asrX   r   
contiguousr   )r0   rB   r   r   r   r   r   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statesscoresreal_seq_lengthr   position_biascausal_maskmaskposition_bias_maskedattn_weightsattn_outputs                            r4   rD   zUMT5Attention.forward  sH    "/!4RaR!8
J 3$>vvm,,#((RtG^__iijkmnoo 
&:oGZ+[+[&(377GGJ! K&5&K##&5&J##"12DW..- 	F/"=*"=,3DNCHJ.5dnELLL//J66.11L#RtG^__iijkmnooJ',,ZT\4KbccmmnoqrssL*7I!St+>+E+Edn?OQ_>`, ,(
L & F*_FY*Z*Z FAEO.t~> lJ,@,@A,F,FGG L[Kf*'E'E'G'GGGlv%b)
/ 	A!KDL*j9&-W]Wc  MM !--FMR` .  M *!!!QQQaaa*?@M%(AAAqqq2HJ4DR4H2H)HIK)K7M 	1:m1!455D,-Dd'(()#0DIIKK#@  #0 && },,V\\^^,DDLLVTT},,\T\TXTa,bb &'/9Ll<>>!++Aq11<<>>!&&z:rBBff[))L((r5   )FN)NNNNNNN)rE   rF   rG   __doc__r   intr*   r,   r_   r   r   r   r    r   rD   rH   rI   s   @r4   rv   rv      sX        " "XVY] " " " " " ": %,    -  -  - ^    _%0A6RRR 9=+/152615]) ])|])  (5]) "%	])
 !.]) "%,/]) !.]) ]) ]) SR]) ]) ]) ]) ])r5   rv   c                   j     e Zd Zd	dee         f fdZ eddd          	 	 	 	 d
d            Z xZS )UMT5LayerSelfAttentionNrw   c                     t                                                       t          |d|          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S )NTrz   rw   rn   )r)   r*   rv   SelfAttentionr&   rR   rq   rr   r   rV   rW   rX   r0   rL   rw   r3   s      r4   r*   zUMT5LayerSelfAttention.__init__g  sb    *6t_hiii'F<UVVVz&"566r5   r   r   r   r   c                     |                      |          }|                     |||||          }||                     |d                   z   }|f|dd          z   }|S )Nr   r   r   r   r   r!   )rr   r   rX   )	r0   rB   r   r   r   r   normed_hidden_statesattention_outputoutputss	            r4   rD   zUMT5LayerSelfAttention.forwardm  sz      $}==-- )++) . 
 
 &5Ea5H(I(II "%5abb%99r5   r]   )NNNN	rE   rF   rG   r   r   r*   r    rD   rH   rI   s   @r4   r   r   f  s        7 7(3- 7 7 7 7 7 7 _%0A6RRR    SR    r5   r   c                   l     e Zd Zd	dee         f fdZ eddd          	 	 	 	 	 d
d            Z xZS )UMT5LayerCrossAttentionNrw   c                     t                                                       t          |d|          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S )NFr   rn   )r)   r*   rv   EncDecAttentionr&   rR   rq   rr   r   rV   rW   rX   r   s      r4   r*   z UMT5LayerCrossAttention.__init__  sc    ,VQVbklll'F<UVVVz&"566r5   r   r   r   r   c                     |                      |          }|                     ||||||          }||                     |d                   z   }	|	f|dd          z   }
|
S )Nr   r   r   r   r   r   r!   )rr   r   rX   )r0   rB   r   r   r   r   r   r   r   layer_outputr   s              r4   rD   zUMT5LayerCrossAttention.forward  s|      $}==// "7)++) 0 
 
 %t||4DQ4G'H'HH/$4QRR$88r5   r]   r   r   rI   s   @r4   r   r     s        7 7(3- 7 7 7 7 7 7 _%0A6RRR #   SR    r5   r   c                   t     e Zd Zd
dee         f fdZ eddd          	 	 	 	 	 	 	 	 	 dd	            Z xZS )	UMT5BlockNrw   c                    t                                                       |j        | _        t          j                    | _        | j                            t          ||                     | j        r)| j                            t          ||                     | j                            t          |                     d S )Nrw   )
r)   r*   ry   r   
ModuleListlayerappendr   r   rl   r   s      r4   r*   zUMT5Block.__init__  s     +]__

09MMMNNN? 	TJ5f	RRRSSS
+f--.....r5   r   r   r   r   Fc                     | j         d         |||||
          \  }}|j        t          j        k    rst          j        |j                  j        }t          j        t          j        |                                          |dz
  |          }t          j	        || |          }d }| j
        o|d u}|r | j         d         ||||||
          \  }}|j        t          j        k    rst          j        |j                  j        }t          j        t          j        |                                          |dz
  |          }t          j	        || |          } | j         d         |          }|j        t          j        k    rst          j        |j                  j        }t          j        t          j        |                                          |dz
  |          }t          j	        || |          }|f}|	r|||fz  }|S )Nr   r   i  )r   maxr!   r   r8   )r   r?   r,   r@   finfor  r   isinfanyclampry   )r0   rB   r   r   encoder_attention_maskr   cross_attn_layer_head_maskr   	use_cacheoutput_attentionsr   self_attn_weights	max_dtypeclamp_valuecross_attn_weightsdo_cross_attentionr   s                    r4   rD   zUMT5Block.forward  s    ,94:a=)++),
 ,
 ,
(( %-//M$788<I+ek-&@&@&D&D&F&F	TXHXZcddK!KK<[YYYM "!_R1Fd1R 	^0=
1&;5 : /-1 1 1-M- "em33!K(;<<@	#k%+m*D*D*H*H*J*JIX\L\^ghh %M|Q\ ] ] ] '
2}55 %-//M$788<I+ek-&@&@&D&D&F&F	TXHXZcddK!KK<[YYYM " 	?)+=>>Gr5   r]   )	NNNNNNFFNr   rI   s   @r4   r   r     s        / /(3- / / / / / / _%0A6RRR "##'; ; ; SR; ; ; ; ;r5   r   c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )UMT5ClassificationHeadz-Head for sentence-level classification tasks.rL   c                    t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        t          j        |j        |j	                  | _
        d S )N)r   )r)   r*   r   rQ   rR   denserV   classifier_dropoutrX   
num_labelsout_projr[   s     r4   r*   zUMT5ClassificationHead.__init__  sc    Yv~v~>>
zF$=>>>	&.&2CDDr5   rB   r   c                     |                      |          }|                     |          }t          j        |          }|                      |          }|                     |          }|S r]   )rX   r  r,   tanhr  ra   s     r4   rD   zUMT5ClassificationHead.forward  s[    ]33

=11
=11]33m44r5   )
rE   rF   rG   r   r"   r*   r,   r_   rD   rH   rI   s   @r4   r  r    sw        77Ez E E E E E EU\ el        r5   r  c                   T    e Zd ZU eed<   dZdZdZdgZdgZ	e
d             Zd Zd Zd	S )
UMT5PreTrainedModelrL   transformerTr   rU   c                 v    t          j        t                    }t          j        t                    }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r,   tensorr   r   )r0   r  
input_maskdummy_inputss       r4   r#  z UMT5PreTrainedModel.dummy_inputs	  s=    L..	\*--
!*"&0
 

 r5   c                    | j         j        }t          |t                    r$|j        j                            |dz             d	S t          |t          t          t          t          f          r|j        j        j                            d|dz             t          |d          r5| j         j        s)|j        j        j                            d|dz             t          |d          r[|j        j        j                            d|| j         j        dz  z             |j        j        j                                         d	S d	S t          |t(                    r`t          |d          rN|j        j        j                            d|dz             |j        j        j                                         d	S d	S t          |t,                    r|j        j        j                            d|| j         j        dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j                            d|| j         j        dz  z             t          |j        d          r1|j        j        '|j        j        j                                         d	S d	S d	S t          |t2                    r|j        j        j                            d|| j         j        dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j                            d|| j         j        dz  z             t          |j        d          r1|j        j        '|j        j        j                                         d	S d	S d	S t          |t:                    rt|j        j        j                            d|| j         j        dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j                            d|| j         j        dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j                            d|| j         j        dz  z             t          |j        d          r1|j        j        '|j        j        j                                         d	S d	S d	S t          |t@                    r| j         j        }| j         j!        }| j         j"        }|j#        j        j                            d|||z  dz  z             |j$        j        j                            d||dz  z             |j%        j        j                            d||dz  z             |j&        j        j                            d|||z  dz  z             |j'        r0|j(        j        j                            d||dz  z             d	S d	S d	S )
zInitialize the weights      ?        )r=   stdlm_head
qa_outputs      
classifierrP   N))rL   initializer_factorr^   r&   r.   datafill_	UMT5ModelUMT5ForConditionalGenerationUMT5EncoderModelUMT5ForQuestionAnsweringsharednormal_hasattrtie_word_embeddingsr(  r)  rR   rP   zero_UMT5ForTokenClassificationr+  r  r  r  rK   rT   rU   rS   rd   rf   rg   rv   r}   r   r   r   r   r   rz   r   )r0   modulefactorrR   r~   r   s         r4   _init_weightsz!UMT5PreTrainedModel._init_weights  sY   /fm,, =	oM$$Vc\22222, (	
 
 ;	o M %--3FSL-IIIvy)) O$+2Q O%*22#2NNNv|,, 4!(-553Ft{ObgkNkDl5mmm!&+11333334 4  :;; *	ov|,, 4!(-553FSL5QQQ!&+11333334 4  677 &	oL$,,#6dkFY^bEb;c,dddv|V,, /1B1N!&,,...O"'//SfI\aeHe>f/gggv// 2FO4H4T$)//111112 24T4T 122 	o I!))s4;CV[_B_8`)aaavy&)) ,fin.H	#))+++I!))s4;CSX\B\8])^^^vy&)) ,fin.H	#))+++++, ,.H.H 677 	oK#++&T[EX]aDa:b+cccv{F++ .0@0L %++---K#++&T[EX]aDa:b+cccv{F++ .0@0L %++---I!))s4;CSX\B\8])^^^vy&)) ,fin.H	#))+++++, ,.H.H.. 	o k)G!%!1k+GHO ((cv'L^B^cgAg7h(iiiHO ((cv$7O(PPPHO ((cv$7O(PPPHO ((cv'L^B^cgAg7h(iii1 o.5:BBQW\chl[lQmBnnnnn	o 	oo or5   c                    | j         j        }| j         j        }|t          d          t	          |          rHt          j        |j        d d         dz   |          }t          j        ||dd df         gd          }nD|	                    |j                  }|dd df         
                                |ddd f<   ||d<   |t          d          |                    |d	k    |           |S )
Nzself.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the pad_token_id. See UMT5 docs for more information.r8   )r!   .r   r!   ).r   z1self.model.config.pad_token_id has to be defined.)rL   decoder_start_token_idpad_token_id
ValueErrorr   r,   fullr   cat	new_zerosclonemasked_fill_)r0   r  r>  r?  shifted_input_idss        r4   _shift_rightz UMT5PreTrainedModel._shift_rightV  s   !%!C{/!)6   Y'' 	? %
9?3B3+?$+FH^ _ _ %	+<iSbS>Q*RXZ [ [ [ ) 3 3IO D D)238)<)B)B)D)Dc122g&(>f%PQQQ&&'8D'@,OOO  r5   N)rE   rF   rG   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulespropertyr#  r;  rG   r5   r4   r  r    s         %&*#!$!F  X@o @o @oD! ! ! ! !r5   r  c                        e Zd Zd fd	Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ	 ddeej        df         dej        d	ej        d
e	de
f
dZedej        dededej        d	ej        defd            Z xZS )	UMT5StackNc                    t                                                     || _        j        | _        t	          j        fdt          j                  D                       | _        t          j
        j                  | _        t	          j        j                  | _        d| _        |                                  d S )Nc                 2    g | ]}t          |           S )r   )r   ).0irL   s     r4   
<listcomp>z&UMT5Stack.__init__.<locals>.<listcomp>w  s&    #e#e#eqIf$B$B$B#e#e#er5   rn   F)r)   r*   embed_tokensry   r   r   range
num_layersblockr&   rR   rq   final_layer_normrV   rW   rX   gradient_checkpointing	post_init)r0   rL   rW  r3   s    ` r4   r*   zUMT5Stack.__init__s  s       ( +]#e#e#e#eERXRcLdLd#e#e#eff
 -fn&B[ \ \ \z&"566 ',#r5   c                     || _         d S r]   )rW  r0   new_embeddingss     r4   set_input_embeddingszUMT5Stack.set_input_embeddings  s    *r5   c                    |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }|#|!| j        rdnd}t          d| d| d          |1|                                }|                    d|d                   }n@||                                d d         }n!| j        rdnd}t          d| d| d	          | j	        r%| j
        r|	rt                              d
           d}	|+| j        t          d          |                     |          }|\  }}|	du r| j        st          d|  d          | j        r]|	rZ|X| j         j        r7t          t!          | j                   t!          | j                             }nt!          | j                   }n	| j        sd }||                                nd}|t%          j        |||z   |j                  }|/t+                      s!||z   }t%          j        |||j                  }| j        r6|                     |||t1          |t                    r|j        n||
          }nT|P|d d d d d d f         }|                    |j                  }d|z
  t%          j        |j                  j        z  }nd }| j        rQ|O|                                \  }}}||f}|t%          j        ||j                  }|                     |          }nd }|                     || j         j                   }|                     || j         j                   }|rdnd }|
rdnd }|
r	| j        rdnd }| !                    |          }tE          | j#                  D ][\  }}||         } ||         }!|r||fz   } |||||| |!||	|
|
  
        }"|"d         }|
r||"d         fz  }| j        r||"d         fz  }\| $                    |          }| !                    |          }|r||fz   }|stK          d |||||fD                       S tM          |||||          S )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer8   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoder)rL   r   r   )r?   r%  rO  )r  r   r	  r   r
  r  r   r!   r7   c              3      K   | ]}||V  	d S r]   rO  )rT  r   s     r4   	<genexpr>z$UMT5Stack.forward.<locals>.<genexpr>  s4       
 
 =  !===
 
r5   )last_hidden_stater   rB   
attentionscross_attentions)'rL   r
  r  output_hidden_statesuse_return_dictry   r@  r   r   r\  r   r   r   rW  is_encoder_decoderr   r   r   r,   r   r   r   r-   _update_causal_maskr^   r   r:   r?   r  r   invert_attention_maskget_head_maskrY  rX   	enumeraterZ  r[  tupler   )#r0   r  r   r   r  re  	head_maskcross_attn_head_maskr   r
  r  rl  return_dictr   err_msg_prefixinput_shaper   r   past_key_values_lengthmask_seq_lengthr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsrB   rU  layer_moduler   r	  layer_outputss#                                      r4   rD   zUMT5Stack.forward  s     "+!6IIDK<Q	1B1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>+/?BZZNw>wwwww   "#..**K!r;r?;;II&',,..ss3KK+/?BZZNu>uuXfuuuvvv& 	"4= 	" "##p   "	  ( !_``` --i88M!,
J? j !hT!h!h!hiii ? 	# G_4;1 G&9$DK888,dk:Z:Z:Z' 'OO '3$+&F&F&FO 	# #OETE`!?!?!A!A!Afg!"\&(>(KTaTh  N !*B*D*D!4zAO"Z
OML`aaaN? 	22o/BCC%44$! KK '(D$)9:K%..}/B.CCK,M<O0P0P0TTKKK ? 	34@=R=W=W=Y=Y: 7$68O#P %-).4HQ^Qe)f)f)f&.2.H.HI_.`.`++.2+ &&y$+2HII	#112FH^__"6@BBD0:d%6T4?TrrPT]33(44 	@ 	@OA|'lO)=a)@&# I$58H$H!(L%'F /+E /#"3-  M *!,M  @=#3"55? @(]1-=,??(--m<<]33   	E 1]4D D 	 
 
 "#%"(
 
 
 
 
 
 9+++%1
 
 
 	
r5   Fr   r#   input_tensorr   r   r  c           	      $   | j         j        dk    r||dk                                    r|S d S | j         j        dk    r+t          |t          j                  rt          |          }|S ||                                nd}||j        nd}| j         j        dk    r#|s!|st          j
        |||| j                  rd S |j        }|j        d         }	|r|                                }
n/t          |t          j                  r|j        d	         n||	z   dz   }
|                     ||	|
|||j        d         
          }| j         j        dk    r@|>|j        j        dv r0|s.t	          j        |          j        }t          j        ||          }|S )Nflash_attention_2r&  flex_attentionr   Fsdpa)re  ry  is_trainingr!   r8   )sequence_lengthtarget_lengthr?   r   r   )cudaxpunpu)rL   _attn_implementationr  r^   r,   r_   r$   r   is_compileabler   _ignore_causal_mask_sdpar   r?   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer  r   _unmask_unattended)r0   r   r  r   r   r  past_seen_tokensusing_compilable_cacher?   r  r  r   	min_dtypes                r4   ro  zUMT5Stack._update_causal_mask.  s    ;+/BBB)~/D.I.I.K.K)%%4;+/???.%,77 M!<^!L!L!!
 @O?Z?99;;;`aCRC^!?!?di ;+v55>T5]n5%>*'7 M	    t"&,Q/! 	+??AAMM nel;;<$R((%7!;  PP+')#)!, Q 
 
 K,66*%*.DDD% E E**.I0CKQZ[[Kr5   r  r  r?   r   c                    | |                                  dk    r| }nMt          j        |          j        }t          j        ||f|||j                  }|dk    rt          j        |d          }|t          j        ||j                  |                    dd          k    z  }|ddddddf         	                    |ddd          }| |
                                }| j        d         }	|ddddddd|	f         | ddddddf                             |j                  z   }
|
dk    }
|ddddddd|	f                             |
|          |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuer?   r   r!   )diagonalrf  r8   r   )r   r,   r  r   rA  r   triur   reshapeexpandrD  r   r:   masked_fill)r   r  r  r?   r   r   kwargsr   r  mask_lengthpadding_masks              r4   r  z?UMT5Stack._prepare_4d_causal_attention_mask_with_cache_positionr  s   > %.*<*<*>*>!*C*C(KKE**.I* -0Ye\j\q  K !###jqAAA5<n>STTTWeWmWmnprsWtWtttK%dD!!!QQQ&67>>z1bRTUUK))//11,226*111aaaL[L+@ANSTSTSTVZ\`bcbcbcScDdDgDg&E E    ,q05@AAAqqq,;,AV5W5c5c )6 6AAAqqq!!!\k\12 r5   r]   )NNNNNNNNNNNNN)F)rE   rF   rG   r*   ra  rD   r   r,   r_   r   r   ro  staticmethodr   r?   r  rH   rI   s   @r4   rQ  rQ  r  sU       
 
 
 
 
 
+ + +
 "#!!i
 i
 i
 i
d #(B BelK78B lB 	B
 B  B B B BH 444 4 {	4
 4 4 4 4 \4 4 4 4 4r5   rQ  c            &       B    e Zd ZU dZdZeed<   ddgZ fdZd Z	d Z
d	 Zd
 Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   dee         deej                 deej                 dee         dee         dee         dee         deej                 deeej                 ef         f"d            Z xZS ) r/  ao  
    Examples:

    ```python
    >>> from transformers import UMT5Model, AutoTokenizer

    >>> model = UMT5Model.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
    >>> label = "<extra_id_0> verhandelt"
    >>> inputs = tokenizer(inputs, return_tensors="pt")
    >>> labels = tokenizer(label=label, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```umt5rL   encoder.embed_tokens.weightdecoder.embed_tokens.weightc                    t                                          |           t          j        |j        |j                  | _        t          j        |          }d|_	        d|_
        d|_        t          || j                  | _        t          j        |          }d|_	        d|_        |j        |_        t          || j                  | _        |                                  d S NFT)r)   r*   r   r   
vocab_sizerR   r3  copydeepcopyry   r
  tie_encoder_decoderrQ  encodernum_decoder_layersrY  decoderr]  r0   rL   encoder_configdecoder_configr3   s       r4   r*   zUMT5Model.__init__  s       l6#4fnEEv..$)!#( -2* ==v..$(!-2*$*$=! == 	r5   c                     | j         S r]   r3  r0   s    r4   get_input_embeddingszUMT5Model.get_input_embeddings  
    {r5   c                 |    || _         | j                            |           | j                            |           d S r]   r3  r  ra  r  r_  s     r4   ra  zUMT5Model.set_input_embeddings  ;    $)).999)).99999r5   c                     | j         j        rL|                     | j        j        | j                   |                     | j        j        | j                   d S d S r]   rL   r6  _tie_or_clone_weightsr  rW  r3  r  r  s    r4   _tie_weightszUMT5Model._tie_weights  \    ;* 	O&&t|'@$+NNN&&t|'@$+NNNNN	O 	Or5   c                     | j         S r]   r  r  s    r4   get_encoderzUMT5Model.get_encoder  
    |r5   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   	attentionprune_headsr0   heads_to_pruner   headss       r4   _prune_headszUMT5Model._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr5   Nr  r   r  r   rt  decoder_head_maskru  encoder_outputsr   re  decoder_inputs_embedsr
  r  rl  rv  r   r   c                 *   ||n| j         j        }||n| j         j        }||                     |||
||||          }ne|rct	          |t
                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|d         }|                     ||||	|||||||||          }|s||z   S t          |j	        |j
        |j        |j        |j        |j	        |j        |j                  S )	a+  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5Model.from_pretrained("google/umt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
        >>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  r   re  rt  r  rl  rv  r   r!   r7   ri  rB   rj  r  r   re  r   r   r  rt  ru  r
  r  rl  rv  r   )ri  r   decoder_hidden_statesdecoder_attentionsrk  encoder_last_hidden_stater   encoder_attentions)rL   r
  rm  r  r^   r   lenr  r   ri  r   rB   rj  rk  )r0   r  r   r  r   rt  r  ru  r  r   re  r  r
  r  rl  rv  r   rB   decoder_outputss                      r4   rD   zUMT5Model.forward  s   b "+!6IIDK<Q	%0%<kk$+B] ""ll#-+#"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O (* ,,'1/+"/#1'!5/!5#) ' 
 
   	5"_44!-?+;"1"?.9,=&5&G"1"?.9	
 	
 	
 		
r5   NNNNNNNNNNNNNNNN)rE   rF   rG   r   
model_typer"   rH  _tied_weights_keysr*   r  ra  r  r  r  r   r   r,   
LongTensorFloatTensor
BoolTensorr_   rs  r   r   r   r   rD   rH   rI   s   @r4   r/  r/    sS         " J79VW    (  : : :O O O  C C C  156:8<=A159=7;EI+/048<$(,0/3&*59#D
 D
E,-D
 !!23D
 $E$45	D

 !))9 :D
 E-.D
 $E$56D
 'u|4D
 "%e.?(@"ABD
 "%D
  -D
  (5D
 D>D
 $D>D
 'tnD
  d^!D
" !!12#D
$ 
uU&');;	<%D
 D
 D
 ^D
 D
 D
 D
 D
r5   r/  z<
    UMT5 Model with a `language modeling` head on top.
    )custom_introc            (       b    e Zd ZdZdZg dZ fdZd Zd Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   dee         deej                 deej                 deej                 dee         dee         dee         dee         deej                 deeej                 ef         f$d            Zdej        fdZ xZS )r0  a  
    Examples:

    ```python
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```r  )r  r  zlm_head.weightc                 4   t                                          |           |j        | _        t	          j        |j        |j                  | _        t          j	        |          }d|_
        d|_        d|_        t          || j                  | _        t          j	        |          }d|_
        d|_        |j        |_        t          || j                  | _        t	          j        |j        |j        d          | _        |                                  d S )NFTrO   )r)   r*   rR   	model_dimr   r   r  r3  r  r  ry   r
  r  rQ  r  r  rY  r  rQ   r(  r]  r  s       r4   r*   z%UMT5ForConditionalGeneration.__init__  s       l6#4fnEEv..$)!#( -2* ==v..$(!-2*$*$=! ==y1BOOO 	r5   c                     | j         S r]   r  r  s    r4   r  z1UMT5ForConditionalGeneration.get_input_embeddings  r  r5   c                 |    || _         | j                            |           | j                            |           d S r]   r  r_  s     r4   ra  z1UMT5ForConditionalGeneration.set_input_embeddings  r  r5   c                     | j         j        rL|                     | j        j        | j                   |                     | j        j        | j                   d S d S r]   r  r  s    r4   r  z)UMT5ForConditionalGeneration._tie_weights  r  r5   c                     | j         S r]   r  r  s    r4   r  z(UMT5ForConditionalGeneration.get_encoder  r  r5   Nr  r   r  r   rt  r  ru  r  r   re  r  labelsr
  r  rl  rv  r   r   c                    ||n| j         j        }||n| j         j        }||                     |||
||||          }ne|rct	          |t
                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|d         }||||                     |          }|                     ||||	|||||||||          }|d         }| j         j	        r|| j
        dz  z  }|                     |          }d}|pt          d	
          }|                    |j                  } ||                    d|                    d                    |                    d                    }|s|f|dd         z   |z   }||f|z   n|S t#          |||j        |j        |j        |j        |j        |j        |j        	  	        S )aK  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
        >>> outputs = model.generate(input_ids)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
        ```Nr  r   r!   r7   r  r  r*  r=  ignore_indexr8   	losslogitsr   r  r  rk  r  r   r  )rL   r
  rm  r  r^   r   r  rG  r  r6  r  r(  r   r:   r   r   r   r   r   rB   rj  rk  ri  )r0   r  r   r  r   rt  r  ru  r  r   re  r  r  r
  r  rl  rv  r   rB   r  sequence_output	lm_logitsr  loss_fctoutputs                            r4   rD   z$UMT5ForConditionalGeneration.forward  so   j "+!6IIDK<Q	%0%<kk$+B] ""ll#-+#"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O (*"3";@U@] $ 1 1& 9 9 ,,'1/+"/#1'!5/!5#) ' 
 
  *!,;* 	G .1EFOLL11	'T:::HYYy/00F8INN2y~~b/A/ABBFKKPROOTTD 	F\OABB$77/IF)-)9TGf$$vE+;"1"?.9,=&5&G"1"?.9

 

 

 
	
r5   c                 ,    |                      |          S r]   )rG  )r0   r  s     r4   %prepare_decoder_input_ids_from_labelszBUMT5ForConditionalGeneration.prepare_decoder_input_ids_from_labelsa  s      (((r5   )NNNNNNNNNNNNNNNNN)rE   rF   rG   r   r  r  r*   r  ra  r  r  r   r   r,   r  r  r  r_   rs  r   r   r   r   rD   r  rH   rI   s   @r4   r0  r0  z  sc          Jiii    0  : : :O O O    156:8<=A159=7;@D+/59=A-1$(,0/3&*59%_
 _
E,-_
 !!23_
 $E$45	_

 !))9 :_
 E-._
 $E$56_
 'u|4_
 "%el(;"<=_
 "%_
   12_
  ((9:_
 )*_
 D>_
 $D>_
  'tn!_
" d^#_
$ !!12%_
& 
uU&'8	9'_
 _
 _
 ^_
D)EL ) ) ) ) ) ) ) )r5   r0  c                   4    e Zd ZdZdZdgZ fdZd Zd Zd Z	d Z
d	 Ze	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 dee         dee         dee         deeej                 ef         fd            Z xZS )r1  a  
    Examples:

    ```python
    >>> from transformers import UMT5EncoderModel, AutoTokenizer

    >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
    ```r  r  c                 2   t                                          |           t          j        |j        |j                  | _        t          j        |          }d|_	        d|_
        t          || j                  | _        |                                  d S NF)r)   r*   r   r   r  rR   r3  r  r  r
  rn  rQ  r  r]  )r0   rL   r  r3   s      r4   r*   zUMT5EncoderModel.__init__y  s}       l6#4fnEEv..#( ,1) == 	r5   c                     | j         S r]   r  r  s    r4   r  z%UMT5EncoderModel.get_input_embeddings  r  r5   c                 H    || _         | j                            |           d S r]   )r3  r  ra  r_  s     r4   ra  z%UMT5EncoderModel.set_input_embeddings  s%    $)).99999r5   c                 l    | j         j        r'|                     | j        j        | j                   d S d S r]   )rL   r6  r  r  rW  r3  r  s    r4   r  zUMT5EncoderModel._tie_weights  s?    ;* 	O&&t|'@$+NNNNN	O 	Or5   c                     | j         S r]   r  r  s    r4   r  zUMT5EncoderModel.get_encoder  r  r5   c                     |                                 D ]:\  }}| j        j        |         j        d         j                            |           ;dS )r  r   N)r  r  rZ  r   r   r  r  s       r4   r  zUMT5EncoderModel._prune_heads  s]    
 +0022 	P 	PLE5Lu%+A.<HHOOOO	P 	Pr5   Nr  r   rt  re  r  rl  rv  r   c           	      ^    ||n| j         j        }|                     |||||||          }|S )aQ  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  )rL   rm  r  )	r0   r  r   rt  re  r  rl  rv  r  s	            r4   rD   zUMT5EncoderModel.forward  sM    F &1%<kk$+B],,)'/!5# ' 
 
 r5   )NNNNNNN)rE   rF   rG   r   r  r  r*   r  ra  r  r  r  r   r   r,   r  r  r   r   rs  r   rD   rH   rI   s   @r4   r1  r1  e  sk         J78
 
 
 
 
  : : :
O O O
  P P P  156:1559,0/3&*- -E,-- !!23- E-.	-
   12- $D>- 'tn- d^- 
uU&'8	9- - - ^- - - - -r5   r1  z
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c            $           e Zd ZdgZddgZdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 d	ee	j
                 d
ee	j
                 dee	j                 dee	j                 dee	j                 deee	j                          dee	j                 dee	j                 dee	j
                 dee         dee         dee         dee         deeef         f d            Z xZS )UMT5ForSequenceClassificationFdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightr  r  rL   c                     t                                          |           t          |          | _        t	          |          | _        |                                  d| _        d S r  )r)   r*   r/  r  r  classification_headr]  model_parallelr[   s     r4   r*   z&UMT5ForSequenceClassification.__init__  s[       $V,,#9&#A#A  	#r5   Nr  r   r  r   rt  r  ru  r  re  r  r  r
  r  rl  rv  r   c                 p   ||n| j         j        }|d}||	t          d| j        j                   |(|
&|t          d          |                     |          }|                     |||||||||	|
||||          }|d         }|                    | j         j	                  
                    |j                  }t          t          j        |                    d                              dk    rt          d          |j        \  }}}||ddf                             |d	|          ddd	ddf         }|                     |          }d}||
                    |j                  }| j         j        p| j         j        dk    rd
| j         _        nS| j         j        dk    r7|j        t          j        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        d
k    r\t1                      }| j         j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rLt5                      } ||                    d	| j         j                  |                    d	                    }n*| j         j        dk    rt7                      } |||          }|s|f|dd         z   }||f|z   n|S t9          |||j        |j        |j        |j         |j!        |j"        |j#        	  	        S )as
  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   r  r   rt  r  ru  r  re  r  r
  r  rl  rv  r   r!   z7All examples must have the same number of <eos> tokens.r8   
regressionsingle_label_classificationmulti_label_classificationr  )$rL   rm  NotImplementedErrorr3   rE   r@  rG  r  eqeos_token_idr:   r   r  r,   unique_consecutivesumr   r   r  problem_typer  r?   r   r   r   squeezer   r   r   r   r  r  rk  r  r   r  )r0   r  r   r  r   rt  r  ru  r  re  r  r  r
  r  rl  rv  r   r  eos_maskr   r}  r1   sentence_representationr  r  r  r  s                              r4   rD   z%UMT5ForSequenceClassification.forward  s   | &1%<kk$+B]I!:%d4>Kbdd   $)>)F  U  
 !% 1 1) < <"")/#9/!5+'"7/!5# # 
 
  "!*<< 899<<_=STTu'Q8899A==VWWW%4%:"
A{"1(AAA+">"C"CJPRT_"`"`abababdfhihihiai"j))*ABBYYv}--F{'/;)Q../;DK,,[+a//V\UZ5O5OSYS_chclSlSl/LDK,,/KDK,{'<77"99;)Q..#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB0F G GUWYY)-III,..x// 	FY,F)-)9TGf$$vE.#3")"?&9$5&-&G")"?&9

 

 

 
	
r5   )NNNNNNNNNNNNNNN)rE   rF   rG   "_keys_to_ignore_on_load_unexpectedr  r"   r*   r   r   r,   r  r_   r   r  r   r   rs  r   rD   rH   rI   s   @r4   r  r    s        +s)s&79VW$z $ $ $ $ $ $  15158<=A,0487;=A59=A-1$(,0/3&*!P
 P
E,-P
 !.P
 $E$45	P

 !))9 :P
 EL)P
 $EL1P
 'u|4P
 "$u'8"9:P
   12P
  ((9:P
 )*P
 D>P
 $D>P
 'tnP
  d^!P
" 
u55	6#P
 P
 P
 ^P
 P
 P
 P
 P
r5   r  c                   6    e Zd ZdgZdgZdef fdZe	 	 	 	 	 	 	 	 ddee	j
                 dee	j
                 dee	j
                 d	ee	j
                 d
ee	j
                 dee         dee         dee         deee	j
                 ef         fd            Z xZS )r8  r  z'transformer.encoder.embed_tokens.weightrL   c                 6   t                                          |           |j        | _        t          |          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S r]   )r)   r*   r  r1  r  r   rV   r  rX   rQ   r1   r+  r]  r[   s     r4   r*   z#UMT5ForTokenClassification.__init__  sz        ++F33z&";<<)F$68IJJ 	r5   Nr  r   rt  re  r  r  rl  rv  r   c	           	         ||n| j         j        }|                     |||||||          }	|	d         }
|                     |
          }
|                     |
          }d}|Ft                      } ||                    d| j                  |                    d                    }|s||	dd         f}||f|z   n|S t          |||	j	        |	j
                  S )aB  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        N)r   rt  re  r  rl  rv  r   r8   r7   )r  r  rB   rj  )rL   rm  r  rX   r+  r   r   r  r   rB   rj  )r0   r  r   rt  re  r  r  rl  rv  r   rB   r  r  r  r  s                  r4   rD   z"UMT5ForTokenClassification.forward  s   6 &1%<kk$+B]"")'/!5# # 
 
  
]33//'))H8FKKDO<<fkk"ooNND 	Fgadm,F)-)9TGf$$vE$!/)	
 
 
 	
r5   )NNNNNNNN)rE   rF   rG   r  r  r"   r*   r   r   r,   r_   r   r   rs  r   rD   rH   rI   s   @r4   r8  r8  {  s.       *r)s&CD	z 	 	 	 	 	 	  -115,004)-,0/3&*7
 7
EL)7
 !.7
 EL)	7

  -7
 &7
 $D>7
 'tn7
 d^7
 
uU\"$99	:7
 7
 7
 ^7
 7
 7
 7
 7
r5   r8  c            &       2    e Zd ZddgZ fdZd Zd Zd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	e
ej                 d
e
ej                 de
ej                 de
ej                 de
ej                 de
ej                 de
ej                 de
eeej                                   de
ej                 de
ej                 de
ej                 de
ej                 de
e         de
e         de
e         de
e         deeej                 ef         f"d            Z xZS )r2  r  r  c                 H   t                                          |           |j        | _        t	          j        |j        |j                  | _        t          j	        |          }d|_
        d|_        d|_        t          || j                  | _        t          j	        |          }d|_
        d|_        |j        |_        t          || j                  | _        |j        | _        t	          j        |j        |j                  | _        |                                  d S r  )r)   r*   rR   r  r   r   r  r3  r  r  ry   r
  r  rQ  r  r  rY  r  r  rQ   r)  r]  r  s       r4   r*   z!UMT5ForQuestionAnswering.__init__  s       l6#4fnEEv..$)!#( -2* ==v..$(!-2*$*$=! == +)FNF4EFF 	r5   c                     | j         S r]   r  r  s    r4   r  z-UMT5ForQuestionAnswering.get_input_embeddings  r  r5   c                 |    || _         | j                            |           | j                            |           d S r]   r  r_  s     r4   ra  z-UMT5ForQuestionAnswering.set_input_embeddings  r  r5   c                     | j         j        rL|                     | j        j        | j                   |                     | j        j        | j                   d S d S r]   r  r  s    r4   r  z%UMT5ForQuestionAnswering._tie_weights  r  r5   c                     | j         S r]   r  r  s    r4   r  z$UMT5ForQuestionAnswering.get_encoder  r  r5   Nr  r   r  r   rt  r  ru  r  start_positionsend_positionsre  r  r
  r  rl  rv  r   c                 4   ||n| j         j        }||n| j         j        }|	|
d}|(|&|t          d          |                     |          }||n| j         j        }||n| j         j        }||                     |||||||          }ne|rct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|d         }| 	                    |||d||||||||	          }|d         }| 
                    |          }|                    dd
          \  }}|                    d
                                          }|                    d
                                          }d}|	|
t          |	                                          dk    r-|	                    d
                              |j                  }	t          |
                                          dk    r-|
                    d
                              |j                  }
|                    d          }|	                    d|          }	|
                    d|          }
t%          |          } |||	          } |||
          }||z   dz  }|s||f|dd         z   |z   }||f|z   n|S t'          ||||j        |j        |j        |j        |j        |j        |j        
  
        S )aI	  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NFr  r  r   r!   r7   r  )r  r   re  r   r   r  rt  ru  r
  r  rl  rv  r8   r   r  )
r  start_logits
end_logitsr   r  r  rk  r  r   r  )rL   rm  r
  r@  rG  r  r^   r   r  r  r)  splitr  r   r   r:   r   r  r   r   r   rB   rj  rk  ri  )r0   r  r   r  r   rt  r  ru  r  r  r   re  r  r
  r  rl  rv  rB   r  r  r  r"  r#  
total_lossignored_indexr  
start_lossend_lossr  s                                r4   rD   z UMT5ForQuestionAnswering.forward  s   x &1%<kk$+B]!*!6IIDK<Q	&=+DI
 $)>)F  U  
 !% 1 1) < <!*!6IIDK<Q	%0%<kk$+B] ""ll#-+#"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O (* ,,'1/ "/#1'!5/!5# ' 
 
 *!,11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="="@"@AT"U"U=%%''((1,, - 5 5b 9 9 < <Z=N O O(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J//!""2EEWF/9/EZMF**6Q2%!+;"1"?.9,=&5&G"1"?.9
 
 
 	
r5   r  )rE   rF   rG   r  r*   r  ra  r  r  r   r   r,   r  r  r  r_   rs  r   r   r   rD   rH   rI   s   @r4   r2  r2    s'       79VW    2  : : :O O O    156:8<=A159=7;@D6:4859=A$(,0/3&*#Z
 Z
E,-Z
 !!23Z
 $E$45	Z

 !))9 :Z
 E-.Z
 $E$56Z
 'u|4Z
 "%el(;"<=Z
 "%"23Z
   01Z
   12Z
  ((9:Z
 D>Z
 $D>Z
  'tn!Z
" d^#Z
$ 
uU&')LL	M%Z
 Z
 Z
 ^Z
 Z
 Z
 Z
 Z
r5   r2  )r1  r0  r2  r  r8  r/  r  )Hr   r  r   typingr   r   r,   r   torch.nnr   r   r   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r   utils.deprecationr    configuration_umt5r"   !torch.nn.attention.flex_attentionr#   integrations.flex_attentionr$   
get_loggerrE   r   Moduler&   rK   rd   rl   rv   r   r   r   r  r  rQ  r/  r0  r1  r  r8  r2  __all__rO  r5   r4   <module>r:     s       " " " " " " " "        A A A A A A A A A A ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) > > > > > > 9 9 9 9 9 9                  . - - - - -                  1 0 0 0 0 0 * * * * * *  !! K;;;;;;JJJJJJ		H	%	%+ + + + +BI + + +4    	   .    RY   <    ")   $D) D) D) D) D)BI D) D) D)N    RY   :    bi   >G G G G G* G G GV    RY   $ o! o! o! o! o!/ o! o! o!dv v v v v# v v vr	 K
 K
 K
 K
 K
# K
 K
 K
\   
c) c) c) c) c)#6 c) c) 
c)L i i i i i* i i iX   `
 `
 `
 `
 `
$7 `
 `
 `
F I
 I
 I
 I
 I
!4 I
 I
 I
X J
 J
 J
 J
 J
2 J
 J
 J
Z  r5   