
     `i                    4   d Z ddlZddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-  e&            rddl.m/Z/ ddl0m1Z1  e)j2        e3          Z4dUdej5        de6de6de6dej5        f
dZ7dej5        de6de6dej5        fdZ8dUdej5        de6de6de6dej5        f
dZ9de6dej5        fdZ:d ej5        de6dej5        fd!Z;d"ej5        de6d#ej<        dej5        fd$Z=d"ej5        d%e6de>ej5        ej5        f         fd&Z?d"ej5        d%e6dej5        fd'Z@d(ej5        d)ej5        d*e6dej5        fd+ZA G d, d-e	jB                  ZC	 dd.lDmEZE eEZCe4F                    d/           n&# eG$ r Y neH$ r e4I                    d0           Y nw xY w G d1 d2e	jB                  ZJ G d3 d4e	jB                  ZK G d5 d6e	jB                  ZL G d7 d8e	jB                  ZM G d9 d:e	jB                  ZN G d; d<e	jB                  ZO G d= d>e	jB                  ZP G d? d@e	jB                  ZQ G dA dBe	jB                  ZR G dC dDe	jB                  ZS G dE dFe          ZTe% G dG dHe                      ZU G dI dJeU          ZVdKZWe% G dL dMeU                      ZX e%dNO           G dP dQeUe                      ZYe% G dR dSeU                      ZZg dTZ[dS )VzPyTorch LongT5 model.    N)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging)deprecate_kwarg   )LongT5Config)	BlockMask)make_flex_block_causal_maskx	block_lendim	pad_valuereturnc                 j   | j         |          |z  }t          | j                   s?t          | j                   }||xx         |z  cc<   t          j        || j                  S dg| j        z  }d|f||<   t          |ddd         d          }t          j	        
                    | |d|          } | S )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr*   ndimsumr   
functionalr0   )r#   r$   r%   r&   pad_len	new_shaper0   s          ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multipler>   @   s    ws|mi'Gqw<< 5MM	#'!{9AG4444(QV
C7|CH
c$$B$i

C
!:YGGAH    c                 2   | j         |         |z  dk    rt          | ||d          } | j         |         |z  }| j         d|         ||fz   | j         |dz   d         z   }d|v r!t          j        || j        | j                  S |                     |          S )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
    r   )r&   Nr   r*   device)r3   r>   r6   emptyr*   rB   reshape)r#   r$   r%   
num_blocksoutput_shapes        r=   _split_into_blocksrG   P   s    
 	ws|i1$$Q	3!<<<*J74C4=J	#::QWcAg[[=QQLL{<qwqxHHHH99\"""r?   	block_dimsequence_dimc                    | j         |         }dg| j        z  }d||<   t          |ddd         d          }t          j                            | |d|          } g }t          d          D ][}t          d	d          g| j        z  }t          |||z             ||<   t          |          }|	                    | |                    \t          j        ||
          S )zConcatenate three consecutive blocks for each input block for local attentiont.

    For more information, see: https://huggingface.co/papers/2112.07916.
    r+   )r   r   Nr,   r-   r.   r/   r   r   r%   )r3   r8   r9   r   r:   r0   rangeslicetupleappendr6   cat)	r#   rH   rI   r&   rE   r0   blocks_listiindicess	            r=   _concatenate_3_blocksrT   _   s    
 #J(QV
CC	N
c$$B$i

C
!:YGGA&(K1XX ' ' D>>"QV+"1a*n55	..1W:&&&&9[l3333r?   c                     t          j        d| z  t           j                  }|| |           }|                    d          |                    d          z
  }|S )z:Makes 3-blocked relative position ids for local attention.r   r)   r   r   )r6   arangeint32	unsqueeze)r$   position_idscenter_position_idsrelative_position_idss       r=   "_make_3block_relative_position_idsr\   x   s]    <IU[AAAL&y)';<(221558K8U8UVW8X8XX  r?   local_attention_maskc                     t          |          }t          j        |          |k     }|ddddddf         }|                    | j                  }t          j        | |          S )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)r\   r6   abstorB   logical_and)r]   r$   r[   locality_masks       r=   _mask_local_attention_maskrc      sk    >yIII344y@M!$aaa"23M!$$%9%@AAM1=AAAr?   attention_maskrB   c                 8   t          | |d          }t          |dd          }|                    d          }|                    d          }t          j        ||          }t          ||          }|                    d                              |          S )z;Prepare attention mask to be applied for a local attention.r   rK      rH   rI   r,   )rG   rT   rX   r6   ra   rc   r`   )rd   r$   rB   _blocked_attention_mask_3blocked_attention_maskr]   s         r=   _get_local_attention_maskrk      s     1PQRRR45LXYhijjj5??CC7AA"EE ,-DF^__56JIVV))!,,//777r?   global_block_sizec                 ^   | j         dd         \  }dt          j        dt          j        ffd}t          j        | | j                  z  }t          j        |d          |z
  }t          j        | d	k    d
d                              | j                  }t          j	        ||z   d
z
                                | j                  }t          j
        d|j        |j                  }t          j        ||k    ||          }|| z  | dz
  z   } ||          }z  }|dk    rDt          j        |d          j                            |d                              dd          }	n"t          j        |d|j        |j                  }	t          j        t          j        ||          d          dz
  }
|
                    | j                  }
t          j        |
|	k    dd          }
|                    t          j                  |
                    t          j                  fS )a  Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformr implementation adopted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
    the whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
    Nrf   	block_idsr'   c                 d   t          j                  z  dz
  k    }|                    | j                  }t          j        || dk              }|                    d                              d                              | j                  dz
  }t          j	        | |k     | |          } | S )Nr   r   r,   )
r6   rV   r`   rB   ra   r9   rX   typer*   where)rn   
block_endstrue_block_endsfull_blocksrl   seq_lens       r=   handle_orphan_tokensz:_make_global_fixed_block_ids.<locals>.handle_orphan_tokens   s    l7++.??DUXYDYY
]]9#344
+J	QGG%))"--77;;@@QQTUUK	K 7KPP	r?   rB   r   )axis              ?g     @r,   rA   r   rK   )r3   r6   Tensor	ones_likerB   cumsumrq   rp   r*   floortensormaxvaluesrepeat	transposer7   onesr`   int)rd   rl   
batch_sizerv   fixed_block_maskmaskglobal_block_ids_global_block_ids_lower_boundnum_globals_sequence_block_ids_maxglobal_segment_idsru   s    `         @r=   _make_global_fixed_block_idsr      sP    ).rr2J         ~n>STTTWhh|$41===@PP;~,c7;;@@AUVVD{4*:#:S#@AAFF~G[\\$)L;K;QZjZq$r$r$r!{88:JLi  )>9nq>PQ++,<==..KQ"'),<""E"E"E"L"S"ST_ab"c"c"m"mnoqr"s"s"'+!1!7@P@W#
 #
 #
 ej[&I&IrRRRUVV+..~/DEE%7;R%RTUWXYY  ++-?-D-DUY-O-OOOr?   c                     t          | |          \  }}|j        d         }t          j        ||j                  }||d         z
  }|                    t          j                  S )zBCreate the relative position tensor for local -> global attention.r,   rw   .N)r   r3   r6   rV   rB   rp   int64)rd   rl   rn   r   global_seq_lenglobal_positionsside_relative_positions          r=    _make_side_relative_position_idsr      sc    $@Qb$c$c!I!'-b1N|N9;KLLL-	)0DD!&&u{333r?   hidden_statesrn   r   c                 n   |                     |dk    t          j        ||j        |j                            }t
          j                            |                    t          j	                  |dz             ddddddf         }t          j
        d| |                    | j                            S )zFCompute individual block aggregates by summing over individual blocks.r   rA   r   Nr,   z...nd,...ng->...gd)rq   r6   r   r*   rB   r   r:   one_hotrp   r   einsum)r   rn   r   one_hot_block_idss       r=   _create_global_aggregatesr      s    
 Q^9?S\Scddd I --innU[.I.I>\]K]^^_`_`_`bcbcbcehfheh_hi<,m=N=S=STaTg=h=hiiir?   c                   &     e Zd Zd fd	Zd Z xZS )LongT5LayerNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )zg
        Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parameterr6   r   weightvariance_epsilon)selfhidden_sizeeps	__class__s      r=   r   zLongT5LayerNorm.__init__   sD     	l5:k#:#:;; #r?   c                 h   |                     t          j                                      d                              dd          }|t          j        || j        z             z  }| j        j        t          j	        t          j
        fv r|                     | j        j                  }| j        |z  S )Nrf   r,   T)keepdim)r`   r6   float32powmeanrsqrtr   r   r*   float16bfloat16)r   r   variances      r=   forwardzLongT5LayerNorm.forward   s     !##EM2266q99>>r4>PP%Ht?T4T(U(UU ; ???),,T[->??M{]**r?   )r   )__name__
__module____qualname__r   r   __classcell__r   s   @r=   r   r      sL        $ $ $ $ $ $+ + + + + + +r?   r   )FusedRMSNormzSDiscovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNormzFdiscovered apex but it failed to load, falling back to LongT5LayerNormc                   *     e Zd Zdef fdZd Z xZS )LongT5DenseActDenseconfigc                 J   t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        |j	                  | _
        t          |j                 | _        d S NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr	   dense_act_fnactr   r   r   s     r=   r   zLongT5DenseActDense.__init__
  sx    )FNFKeDDD)FKeDDDz&"566&-.r?   c                    |                      |          }|                     |          }|                     |          }t          | j        j        t          j                  r]|j        | j        j        j        k    rC| j        j        j        t          j	        k    r$|
                    | j        j        j                  }|                     |          }|S N)r   r   r   
isinstancer   r   r6   r{   r*   int8r`   )r   r   s     r=   r   zLongT5DenseActDense.forward  s    ..//]33tw~u|44	C#tw~';;;$
22),,TW^-ABBM..r?   r   r   r   r    r   r   r   r   s   @r=   r   r   	  sS        /| / / / / / /      r?   r   c                   *     e Zd Zdef fdZd Z xZS )LongT5DenseGatedActDenser   c                    t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j	        |j
                  | _        t          |j                 | _        d S r   )r   r   r   r   r   r   wi_0wi_1r   r   r   r   r	   r   r   r   s     r=   r   z!LongT5DenseGatedActDense.__init__   s    IfnfkFFF	IfnfkFFF	)FKeDDDz&"566&-.r?   c                     |                      |                     |                    }|                     |          }||z  }|                     |          }|                     |          }|S r   )r   r   r   r   r   )r   r   hidden_geluhidden_linears       r=   r   z LongT5DenseGatedActDense.forward(  sb    hhtyy7788		-00#m3]33..r?   r   r   s   @r=   r   r     sS        /| / / / / / /      r?   r   c                   *     e Zd Zdef fdZd Z xZS )LongT5LayerFFr   c                 $   t                                                       |j        rt          |          | _        nt          |          | _        t          |j        |j                  | _	        t          j        |j                  | _        d S )Nr   )r   r   is_gated_actr   DenseReluDenser   r   r   layer_norm_epsilon
layer_normr   r   r   r   r   s     r=   r   zLongT5LayerFF.__init__3  sx     	>":6"B"BD"5f"="=D)&.f>WXXXz&"566r?   c                     |                      |          }|                     |          }||                     |          z   }|S r   )r   r   r   )r   r   forwarded_statess      r=   r   zLongT5LayerFF.forward=  sF    ??=99../?@@%5E(F(FFr?   r   r   s   @r=   r   r   2  sS        7| 7 7 7 7 7 7      r?   r   c                        e Zd Z	 	 ddedee         f fdZd Zedd
            Z	ddZ
 eddd          	 	 	 	 	 	 	 	 	 dd            Z xZS )LongT5AttentionFNr   	layer_idxc                 P   t                                                       |j        | _        || _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _
        |j        | _        | j
        | j        z  | _        || _        |/| j        r(t                              d| j        j         d           t'          j        | j        | j        d          | _        t'          j        | j        | j        d          | _        t'          j        | j        | j        d          | _        t'          j        | j        | j        d          | _        | j        r$t'          j        | j        | j
                  | _        t7                      | _        d| _        d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biassetpruned_headsgradient_checkpointingr   r   r   r   r   s       r=   r   zLongT5Attention.__init__F  su    	 ++F(.4.S+/5/U,~"(+'*(??",4>+B , , ,   4<eDDD4<eDDD4<eDDD4>4<eDDD+ 	k+-<8[]a]i+j+jD(EE&+###r?   c                    t          |          dk    rd S t          || j        | j        | j                  \  }}t          | j        |          | _        t          | j        |          | _        t          | j        |          | _        t          | j	        |d          | _	        | j        t          |          z
  | _        | j        | j        z  | _
        | j                            |          | _        d S Nr   r   rK   lenr   r   r   r   r   r   r   r   r   r   unionr   headsindexs      r=   prune_headszLongT5Attention.prune_headsi      u::??F74<!8$:K
 
u $DFE22#DFE22#DFE22#DFEq999|c%jj004<? -33E::r?   T       c                 P   d}|rC|dz  }|| dk                         t          j                  |z  z  }t          j        |           } n(t          j        | t          j        |                      } |dz  }| |k     }|t          j        |                                 |z            t          j        ||z            z  ||z
  z                       t          j                  z   }t          j        |t          j	        ||dz
                      }|t          j
        || |          z  }|S a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   rf   r   r`   r6   longr_   min
zeros_likelogfloatmath	full_likerq   relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r=   _relative_position_bucketz)LongT5Attention._relative_position_buckety  s>   ,  	cAK!2Q!6 : :5: F F TT %	*; < <!&+<e>NO`>a>a!b!b b  1$	$y0 &/I'--//);<<h|i/001Y&( "UZ..	&"
 &+Y&8RT_bcTc(d(d&
 &
" 	EK2CE_```r?   c                    || j         j        j        }|,t          j        |t          j        |          dddf         }n|dddf                             |          }t          j        |t          j        |          dddf         }||z
  }|                     || j         | j	        | j
                  }|                      |          }	|	                    g d                              d          }	|	S )%Compute binned relative position biasNrA   r  r  r  rf   r   r   r   )r   r   rB   r6   rV   r
  r`   r  r   r   r   permuterX   )
r   query_length
key_lengthrB   cache_positioncontext_positionmemory_positionr  relative_position_bucketr   s
             r=   compute_biaszLongT5Attention.compute_bias  s   >18?F!$|L
SYZZZ[\[\[\^b[bc-aaag699&AA,zFSSSTXZ[Z[Z[T[\+.>>#'#A#A#.;=	 $B $
 $
  --.FGG			**44Q77r?   past_key_valuepast_key_values4.58new_nameversionc                    |j         dd         \  }}|du}|                     |          }|                    |d| j        | j                                      dd          }d}t          |t                    r1|j        	                    | j
                  }|r|j        }n
|j        }n|}|r|n|}|r3|1|r/|j        | j
                 j        }|j        | j
                 j        }n|                     |          }|                     |          }|                    |d| j        | j                                      dd          }|                    |d| j        | j                                      dd          }|N|s|
nd}
|                    ||| j
        d|
i          \  }}|r$t          |t                    rd|j        | j
        <   t'          j        ||                    dd                    }||j         d	         }||n
|
d         dz   }| j        s@t'          j        d| j        ||f|j        |j        
          }| j        r| j        rd|_        n3|                     |||j        |
          }|dddd| dddf         }|$|ddddddd|j         d	         f         }||z   }| j        rUt'          j        |j         d                   }d|t?          | j                  <   |dd|                                 f         }n|}||z  }tB          j"        #                    |$                                d          %                    |          }tB          j"        &                    || j&        | j                  }|||z  }t'          j        ||          }|                    dd          '                                }|                    |d| j(                  }| )                    |          }||f}|	r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nrf   r,   r   Fr"  Tr   rh   rB   r*   )rB   r"  r   rK   ptraining)*r3   r   viewr   r   r   r   r   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysr   r   r   updater6   matmulr   r7   rB   r*   r   r1  requires_gradr&  r   r   r5   boolr   r:   softmaxr  type_asr   
contiguousr   r   )r   r   r   key_value_statesposition_biasr(  layer_head_maskr   	use_cacheoutput_attentionsr"  r   
seq_lengthis_cross_attentionquery_statesr3  curr_past_key_valuecurrent_states
key_statesvalue_statesscoresr!  real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputss                               r=   r   zLongT5Attention.forward  su   & "/!4RaR!8
J .T9vvm,,#((RtG^__iijkmnoo 
o':;; 	2(377GGJ! K&5&K##&5&J##"1-?R))] 	F/"=*"=,3DNCHJ.5dnELLL//J66.11L#RtG^__iijkmnooJ',,ZT\4KbccmmnoqrssL*7I!St+>+E+Edn?OQ_>`, ,(
L & F*_FY*Z*Z FAEO.t~> lJ,@,@A,F,FGG #)"-J.:.FllN[]L^abLbO3 
E %j*=fm[a[g! ! ! . 74= 726M/ $ 1 1#ZVd !2 ! ! !.aaaZKLL!!!.C D"111aaa,Bj.>r.B,B#BC - ; 	1:m1!455D,-Dd'(()#0DIIKK#@  #0 && },,V\\^^,DDLLVTT},,\T\TXTa,bb &'/9Ll<>>!++Aq11<<>>!&&z2t~FFff[)). 	0/Gr?   FNTr  r  )NN)	NNNNNNFFN)r   r   r   r    r   r   r   r  staticmethodr  r&  r   r   r   r   s   @r=   r   r   E  s         %*#'	!, !,!, C=	!, !, !, !, !, !,F; ; ;  -  -  -  \- ^   ( _%0A6RRR m m m SRm m m m mr?   r   c                   h     e Zd Zddededdf fdZd Zedd            Zde	fdZ
	 	 	 	 ddZ xZS )LongT5LocalAttentionFr   r   r'   Nc                    t                                                       |j        | _        || _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _
        |j        | _        | j        dz   | _        |j        | _        | j
        | j        z  | _        t!          j        | j        | j        d          | _        t!          j        | j        | j        d          | _        t!          j        | j        | j        d          | _        t!          j        | j        | j        d          | _        | j        r$t!          j        | j        | j
                  | _        t1                      | _        d| _        d S )Nr   Fr   )r   r   r   r   r   r   r   r   r   r   r   local_radiusr$   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s      r=   r   zLongT5LocalAttention.__init__/  sF    ++F(.4.S+/5/U,~"(+'"/*Q.*(?? 4<eDDD4<eDDD4<eDDD4>4<eDDD+ 	k+-<8[]a]i+j+jD(EE&+###r?   c                    t          |          dk    rd S t          || j        | j        | j                  \  }}t          | j        |          | _        t          | j        |          | _        t          | j        |          | _        t          | j	        |d          | _	        | j        t          |          z
  | _        | j        | j        z  | _
        | j                            |          | _        d S r   r   r   s      r=   r  z LongT5LocalAttention.prune_headsI  r  r?   Tr  r  c                 P   d}|rC|dz  }|| dk                         t          j                  |z  z  }t          j        |           } n(t          j        | t          j        |                      } |dz  }| |k     }|t          j        |                                 |z            t          j        ||z            z  ||z
  z                       t          j                  z   }t          j        |t          j	        ||dz
                      }|t          j
        || |          z  }|S r  r	  r  s           r=   r  z.LongT5LocalAttention._relative_position_bucketY  >   .  	cAK!2Q!6 : :5: F F TT %	*; < <!&+<e>NO`>a>a!b!b b  1$	$y0 &/I'--//);<<h|i/001Y&( "UZ..	&"
 &+Y&8RT_bcTc(d(d&
 &
" 	EK2CE_```r?   block_lengthc                    | j         j        j        j        dk    r| j         j        j        nd}t	          j        d|z  t          j        |          }|||          }|dddf         |dddf         z
  }|                     || j         | j	        | j
                  }|                      |          }|                    g d                              d                              d          }|S r  metaNr   rA   r  r  r   r   r   rB   rp   r6   rV   r
  r  r   r   r   r  rX   r   r^  target_devicer$  r#  r  r%  r   s           r=   r&  z!LongT5LocalAttention.compute_bias      +29>&HH (/66 	
  ,q<'7uzR_```*<+EF ,D!!!G47G47PP#'#A#A#.;=	 $B $
 $
  --.FGG			**44Q77AA!DDr?   c                     |j         d d         \  } fd} fd} |                     |                    }	 |                     |                    }
 |                     |                    }t	          |	 j        d          }	t	          |
 j        d          }
t	          | j        d          }t          |
dd          }
t          |dd          }t          j        d|	|
          }| j	        sNt          j
        dd j         j        d j        z  f|j        |j        	          } j        r j        rd
|_        n                      j                  }|3t          j        |dk    dd          }||                    dd          z   }||z  }t(          j                            |                                d                              |          }t(          j                            | j         j                  }|||z  }|                    |j                  } |t          j        d||                    }|d d d |d d f         }                     |          }||f}|r||fz   }|S )Nrf   c                 H    |                      dj        j                  S 
projectionr,   r2  r   r   statesr   r   s    r=   r3   z+LongT5LocalAttention.forward.<locals>.shape       ;;z2t|T=TUUUr?   c                 `    |                                                      dj                  S rD   r,   r?  r2  r   rk  s    r=   unshapez-LongT5LocalAttention.forward.<locals>.unshape  )    $$&&++JDNKKKr?   r   rK   rg   ...qhd,...khd->...hqkr   r.  Tr   ry       _r,   r/  ...hqk,...khd->...qhd)r3   r   r   r   rG   r$   rT   r6   r   r   r7   r   rB   r*   r   r1  r;  r&  rq   r   r   r:   r=  r  r>  r   rp   r   )r   r   r   rA  rB  rD  rE  r3   rq  rG  rJ  rK  rL  rP  rQ  rR  r   s   `               @r=   r   zLongT5LocalAttention.forward  s    "/!4RaR!8
J	V 	V 	V 	V 	V 	V	L 	L 	L 	L 	L 	L
 uTVVM2233U466-0011
uTVVM2233 *,ANNN'
DNJJJ
),ANNN +:QRSSS
,\QUVWWW #\:
 
  3 B %4<T^9KLU[Ubjpjv! ! ! . 74= 726M/ $ 1 1$. A A{4!8S%88 -q!0D0D D-},,V\\^^,DDLLVTT},,\T\TXTa,bb &'/9L#(();<<gel+BLR^__``!!!![j[!!!"34ff[)) 

  	0/Gr?   FrT  NNNF)r   r   r   r    r<  r   r  rU  r  r   r&  r   r   r   s   @r=   rW  rW  .  s        , ,| ,$ ,[_ , , , , , ,4; ; ;  -  -  -  \- ^    6 K K K K K K K Kr?   rW  c                        e Zd Zddededdf fdZd Zedd            Zde	fdZ
dej        dej        dej        fdZ	 	 	 	 ddZ xZS )LongT5TransientGlobalAttentionFr   r   r'   Nc                    t                                                       |j        | _        || _        |j        | _        |j        | _        |j        | _        |j        | _        |j	        | _
        |j        | _        | j        dz   | _        |j        | _        |j        | _        | j
        | j        z  | _        t#          j        | j        | j        d          | _        t#          j        | j        | j        d          | _        t#          j        | j        | j        d          | _        t#          j        | j        | j        d          | _        | j        r$t#          j        | j        | j
                  | _        t3                      | _        | j        r$t#          j        | j        | j
                  | _        t9          |j        |j                  | _        d S )Nr   Fr   r   )r   r   r   r   r   r   r   r   r   r   r   rY  r$   rl   r   r   r   r   r   r   r   r   r   r   r   r   r   global_relative_attention_biasr   r   global_input_layer_normrZ  s      r=   r   z'LongT5TransientGlobalAttention.__init__  s    ++F(.4.S+/5/U,~"(+'"/*Q.!'!9*(?? 4<eDDD4<eDDD4<eDDD4>4<eDDD+ 	k+-<8[]a]i+j+jD(EE + 	r24,t?bdhdp2q2qD/'6v~6Kd'e'e'e$$$r?   c                    t          |          dk    rd S t          || j        | j        | j                  \  }}t          | j        |          | _        t          | j        |          | _        t          | j        |          | _        t          | j	        |d          | _	        | j        t          |          z
  | _        | j        | j        z  | _
        | j                            |          | _        d S r   r   r   s      r=   r  z*LongT5TransientGlobalAttention.prune_heads  r  r?   Tr  r  c                 P   d}|rC|dz  }|| dk                         t          j                  |z  z  }t          j        |           } n(t          j        | t          j        |                      } |dz  }| |k     }|t          j        |                                 |z            t          j        ||z            z  ||z
  z                       t          j                  z   }t          j        |t          j	        ||dz
                      }|t          j
        || |          z  }|S r  r	  r  s           r=   r  z8LongT5TransientGlobalAttention._relative_position_bucket   r]  r?   r^  c                    | j         j        j        j        dk    r| j         j        j        nd}t	          j        d|z  t          j        |          }|||          }|dddf         |dddf         z
  }|                     || j         | j	        | j
                  }|                      |          }|                    g d                              d                              d          }|S r`  rb  rc  s           r=   r&  z+LongT5TransientGlobalAttention.compute_biasQ  re  r?   r   r   c                 ~   t          j        |d         |d d d d d f                   d d d df         }t          j        |dk    dd          }t          || j                  }|                     || j         | j        | j                  }| 	                    |          }|
                    g d          }||z   }|S )Nr   .r   ry   rt  r  )r   r   r   rf   )r6   eqrq   r   rl   r  r   r   r   r{  r  )r   r   r   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biass           r=   compute_side_biasz0LongT5TransientGlobalAttention.compute_side_biasi  s    #htI8J111dTUTUTU:8VWWXYXYXY[_adXde#k*=*A3NN!A$H^!_!_(,(F(F"#.;=	 )G )
 )
% 778UVV	 %%lll33	1I=""r?   c                 `	    |j         d d         \  } fd} fd}t          ||n t          j        |j         d d                    j                  \  }	}
|
j         d         }t          ||	|          }                     |          } |                     |                    } |                     |                    } | 	                    |                    } |                     |                    } | 	                    |                    }t          | j        d          }t          | j        d          }t          | j        d          }t          |dd          }t          |dd          }dg|j        dz   z  }|j         d         |d<   |                    d                              |          }|                    d                              |          }t          j        ||gd          }t          j        ||gd          }t          j        d||          }|6t%          | j        |j                  }t          j        |d	k    d
d          }nd }|F j        sNt          j        dd j         j        d j        z  f|j        |j                  } j        r j        rd|_        n                      j                  }|||                    dd          z   }|                    |j                  }|t          j        |          }                     ||
          }t          | j        d                              dd          }|                    |j                                       |j                  }t          j        ||gd          }||z  }tB          j"        #                    |$                                d          %                    |          }tB          j"        &                    | j&         j                  }|||z  }|                    |j                  } |t          j        d||                    }|d d d |d d f         } '                    |          }||f}|r||fz   }|S )Nrf   c                 H    |                      dj        j                  S rh  rj  rk  s    r=   r3   z5LongT5TransientGlobalAttention.forward.<locals>.shape  rm  r?   c                 `    |                                                      dj                  S ro  rp  rk  s    r=   rq  z7LongT5TransientGlobalAttention.forward.<locals>.unshape  rr  r?   r,   r   rK   rg   rs  r   ry   rt  r   r.  Trh   r/  ru  )(r3   r   r6   r   rl   r   r|  r   r   r   rG   r$   rT   r8   rX   r   rP   r   rk   rB   rq   r   r7   r   r*   r   r1  r;  r&  r   rp   r  r`   r   r:   r=  r  r>  r   r   )r   r   r   rA  rB  rD  rE  r3   rq  rn   r   _global_seq_lenglobal_inputsrG  rJ  rK  side_key_statesside_value_statesrepsrL  r]   side_position_biasrP  rQ  rR  r   s   `                        @r=   r   z&LongT5TransientGlobalAttention.forward~  s    "/!4RaR!8
J	V 	V 	V 	V 	V 	V	L 	L 	L 	L 	L 	L )E$DD%*]5H"5M*N*N")
 )
%	%
 -2261-O\\44]CC uTVVM2233U466-0011
uTVVM2233%} 5 566!E$&&"7"788 *,ANNN'
DNJJJ
),ANNN +:QRSSS
,\QUVWWW so*Q./"1%Q)33A66==dCC-77::AA$GG Y
O<!DDD
y,0A!BJJJ 5|ZPP#<T4>S`Sg#h#h #(;/Ca/Ge#T#T  #'  3 	B %4<T^9KL!= ,! ! !
 . 74= 726M/ $ 1 1$. A A#/ -0D0N0NqRS0T0T T)..v|<<M |z*j99!%!7!7>P!Q!Q!34F\^!_!_!_!i!ijkmn!o!o!3!8!8!F!F!I!I&-!X!X!I}6H&IrRRRM-},,V\\^^,DDLLVTT},,\T\TXTa,bb &'/9L#(();<<gel+BLR^__``!!!![j[!!!"34ff[)). 	0/Gr?   rv  rT  rw  )r   r   r   r    r<  r   r  rU  r  r   r&  r6   r{   r  r   r   r   s   @r=   ry  ry    s       f f| f$ f[_ f f f f f f>; ; ;  -  -  -  \- ^    0#el # #Y^Ye # # # #0 u u u u u u u ur?   ry  c                   p     e Zd Zd
dee         f fdZ eddd          	 	 	 	 	 	 	 dd	            Z xZS )LongT5LayerSelfAttentionFNr   c                     t                                                       t          |||          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S )Nr   r   r   )r   r   r   SelfAttentionr   r   r   r   r   r   r   r   r   s       r=   r   z!LongT5LayerSelfAttention.__init__  sl    ,0KW`
 
 
 *&.f>WXXXz&"566r?   r'  r(  r)  r*  c	           
          |                      |          }	|                     |	|||||||          }
||                     |
d                   z   }|f|
dd          z   }|S )N)r   rA  rB  r(  rC  rD  r"  r   r   )r   r  r   )r   r   rd   rA  rB  r(  rC  rD  r"  normed_hidden_statesattention_outputrR  s               r=   r   z LongT5LayerSelfAttention.forward   s      $}==-- '++/) . 	
 	
 &5Ea5H(I(II "%5abb%99r?   rS  )NNNNFFN	r   r   r   r   r   r   r   r   r   r   s   @r=   r  r    s        7 7XVY] 7 7 7 7 7 7 _%0A6RRR    SR    r?   r  c                   L     e Zd ZdZddee         f fdZ	 	 	 	 d	defdZ xZ	S )
LongT5LayerLocalSelfAttentionz$Local self attention used in encoderFNr   c                     t                                                       t          ||          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S N)r   r   )r   r   rW  LocalSelfAttentionr   r   r   r   r   r   r   r   r   s       r=   r   z&LongT5LayerLocalSelfAttention.__init__  s`    "6v[v"w"w"w)&.f>WXXXz&"566r?   kwargsc                     |                      |          }|                     |||||          }||                     |d                   z   }|f|dd          z   }	|	S N)r   rA  rB  rD  r   r   )r   r  r   
r   r   rd   rA  rB  rD  r  r  r  rR  s
             r=   r   z%LongT5LayerLocalSelfAttention.forward%  sz      $}==22 '+/ 3 
 
 &5Ea5H(I(II "%5abb%99r?   rS  rw  
r   r   r   __doc__r   r   r   r   r   r   r   s   @r=   r  r    s        ..7 7XVY] 7 7 7 7 7 7          r?   r  c                   L     e Zd ZdZddee         f fdZ	 	 	 	 d	defdZ xZ	S )
'LongT5LayerTransientGlobalSelfAttentionz/Transient-Global self attention used in encoderFNr   c                     t                                                       t          ||          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S r  )r   r   ry  TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r   s       r=   r   z0LongT5LayerTransientGlobalSelfAttention.__init__>  si    ,J0K-
 -
 -
) *&.f>WXXXz&"566r?   r  c                     |                      |          }|                     |||||          }||                     |d                   z   }|f|dd          z   }	|	S r  )r   r  r   r  s
             r=   r   z/LongT5LayerTransientGlobalSelfAttention.forwardF  sz      $}==<< '+/ = 
 
 &5Ea5H(I(II "%5abb%99r?   rS  rw  r  r   s   @r=   r  r  ;  s        997 7XVY] 7 7 7 7 7 7          r?   r  c                   r     e Zd Zd
dee         f fdZ eddd          	 	 	 	 	 	 	 	 dd	            Z xZS )LongT5LayerCrossAttentionNr   c                     t                                                       t          |d|          | _        t	          |j        |j                  | _        t          j	        |j
                  | _        d S )NFr  r   )r   r   r   EncDecAttentionr   r   r   r   r   r   r   r   )r   r   r   r   s      r=   r   z"LongT5LayerCrossAttention.__init__^  sc    .vSXdmnnn)&.f>WXXXz&"566r?   r'  r(  r)  r*  Fc                     |                      |          }|                     |||||||||	|

  
        }||                     |d                   z   }|f|dd          z   }|S )N)	r   r@  rA  rB  r(  rC  r   rD  r"  r   r   )r   r  r   )r   r   r@  rd   rA  rB  r(  rC  r   rD  r"  r  r  layer_outputrR  s                  r=   r   z!LongT5LayerCrossAttention.forwardd  s      $}==// -'++%/) 0 
 
 %t||4DQ4G'H'HH/$4QRR$88r?   r   )NNNNFNFNr  r   s   @r=   r  r  ]  s        7 7(3- 7 7 7 7 7 7 _%0A6RRR
    SR    r?   r  c                   z     e Zd Zddee         f fdZ eddd          	 	 	 	 	 	 	 	 	 	 	 	 dd
            Z xZS )LongT5BlockFNr   c                 $   t                                                       |j        | _        |j        rt          }n>|j        dk    rt
          }n+|j        dk    rt          }nt          d|j         d          t          j	                    | _
        | j
                             ||||                     | j        r)| j
                            t          ||                     | j
                            t          |                     d S )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .r  )r   )r   r   r   r  encoder_attention_typer  r  
ValueErrorr   
ModuleListlayerrO   r  r   )r   r   r   r   attention_layerr   s        r=   r   zLongT5Block.__init__  s     + 
	6OO*g55;OO*.@@@EOO<!8< < <   ]__

OF@[gpqqq	
 	
 	
 ? 	VJ7)TTTUUU
-//00000r?   r'  r(  r)  r*  Tc                     | j         d         |||||	|
||          }|d         }|dd          }|j        t          j        k    r_t          j        |                                          r9t          j        |j                  j        dz
  }t          j        || |          }| j	        o|d u}|r | j         d         ||||||	|d         dz   |
||
  
        }|d         }|j        t          j        k    r_t          j        |                                          r9t          j        |j                  j        dz
  }t          j        || |          }||dd          z   } | j         d         |          }|j        t          j        k    r_t          j        |                                          r9t          j        |j                  j        dz
  }t          j        || |          }|f|z   S )Nr   )rd   rA  rB  r(  rC  rD  r"  r   i  )r  r   r,   )	r@  rd   rA  rB  r(  r   rC  rD  r"  )
r  r*   r6   r   isinfanyfinfor   clampr   )r   r   rd   rA  encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasrB  cross_attn_layer_head_maskr(  rC  rD  return_dictr"  self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputss                      r=   r   zLongT5Block.forward  s    " "/A)'++/)	"
 	"
 	"
 /q121226 %-//EK4N4N4R4R4T4T/+m&9::>EK!KK<[YYYM!_R1Fd1R 	P&3djm!65; : /+B/!3#"3-' ' '# 4A6M "em33M8R8R8V8V8X8X3#k-*=>>BTI %M|Q\ ] ] ] !24KABB4O O '
2}55 %-//EK4N4N4R4R4T4T/+m&9::>EK!KK<[YYYM 00	
r?   rS  )NNNNNNNNFFTNr  r   s   @r=   r  r    s        1 1XVY] 1 1 1 1 1 1. _%0A6RRR "#&*#'D
 D
 D
 SRD
 D
 D
 D
 D
r?   r  c                   t     e Zd ZU eed<   dZdZdgZdZe	d             Z
d Ze fd            Zd	 Zd
 Z xZS )LongT5PreTrainedModelr   transformerTr  Fc                 v    t          j        t                    }t          j        t                    }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r6   r   r   r   )r   r  
input_maskdummy_inputss       r=   r  z"LongT5PreTrainedModel.dummy_inputs  s?     L..	\*--
!*"&0
 

 r?   c                     | }|                     d          }|                    d          D ]%}t          ||          s d S t          ||          }&|                     || j                   d S )Nz.weightr  )removesuffixsplithasattrgetattr_tie_or_clone_weightsshared)r   keymodulesub_keys       r=   _try_load_missing_tied_modulez3LongT5PreTrainedModel._try_load_missing_tied_module  s{    y))yy~~ 	. 	.G67++ VW--FF""64;77777r?   c                 d   |                     dd          }d|d<    t                      j        |i |\  }}|                     dg           }t          |d          rKt          |d          r;|D ]8}t                              d| d| d	           |                    |           9|r||fS |S )
Noutput_loading_infoFTmissing_keysr  _tied_weights_keysz!Recovering a missing tied weight z2 from a legacy LongT5 checkpoint. Consider saving zF in your checkpoint or updating the config (tie_word_embeddings=true).)r4  r   from_pretrainedr  r   warningr  )	r   argsr  requested_loading_infomodelloading_infor  missing_keyr   s	           r=   r  z%LongT5PreTrainedModel.from_pretrained  s    !',A5!I!I(,$%5egg5tFvFF|#'';;5(## 	A7K(L(L 	A+ A A{ { {'2{ { {   33K@@@@! 	',&&r?   c                 	   | j         j        }t          |t                    r$|j        j                            |dz             dS t          |t          t          t          f          rt|j
        j        j                            d|dz             t          |d          r7| j         j        s-|j        j        j                            d|dz             dS dS dS t          |t                    r|j        j        j                            d|| j         j        dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j                            d|| j         j        dz  z             t          |j        d          r1|j        j        '|j        j        j                                         dS dS dS t          |t,                    rt|j        j        j                            d|| j         j        dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j                            d|| j         j        dz  z             t          |j        d          r/|j        j        #|j        j        j                                         |j        j        j                            d|| j         j        dz  z             t          |j        d          r1|j        j        '|j        j        j                                         dS dS dS t          |t2          t4          t6          f          rP| j         j        }| j         j        }| j         j        }|j        j        j                            d|||z  dz  z             |j        j        j                            d||dz  z             |j         j        j                            d||dz  z             |j!        j        j                            d|||z  dz  z             |j"        rq|j#        j        j                            d||dz  z             t          |t6                    r2|j$        j        j                            d||dz  z             dS dS dS dS )zInitialize the weightsrz   ry   )r   stdlm_head      r   N)%r   initializer_factorr   r   r   datafill_LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelr  normal_r  tie_word_embeddingsr  r   r   r   r   zero_r   r   r   r   r   r   rW  ry  r   r   r   r   r   r   r   r   r{  )r   r  factorr   r   r   s         r=   _init_weightsz#LongT5PreTrainedModel._init_weights  s   /fo.. +	M$$Vc\22222.LN` abb )	 M %--3FSL-IIIvy)) O$+2Q O%*22#2NNNNNO O O O 344 #	 I!))s4;CV[_B_8`)aaavy&)) ,fin.H	#))+++I!))s4;CSX\B\8])^^^vy&)) ,fin.H	#))+++++, ,.H.H 899 	K#++&T[EX]aDa:b+cccv{F++ .0@0L %++---K#++&T[EX]aDa:b+cccv{F++ .0@0L %++---I!))s4;CSX\B\8])^^^vy&)) ,fin.H	#))+++++, ,.H.H2FHf ghh 	 k)G!%!1k+GHO ((cv'L^B^cgAg7h(iiiHO ((cv$7O(PPPHO ((cv$7O(PPPHO ((cv'L^B^cgAg7h(iii1 .5:BBQW\chl[lQmBnnnf&DEE 9@EMM fT0A&B N     	 	  r?   c                    | j         j        }| j         j        }|t          d          t	          |          rHt          j        |j        d d         dz   |          }t          j        ||dd df         gd          }nD|	                    |j                  }|dd df         
                                |ddd f<   ||d<   |t          d          |                    |d	k    |           |S )
Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information.r,   )r   .rK   r   ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idr  r   r6   fullr3   rP   	new_zerosclonemasked_fill_)r   r  r  r  shifted_input_idss        r=   _shift_rightz"LongT5PreTrainedModel._shift_rightG  s   !%!C{/!)8   Y'' 	? %
9?3B3+?$+FH^ _ _ %	+<iSbS>Q*RXZ [ [ [ ) 3 3IO D D)238)<)B)B)D)Dc122g&(>f%PQQQ&&'8D'@,OOO  r?   )r   r   r   r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_can_compile_fullgraphpropertyr  r  classmethodr  r  r  r   r   s   @r=   r  r    s         %&*#&"  X8 8 8     [$. . .b! ! ! ! ! ! !r?   r  c                        e Zd Zd fd	Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ	 ddeej        df         dej        d	ej        d
e	de
f
dZedej        dededej        d	ej        defd            Z xZS )LongT5StackNc                 (   t                                                     t          j        j        j                  | _        ||j        | j        _        j        | _        j	        | _	        | j	        dz   | _
        t          j        fdt          j                  D                       | _        t          j        j                  | _        t          j        j                  | _        d| _        |                                  d S )Nr   c           	      V    g | ]%}t          t          |d k              |          &S )r   r  )r  r<  ).0rR   r   s     r=   
<listcomp>z(LongT5Stack.__init__.<locals>.<listcomp>p  sC        FQ!VXYZZZ  r?   r   F)r   r   r   r   
vocab_sizer   embed_tokensr   r   rY  r$   r  rL   
num_layersblockr   r   final_layer_normr   r   r   r   	post_init)r   r   r  r   s    ` r=   r   zLongT5Stack.__init__d  s      L):FNKK#'3':D$ +"/*Q.]   v011  
 

 !0FD] ^ ^ ^z&"566&+# 	r?   c                     || _         d S r   )r  r   new_embeddingss     r=   set_input_embeddingsz LongT5Stack.set_input_embeddings~  s    *r?   c                 l   |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }|#|!| j        rdnd}t          d| d| d          |1|                                }|                    d|d                   }n@||                                d d         }n!| j        rdnd}t          d| d| d	          | j	        r%| j
        r|	rt                              d
           d}	|&| j        
J d            |                     |          }|\  }}| j        r]|	rZ|X| j         j        r7t          t!          | j                   t!          | j                             }nt!          | j                   }n	| j        sd }||                                nd}|t%          j        |||z   |j                  }|/t+                      s!||z   }t%          j        |||j                  }| j        r6|                     |||t1          |t                    r|j        n||
          }n.| j         j        dk    rt7          || j        |j                  }n|}| j        rQ|O|                                \  }}}||f}|t%          j        ||j                  }|                     |          }nd }|                     || j         j                  }|                     || j         j                  }|rdnd }|
rdnd }|
r	| j        rdnd }d }d }|                      |          }tC          | j"                  D ]{\  } }!||          }"||          }#|r||fz   } |!|||||||"|#||	|
||          }$|$d         }|$d         }| j        r||$|
rdnd         }|
r||$d         fz   }| j        r||$d         fz   }|| #                    |          }|                      |          }|r||fz   }|stI          d |||||fD                       S tK          |||||          S )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer,   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddings)r   r   rw   r  r-   )rB  r  r(  rC  rD  r  r"  r   r   rf      c              3      K   | ]}||V  	d S r   r-   )r  r   s     r=   	<genexpr>z&LongT5Stack.forward.<locals>.<genexpr>!  s4       
 
 =  !===
 
r?   )last_hidden_stater(  r   
attentionscross_attentions)&r   rC  rD  output_hidden_statesuse_return_dictr   r  sizer2  r   r1  r   r   r  is_encoder_decoderr   r   get_seq_lengthr6   rV   rB   r   r   _update_causal_maskr   r6  r  rk   r$   invert_attention_maskget_head_maskr  r   	enumerater  r  rN   r   )%r   r  rd   r  r  r  	head_maskcross_attn_head_maskr(  rC  rD  r!  r  r"  err_msg_prefixinput_shaper   rE  past_key_values_lengthmask_seq_lengthrN  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsrA  r  r   rR   layer_modulerB  r  layer_outputss%                                        r=   r   zLongT5Stack.forward  s     "+!6IIDK<Q	1B1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>+/?BZZNw>wwwww   "#..**K!r;r?;;II&',,..ss3KK+/?BZZNu>uuXfuuuvvv& 	"4= 	" "##p   "	 $002p000 --i88M!,
J? 	# G_4;1 G&9$DK888,dk:Z:Z:Z' 'OO '3$+&F&F&FO 	# #OETE`!?!?!A!A!Afg!"\&(>(KTaTh  N !*B*D*D!4zAO"Z
OML`aaaN? 	)22o/BCC%44$! KK [/7::3NDNTaThiiKK(K ? 	34@=R=W=W=Y=Y: 7$68O#P %-).4HQ^Qe)f)f)f&.2.H.HI_.`.`++.2+ &&y$+2HII	#112FH^__"6@BBD0:d&7VDOVrrRV(,%]33(44 &	V &	VOA|'lO)=a)@&# I$58H$H!(L%/- /+E /#"3'-  M& *!,M
 *!,M ]#8#D0=CT>[aaZ[0\-  V!/=3C2E!E? V+?=QRCSBU+U(--m<<]33   	E 1]4D D 	 
 
 "#%"(
 
 
 
 
 
 9+++%1
 
 
 	
r?   Frd   r!   input_tensorr"  r(  rD  c           	      $   | j         j        dk    r||dk                                    r|S d S | j         j        dk    r+t          |t          j                  rt          |          }|S ||                                nd}||j        nd}| j         j        dk    r#|s!|st          j
        |||| j                  rd S |j        }|j        d         }	|r|                                }
n/t          |t          j                  r|j        d	         n||	z   dz   }
|                     ||	|
|||j        d         
          }| j         j        dk    r@|>|j        j        dv r0|s.t	          j        |          j        }t          j        ||          }|S )Nflash_attention_2ry   flex_attentionr   Fsdpa)r  r.  is_trainingr   r,   )sequence_lengthtarget_lengthr*   r"  r   )cudaxpunpu)r   _attn_implementationr  r   r6   r{   r"   r%  is_compileabler   _ignore_causal_mask_sdpar1  r*   r3   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrB   rp   r  r  _unmask_unattended)r   rd   r:  r"  r(  rD  past_seen_tokensusing_compilable_cacher*   r@  rA  rN  	min_dtypes                r=   r&  zLongT5Stack._update_causal_mask5  s    ;+/BBB)~/D.I.I.K.K)%%4;+/???.%,77 M!<^!L!L!!
 @O?Z?99;;;`aCRC^!?!?di ;+v55>T5]n5%>*'7 M	    t"&,Q/! 	+??AAMM nel;;<$R((%7!;  PP+')#)!, Q 
 
 K,66*%*.DDD% E E**.I0CKQZ[[Kr?   r@  rA  r*   r   c                    | |                                  dk    r| }nMt          j        |          j        }t          j        ||f|||j                  }|dk    rt          j        |d          }|t          j        ||j                  |                    dd          k    z  }|ddddddf         	                    |ddd          }| |
                                }| j        d         }	|ddddddd|	f         | ddddddf                             |j                  z   }
|
dk    }
|ddddddd|	f                             |
|          |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  )
fill_valuer*   rB   r   )diagonalrw   r,   r   )r%   r6   r  r  r  rB   triurV   rD   expandr  r3   r`   masked_fill)rd   r@  rA  r*   r"  r   r  rN  rM  mask_lengthpadding_masks              r=   rI  zALongT5Stack._prepare_4d_causal_attention_mask_with_cache_positiony  s   > %.*<*<*>*>!*C*C(KKE**.I* -0Ye\j\q  K !###jqAAA5<n>STTTWeWmWmnprsWtWtttK%dD!!!QQQ&67>>z1bRTUUK))//11,226*111aaaL[L+@ANSTSTSTVZ\`bcbcbcScDdDgDg&E E    ,q05@AAAqqq,;,AV5W5c5c )6 6AAAqqq!!!\k\12 r?   r   )NNNNNNNNNNNNNrv  )r   r   r   r   r  r   r   r6   r{   r
   r<  r&  rU  r   r*   rI  r   r   s   @r=   r  r  c  sU            4+ + +
 "#!!q
 q
 q
 q
t #(B BelK78B lB 	B
 B  B B B BH 444 4 {	4
 4 4 4 4 \4 4 4 4 4r?   r  a_  
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
c            &       :    e Zd ZdgZddgZdef fdZd Zd Zd Z	d	 Z
d
 Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   dee         deej                 deej                 dee         dee         dee         dee         deej                 deeej                 ef         f"d            Z xZS )r  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                    t                                          |           t          j        |j        |j                  | _        t          j        |          }d|_	        d|_
        d|_        t          || j                  | _        t          j        |          }d|_	        d|_        |j        |_        t          || j                  | _        |                                  d S )NFT)r   r   r   r   r  r   r  copydeepcopyr   rC  tie_encoder_decoderr  encodernum_decoder_layersr  decoderr  r   r   encoder_configdecoder_configr   s       r=   r   zLongT5Model.__init__  s       l6#4fnEEv..$)!#( -2*">4;??v..$(!-2*$*$=!">4;?? 	r?   c                     | j         S r   r  r   s    r=   get_input_embeddingsz LongT5Model.get_input_embeddings  
    {r?   c                 |    || _         | j                            |           | j                            |           d S r   r  r^  r  r`  r  s     r=   r  z LongT5Model.set_input_embeddings  ;    $)).999)).99999r?   c                     | j         j        rL|                     | j        j        | j                   |                     | j        j        | j                   d S d S r   r   r  r  r^  r  r  r`  rf  s    r=   _tie_weightszLongT5Model._tie_weights  \    ;* 	O&&t|'@$+NNN&&t|'@$+NNNNN	O 	Or?   c                     | j         S r   r^  rf  s    r=   get_encoderzLongT5Model.get_encoder  
    |r?   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        Nitemsr^  r  	attentionr  r   heads_to_pruner  r  s       r=   _prune_headszLongT5Model._prune_heads  U    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr?   Nr  rd   r  r  r*  decoder_head_maskr+  encoder_outputsr(  r  decoder_inputs_embedsrC  rD  r!  r  r"  r'   c                    ||n| j         j        }||n| j         j        }|=|;| j         j        | j         j        k    r!t          j        t          t                     |}|| 	                    |||
||||          }ne|rct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|d         }|                     ||||	|||||||||          }|s||z   S t          |j        |j        |j        |j        |j        |j        |j        |j                  S )	ax  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

        >>> # Let's try a very long encoder input.
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1

        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  rd   r  r*  rD  r!  r  r   r   rf   r  r   r  r  rd   r  r(  r  r  r*  r+  rC  rD  r!  r  r"  )r  r(  decoder_hidden_statesdecoder_attentionsr   encoder_last_hidden_stater  encoder_attentions)r   rC  r"  r  r_  warningswarn#_LongT5Model__HEAD_MASK_WARNING_MSGFutureWarningr^  r   r   r   r`  r   r  r(  r   r  r   )r   r  rd   r  r  r*  r}  r+  r~  r(  r  r  rC  rD  r!  r  r"  r   decoder_outputss                      r=   r   zLongT5Model.forward  s   b "+!6IIDK<Q	%0%<kk$+B]  %6%>{%)GGG5}EEE$-! ""ll#-+#"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O (* ,,'1/+"/#1'!5/!5#) ' 
 
   	5"_44!-?+;"1"?.9,=&5&G"1"?.9	
 	
 	
 		
r?   )NNNNNNNNNNNNNNNN)r   r   r   "_keys_to_ignore_on_load_unexpectedr  r    r   rg  r  rn  rr  r{  r   r   r6   
LongTensorFloatTensor
BoolTensorr{   rN   r
   r<  r   r   r   r   r   s   @r=   r  r    sP        	R*& 89VW|      &  : : :
O O O
  C C C  156:8<=A159=7;EI+/048<$(,0/3&*59#J
 J
E,-J
 !!23J
 $E$45	J

 !))9 :J
 E-.J
 $E$56J
 'u|4J
 "%e.?(@"ABJ
 "%J
  -J
  (5J
 D>J
 $D>J
 'tnJ
  d^!J
" !!12#J
$ 
uU&');;	<%J
 J
 J
 ^J
 J
 J
 J
 J
r?   r  z>
    LONGT5 Model with a `language modeling` head on top.
    )custom_introc            (       f    e Zd ZdgZg dZdef fdZd Zd Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   dee         deej                 deej                 deej                 dee         dee         dee         dee         deej                 deeej                 ef         f$d            Zdej        fdZ xZS )r  rW  )rX  rY  zlm_head.weightr   c                 4   t                                          |           |j        | _        t	          j        |j        |j                  | _        t          j	        |          }d|_
        d|_        d|_        t          || j                  | _        t          j	        |          }d|_
        d|_        |j        |_        t          || j                  | _        t	          j        |j        |j        d          | _        |                                  d S )NFTr   )r   r   r   	model_dimr   r   r  r  r[  r\  r   rC  r]  r  r^  r_  r  r`  r   r  r  ra  s       r=   r   z'LongT5ForConditionalGeneration.__init__  s       l6#4fnEEv..$)!#( -2*">4;??v..$(!-2*$*$=!">4;??y1BOOO 	r?   c                     | j         S r   re  rf  s    r=   rg  z3LongT5ForConditionalGeneration.get_input_embeddings  rh  r?   c                 |    || _         | j                            |           | j                            |           d S r   rj  r  s     r=   r  z3LongT5ForConditionalGeneration.set_input_embeddings  rk  r?   c                     | j         j        rL|                     | j        j        | j                   |                     | j        j        | j                   d S d S r   rm  rf  s    r=   rn  z+LongT5ForConditionalGeneration._tie_weights  ro  r?   c                     | j         S r   rq  rf  s    r=   rr  z*LongT5ForConditionalGeneration.get_encoder  rs  r?   Nr  rd   r  r  r*  r}  r+  r~  r(  r  r  labelsrC  rD  r!  r  r"  r'   c                 V   ||n| j         j        }||n| j         j        }|=|;| j         j        | j         j        k    r!t          j        t          t                     |}|| 	                    |||
||||          }ne|rct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|d         }||||                     |          }|                     ||||	|||||||||          }|d         }| j         j        r|| j        dz  z  }|                     |          }d}|pt%          d	
          }|                    |j                  } ||                    d|                    d                    |                    d                    }|s|f|dd         z   |z   }||f|z   n|S t/          |||j        |j        |j        |j        |j        |j        |j        	  	        S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
        >>> model = LongT5ForConditionalGeneration.from_pretrained(
        ...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
        ... )

        >>> # Let's try a very long input.
        >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
        >>> input_ids = inputs.input_ids

        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        abstractthe aim of this article is to provide an overview of the literature on the role of dog
        ```Nr  r   r   rf   r  r  r  r  )ignore_indexr,   )	losslogitsr(  r  r  r   r  r  r  )r   rC  r"  r  r_  r  r  6_LongT5ForConditionalGeneration__HEAD_MASK_WARNING_MSGr  r^  r   r   r   r  r`  r  r  r  r   r`   rB   r2  r#  r   r(  r   r  r   r  )r   r  rd   r  r  r*  r}  r+  r~  r(  r  r  r  rC  rD  r!  r  r"  r   r  sequence_output	lm_logitsr  loss_fctoutputs                            r=   r   z&LongT5ForConditionalGeneration.forward  s   j "+!6IIDK<Q	%0%<kk$+B]  %6%>{%)GGG5}EEE$-! ""ll#-+#"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O (*"3";@U@] $ 1 1& 9 9 ,,'1/+"/#1'!5/!5#) ' 
 
  *!,;* 	G .1EFOLL11	'T:::HYYy/00F8INN2y~~b/A/ABBFKKPROOTTD  	F\OABB$77/IF)-)9TGf$$vE+;"1"?.9,=&5&G"1"?.9

 

 

 
	
r?   c                 ,    |                      |          S r   )r  )r   r  s     r=   %prepare_decoder_input_ids_from_labelszDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labelsV  s      (((r?   )NNNNNNNNNNNNNNNNN)r   r   r   r  r  r    r   rg  r  rn  rr  r   r   r6   r  r  r  r{   rN   r
   r<  r   r   r   r  r   r   s   @r=   r  r  {  so        	R*& jii|      .  : : :
O O O
    156:8<=A159=7;@D+/59=A-1$(,0/3&*59%f
 f
E,-f
 !!23f
 $E$45	f

 !))9 :f
 E-.f
 $E$56f
 'u|4f
 "%el(;"<=f
 "%f
   12f
  ((9:f
 )*f
 D>f
 $D>f
  'tn!f
" d^#f
$ !!12%f
& 
uU&'8	9'f
 f
 f
 ^f
P)EL ) ) ) ) ) ) ) )r?   r  c                   8    e Zd ZdgZdgZdef fdZd Zd Zd Z	d Z
d	 Ze	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 dee         dee         dee         deeej                 ef         fd            Z xZS )r  rX  r`  r   c                 2   t                                          |           t          j        |j        |j                  | _        t          j        |          }d|_	        d|_
        t          || j                  | _        |                                  d S )NF)r   r   r   r   r  r   r  r[  r\  rC  r]  r  r^  r  )r   r   rb  r   s      r=   r   zLongT5EncoderModel.__init___  s}       l6#4fnEEv..#( -2*">4;?? 	r?   c                     | j         S r   re  rf  s    r=   rg  z'LongT5EncoderModel.get_input_embeddingsk  rh  r?   c                 H    || _         | j                            |           d S r   )r  r^  r  r  s     r=   r  z'LongT5EncoderModel.set_input_embeddingsn  s%    $)).99999r?   c                 l    | j         j        r'|                     | j        j        | j                   d S d S r   )r   r  r  r^  r  r  rf  s    r=   rn  zLongT5EncoderModel._tie_weightsr  s?    ;* 	O&&t|'@$+NNNNN	O 	Or?   c                     | j         S r   rq  rf  s    r=   rr  zLongT5EncoderModel.get_encoderv  rs  r?   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS ru  rv  ry  s       r=   r{  zLongT5EncoderModel._prune_headsy  r|  r?   Nr  rd   r*  r  rD  r!  r  r'   c           	      ^    ||n| j         j        }|                     |||||||          }|S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  )r   r"  r^  )	r   r  rd   r*  r  rD  r!  r  r~  s	            r=   r   zLongT5EncoderModel.forward  sM    F &1%<kk$+B],,)'/!5# ' 
 
 r?   )NNNNNNN)r   r   r   r  r  r    r   rg  r  rn  rr  r{  r   r   r6   r  r  r<  r   rN   r   r   r   r   s   @r=   r  r  Z  sn       78*4&
| 
 
 
 
 
 
  : : :O O O  C C C  156:1559,0/3&*. .E,-. !!23. E-.	.
   12. $D>. 'tn. d^. 
uU&'8	9. . . ^. . . . .r?   r  )r  r  r  r  )r   )\r  r[  r  r  typingr   r   r   r6   r   torch.nnr   activationsr	   cache_utilsr
   r   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   r   utils.deprecationr   configuration_longt5r    !torch.nn.attention.flex_attentionr!   integrations.flex_attentionr"   
get_loggerr   r   r{   r   r>   rG   rT   r\   rc   rB   rk   rN   r   r   r   Moduler   apex.normalizationr   infoImportError	Exceptionr  r   r   r   r   rW  ry  r  r  r  r  r  r  r  __HEAD_MASK_WARNING_MSGr  r  r  __all__r-   r?   r=   <module>r     s*        ' ' ' ' ' ' ' ' ' '        % % % % % % ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) > > > > > > 9 9 9 9 9 9            . - - - - - Q Q Q Q Q Q Q Q                  1 0 0 0 0 0 . . . . . .  !! K;;;;;;JJJJJJ 
	H	%	%   3 3 W\Wc     #%, #3 #S #U\ # # # #4 4U\ 4c 4 4Y\ 4ejeq 4 4 4 42!# !%, ! ! ! !BU\ Bc BV[Vb B B B B8el 8s 8TYT` 8ejeq 8 8 8 8 .PL.P58.P
5<%&.P .P .P .Pb4U\ 4VY 4^c^j 4 4 4 4	j<	j,1L	jJM	j
\	j 	j 	j 	j+ + + + +bi + + +2	//////"O
KKeffff 	 	 	D 	 	 	
NN[\\\D	    ")   ,    ry   &    BI   &f f f f fbi f f fR    29   DC C C C CRY C C CN" " " " "ry " " "J    BI   >    bi   D$ $ $ $ $	 $ $ $N]
 ]
 ]
 ]
 ]
, ]
 ]
 ]
@ {! {! {! {! {!O {! {! {!|L L L L L' L L L`
  |
 |
 |
 |
 |
' |
 |
 |
~   
W) W) W) W) W)%:O W) W) 
W)t U U U U U. U U Up k
j
js   F/ /G6GG