
     `iz                     D   d dl mZmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 dZ5 e$j6        e7          Z8 G d de+          Z9 G d de          Z: G d de0          Z; G d de.          Z< G d de1          Z= G d de-          Z> G d d e-          Z?d!eej@                 d"efd#ZAd$eBd"efd%ZC G d& d'e          ZD G d( d)eD          ZE G d* d+ejF                  ZG G d, d-ejF                  ZHe" G d. d/e/                      ZId0eejJ                 d1ej@        d2eeB         d"ej@        fd3ZK G d4 d5eI          ZL G d6 d7eL          ZMe" G d8 d9eI                      ZNe" G d: d;eI                      ZO G d< d=eIe          ZPe" G d> d?eI                      ZQe" G d@ dAeI                      ZRg dBZSdS )C    )AnyCallableOptionalUnionN   )CacheDynamicCacheEncoderDecoderCache)PretrainedConfig)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)OutputRecordercheck_model_inputs   )Gemma2Config)Gemma2Attention	Gemma2MLPGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingcreate_causal_mask!create_sliding_window_causal_maskeager_attention_forwardz google/t5gemma-2b-2b-prefixlm-itc                       e Zd ZdS )T5GemmaModuleConfigN__name__
__module____qualname__     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/t5gemma/modular_t5gemma.pyr*   r*   ?           Dr0   r*   c                   z    e Zd ZdZdZdgZi dddddddd	d
ddddd	dddddddd	dddddddd	dddddd	iZdgdgfddgdgfdgdgfdgdgfddgdgfdgdgfdZ	 	 	 	 	 	 	 	 d+d!ee	e
eeef         f                  d"ee	e
eeef         f                  d#ed$ed%ed&ed'ed(ef fd)Z fd*Z xZS ),T5GemmaConfiga  
    This is the configuration class to store the configuration of a [`T5GemmaModel`]. It is used to instantiate an T5Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to a hypothetical balanced Gemma2 encoder-decoder model.
    e.g. [google/t5gemma-2b-2b-prefixlm-it](https://huggingface.co/google/t5gemma-2b-2b-prefixlm-it)
    ```python
    >>> from transformers import T5GemmaConfig, T5GemmaModel
    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
    >>> model = T5GemmaModel(t5gemma_config)
    ```
    Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the
    documentation from [PretrainedConfig] for more information.
    Args:
        encoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
            Configuration for the encoder.
        decoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
            Configuration for the decoder.
        is_encoder_decoder (bool, optional, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The ratio for all dropout layers (following T5).
        classifier_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier (following T5).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether tie input and output embeddings.
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the T5Gemma model (the same as Gemma 2).
        kwargs (additional keyword arguments, optional, *optional*):
            Will be passed to the PretrainedConfig base class.
    t5gemmapast_key_valuesz!encoder.layers.*.self_attn.q_projcolwisez!encoder.layers.*.self_attn.k_projz!encoder.layers.*.self_attn.v_projz!encoder.layers.*.self_attn.o_projrowwisezencoder.layers.*.mlp.gate_projzencoder.layers.*.mlp.up_projzencoder.layers.*.mlp.down_projz!decoder.layers.*.self_attn.q_projz!decoder.layers.*.self_attn.k_projz!decoder.layers.*.self_attn.v_projz!decoder.layers.*.self_attn.o_projz"decoder.layers.*.cross_attn.q_projz"decoder.layers.*.cross_attn.k_projz"decoder.layers.*.cross_attn.v_projz"decoder.layers.*.cross_attn.o_projzdecoder.layers.*.mlp.gate_projzdecoder.layers.*.mlp.up_projzdecoder.layers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)zencoder.embed_tokenszencoder.layerszencoder.normzdecoder.embed_tokenszdecoder.layerszdecoder.normNT          encoderdecoderis_encoder_decoderdropout_rateclassifier_dropout_rateattention_dropouttie_word_embeddings
vocab_sizec	                    t          |t                    rt          di |}n@|t                      }n/t          |t                    sJ t          |           d            t          |t                    rt          di |}n4||}n/t          |t                    sJ t          |           d            t          di |                                }t          di |                                }d|_        ||_        ||_        || _        d|_        d|_	        ||_        ||_        |j
        |_        || _        dD ]}
|
|	vrt          ||
          |	|
<    t                      j        di |	 || _        |	                    d|j	                  | _	        |	                    d|j                  | _        || _        || _        || _        || _        || _        d S )Nz is not supported.FT)bos_token_idpad_token_ideos_token_id	use_cacheinitializer_ranger/   )
isinstancedictr*   typeto_dict
is_decoderrB   rD   r?   rK   hidden_sizecross_attention_hidden_sizer@   getattrsuper__init__rA   getrL   rC   rE   rF   )selfr?   r@   rA   rB   rC   rD   rE   rF   kwargsspecial_token_key	__class__s              r1   rV   zT5GemmaConfig.__init__   s    gt$$ 	b)44G44GG_)++GGg':;;aaW=a=a=aaa;gt$$ 	b)44G44GG_GGg':;;aaW=a=a=aaa;%::(9(9::%::(9(9::"+$5!! +$5!.5.A+!Q 	P 	P ..,3G=N,O,O()""6""""4K1BCC!',?AZ![![(!2'>$#6  %r0   c                     g d}||v r,t          | j        ||           t          | j        ||           t                                          ||           d S )N)output_hidden_statesoutput_attentions_attn_implementationrB   rD   rF   )setattrr?   r@   rU   __setattr__)rX   keyvalueshared_attr_with_submodulesr[   s       r1   ra   zT5GemmaConfig.__setattr__   sk    '
 '
 '
# ---DL#u---DL#u---C'''''r0   )NNTr=   r=   r=   Tr>   )r,   r-   r.   __doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr   r   r*   rN   r   boolfloatintrV   ra   __classcell__r[   s   @r1   r4   r4   C   s[        B J#4"5+Y 	,Y 	,Y	
 	,Y 	)) 	'	 	)) 	,Y 	,Y 	,Y 	,Y 	-i 	-i  	-i!" 	-i#$ 	))%& 	'	'( 	))) 0 #.0A B+-=>@QR)*_,=>"-0A B+-=>@QR)*_,=>	 	 IMHL#'!),#&$( 8% 8%% 3T#s(^ CDE8% % 3T#s(^ CDE8% !	8%
 8% "'8% !8% "8% 8% 8% 8% 8% 8% 8%t( ( ( ( ( ( ( ( (r0   r4   c                       e Zd ZdS )T5GemmaRMSNormNr+   r/   r0   r1   rp   rp      r2   r0   rp   c                   $     e Zd Z fdZd Z xZS )
T5GemmaMLPc                     t                                          |           t          j        |j                  | _        d S N)rU   rV   nnDropoutrB   dropoutrX   configr[   s     r1   rV   zT5GemmaMLP.__init__   s3       z&"566r0   c                     |                      |                     |                    |                     |          z  }|                     |          }|                     |          }|S rt   )act_fn	gate_projup_projrw   	down_proj)rX   xr;   r~   s       r1   forwardzT5GemmaMLP.forward   sV    DNN1$5$566aH]33NN=11	r0   )r,   r-   r.   rV   r   rm   rn   s   @r1   rr   rr      sG        7 7 7 7 7      r0   rr   c                         e Zd Zd fd	Z xZS )T5GemmaRotaryEmbeddingNc                 L    t                                          ||           d S rt   )rU   rV   )rX   ry   devicer[   s      r1   rV   zT5GemmaRotaryEmbedding.__init__   s#    (((((r0   rt   )r,   r-   r.   rV   rm   rn   s   @r1   r   r      s=        ) ) ) ) ) ) ) ) ) )r0   r   c                   (     e Zd Zdedef fdZ xZS )T5GemmaSelfAttentionry   	layer_idxc                 d    t                                          ||           |j        | _        d S rt   )rU   rV   rQ   	is_causalrX   ry   r   r[   s      r1   rV   zT5GemmaSelfAttention.__init__   s+    +++*r0   )r,   r-   r.   r*   rl   rV   rm   rn   s   @r1   r   r      sL        +2 +s + + + + + + + + + +r0   r   c                       e Zd Zdedef fdZ eddd          	 dd	ej        d
e	ej                 de	ej                 de	e
         dee         deej        e	ej                 e	eej                          f         fd            Z xZS )T5GemmaCrossAttentionry   r   c                 V   t                                          ||           | `d| _        |j        t          d          t          j        |j        |j        | j	        z  |j
                  | _        t          j        |j        |j        | j	        z  |j
                  | _        d S )NFzBCross-attention needs cross_attention_hidden_size to be specified.bias)rU   rV   sliding_windowr   rS   
ValueErrorru   Linearnum_key_value_headshead_dimattention_biask_projv_projr   s      r1   rV   zT5GemmaCrossAttention.__init__   s    +++-5abbbi.0JT]0Zagav
 
 
 i.0JT]0Zagav
 
 
r0   past_key_valuer6   4.58new_nameversionNr;   r<   encoder_hidden_statesrY   returnc                 @   |t          d          |j        d d         }g |d| j        R }|                     |                              |                              dd          }|&|j                            | j                  }	|j	        }
||	s|j        d d         }g |d| j        R }| 
                    |                              |                              dd          }|                     |                              |                              dd          }|.|
                    ||| j                  \  }}d|j        | j        <   n.|
j        | j                 j        }|
j        | j                 j        }t           }| j        j        dk    rt&          | j        j                 } || ||||f| j        r| j        nd| j        d | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )	Nz5Encoder hidden state is required for cross attention.   r   Teagerr=   )rw   scalingr   softcap)r   shaper   q_projview	transpose
is_updatedrW   r   cross_attention_cacher   r   updatelayerskeysvaluesr(   ry   r_   r   trainingrD   r   attn_logit_softcappingreshape
contiguouso_proj)rX   r;   r<   r   r6   rY   input_shapehidden_shapequery_statesr   curr_past_key_valueencoder_input_shapeencoder_hidden_shape
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                     r1   r   zT5GemmaCrossAttention.forward   s\    !(TUUU#)#2#.88b8$-88{{=1166|DDNNqRSTT&(377GGJ"1"G"*""7"=crc"B#L%8#L"#Ldm#L#L %:;;@@AUVV``abdeffJ;;'<==BBCWXXbbcdfghhL*+>+E+EjR^`d`n+o+o(
L=A*4>:,3DNCHJ.5dnELL(?;+w66"9$+:Z"[$7$7%
 /3mDD**L/%
 %
 %
 %
!\ *k);;;;;;FFHHkk+..L((r0   rt   )r,   r-   r.   r*   rl   rV   r   torchTensorr   r   r   r   tupler   rm   rn   s   @r1   r   r      s        
2 
s 
 
 
 
 
 
 _%0A6RRR ,03) 3)|3) !.3)  (5	3)
 "%3) -.3) 
u|Xel3XeEL>Q5RR	S3) 3) 3) SR3) 3) 3) 3) 3)r0   r   r<   r   c           
      Z     dt           dt           dt           dt           dt          f
 fd}|S )z4
    This creates bidirectional attention mask.
    	batch_idxhead_idxq_idxkv_idxr   c                      t          j        dt           j                  S | |f                             t           j                  S )Nr/   dtype)r   onesrj   to)r   r   r   r   r<   s       r1   
inner_maskz/bidirectional_mask_function.<locals>.inner_mask:  s@    !:b
3333i/033EJ???r0   rl   rj   )r<   r   s   ` r1   bidirectional_mask_functionr   5  sW    
@c @S @ @c @d @ @ @ @ @ @
 r0   r   c           
      Z     dt           dt           dt           dt           dt          f
 fd}|S )zH
    This creates bidirectional attention mask with sliding window.
    r   r   r   r   r   c                 *    |z
  |k     ||z   k     z  S rt   r/   )r   r   r   r   r   s       r1   r   z>sliding_window_bidirectional_mask_function.<locals>.inner_maskG  s"    &/FU^=S4STTr0   r   )r   r   s   ` r1   *sliding_window_bidirectional_mask_functionr   B  sW    
Uc US U Uc Ud U U U U U U r0   c                        e Zd ZdZdef fdZ	 	 ddej        deej        ej        f         de	ej                 de	ej
                 d	eej        f         f
d
Z xZS )T5GemmaEncoderLayerzEncoder sub-layer.r   c                 0   t                                                       |j        | _        || _        || _        |j        |         | _        t          ||          | _        t          |j        |j
                  | _        t          |j        |j
                  | _        t          |          | _        t          |j        |j
                  | _        t          |j        |j
                  | _        t#          j        |j                  | _        d S N)ry   r   eps)rU   rV   rR   ry   r   layer_typesattention_typer   	self_attnrp   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrr   mlppre_feedforward_layernormpost_feedforward_layernormru   rv   rB   rw   r   s      r1   rV   zT5GemmaEncoderLayer.__init__P  s    !-"$0;-
 
 
 (6f6HfNa'b'b'b$(6v7IvOb(c(c(c%f%%)78JPVPc)d)d)d&*89KQWQd*e*e*e'z&"566r0   Nr;   position_embeddingsr<   position_idsr   c           	      l   |}|                      |          } | j        d||||d d|\  }}|                     |          }||                     |          z   }|}|                     |          }|                     |          }|                     |          }||                     |          z   }|S )N)r;   r   r<   r   r6   r/   )r   r   r   rw   r   r   r   )rX   r;   r   r<   r   rY   residual_s           r1   r   zT5GemmaEncoderLayer.forwardd  s     !44]CC)4> 
' 3)% 
 
 
 
q 55mDD 4<<#>#>> 66}EE//77FF 4<<#>#>>r0   )NN)r,   r-   r.   re   rl   rV   r   r   r   r   
LongTensorFloatTensorr   rm   rn   s   @r1   r   r   M  s        7# 7 7 7 7 7 70 2637 | #5<#=> !.	
 u/0 
u !	"       r0   r   c                   N    e Zd ZdZdef fdZ eddd          	 	 	 	 	 	 	 dd
ej        de	ej        ej        f         de
ej                 de
ej                 de
e         de
e         de
ej                 de
ej                 de
ej                 dej        fd            Z xZS )T5GemmaDecoderLayerz2Decoder sub-layer: an extra cross-attention layer.r   c                     t                                          ||           t          ||          | _        t	          |j        |j                  | _        t	          |j        |j                  | _        d S r   )	rU   rV   r   
cross_attnrp   rR   r   pre_cross_attn_layernormpost_cross_attn_layernormr   s      r1   rV   zT5GemmaDecoderLayer.__init__  sn    +++/vSSS(6v7IvOb(c(c(c%)78JPVPc)d)d)d&&&r0   r   r6   r   r   NFr;   r   r<   r   rK   cache_positionr   encoder_attention_maskr   c
                 6   |}|                      |          } | j        d||||||j        nd ||d|
\  }}|                     |          }||                     |          z   }|}|                     |          } | j        d|||	||d|
\  }}|                     |          }||                     |          z   }|}|                     |          }| 	                    |          }| 
                    |          }||                     |          z   }|S )N)r;   r   r<   r   r6   rK   r   )r;   r   r<   r6   rK   r/   )r   r   self_attention_cacher   rw   r   r   r   r   r   r   )rX   r;   r   r<   r   r6   rK   r   r   r   rY   r   r   s                r1   r   zT5GemmaDecoderLayer.forward  sl    !44]CC)4> 	
' 3)%DSD_O@@ei)	
 	
 	
 	
q 55mDD 4<<#>#>> 55mDD*4? 
'"71+
 
 
 
q 66}EE 4<<#>#>> 66}EE//77FF 4<<#>#>>r0   )NNNFNNN)r,   r-   r.   re   rl   rV   r   r   r   r   r   r   r
   rj   r   r   rm   rn   s   @r1   r   r     sF       <<e# e e e e e e _%0A6RRR
 26379=$)598<9=. .|. #5<#=>. !.	.
 u/0. ""56. D>. !!12.  (5. !) 6. 
	. . . SR. . . . .r0   r   c                   V     e Zd ZdZd
dededef fdZdej        dej        fd	Z	 xZ
S )T5GemmaClassificationHeadz-Head for sentence-level classification tasks.r=   rR   
num_labelsrC   c                     t                                                       t          j        |          | _        t          j        ||          | _        d S )N)p)rU   rV   ru   rv   rw   r   out_proj)rX   rR   r   rC   r[   s       r1   rV   z"T5GemmaClassificationHead.__init__  sE    z$;<<<	+z::r0   r;   r   c                 Z    |                      |          }|                     |          }|S rt   )rw   r   )rX   r;   s     r1   r   z!T5GemmaClassificationHead.forward  s*    ]33m44r0   )r=   )r,   r-   r.   re   rl   rk   rV   r   r   r   rm   rn   s   @r1   r   r     s        77; ;C ;S ;SX ; ; ; ; ; ;
U\ el        r0   r   c                   V     e Zd ZdZd
dededef fdZdej        dej        fd	Z	 xZ
S )T5GemmaLMHeadz.Head for language modeling (generation) tasks.FrR   rF   r   c                     t                                                       t          j        |||          | _        d S )Nr   )rU   rV   ru   r   r   )rX   rR   rF   r   r[   s       r1   rV   zT5GemmaLMHead.__init__  s5    	+zEEEr0   r;   r   c                 0    |                      |          }|S rt   )r   )rX   r;   logitss      r1   r   zT5GemmaLMHead.forward  s    }--r0   )F)r,   r-   r.   re   rl   rj   rV   r   r   r   rm   rn   s   @r1   r   r     s        88F FC FS F F F F F F FU\ el        r0   r   c                   6    e Zd ZU eed<   dZdZddgZd Zd Z	dS )	T5GemmaPreTrainedModelry   modelTr   r   c                    t          j        | |           | j        j        }t	          |t
                    r|j        j        j        d         dz  }|j        j        j	        
                    d||z             t          |j        d          r1|j        j        '|j        j        j	                                         d S d S d S t	          |t                    rS| j        j        sE|j        j        j        d         dz  }|j        j        j	        
                    d||z             d S d S d|j        j        v r |j        j	                                         d S d S )Nr   g      r=   )meanstdr   RMSNorm)r   _init_weightsry   rL   rM   r   r   weightr   datanormal_hasattrr   zero_r   rE   r[   r,   )rX   moduler  scales       r1   r  z$T5GemmaPreTrainedModel._init_weights  sb   %dF333k+f788 	'O*03t;EO"'//ScEk/JJJv// 2FO4H4T$)//111112 24T4T.. 	';2 O.4Q74?&+33#+3NNNNNO O &*333M$$&&&&& 43r0   c                 J   | j         j        j        }| j         j        j        }|t	          d          |                    |j                  }|dddf                                         |dddf<   ||d<   |t	          d          |                    |dk    |           |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r   r   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	ry   r@   rH   rI   r   	new_zerosr   clonemasked_fill_)rX   r9   decoder_start_token_idrI   shifted_input_idss        r1   _shift_rightz#T5GemmaPreTrainedModel._shift_right  s     "&!4!A{*7!)YZZZ &//	@@%.sCRCx%8%>%>%@%@#qrr'"$:&!XYYY 	&&'8D'@,OOO  r0   N)
r,   r-   r.   r4   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr  r  r/   r0   r1   r   r     sX         &*#.0EF' ' '"! ! ! ! !r0   r   	token_idsr;   rI   c                     | ;|t          d          | |k                        |j        t          j                  }n>t          j        |j        d         |j        d         f|j        t          j                  }|S )z%Construct the default attention mask.Nz3`pad_token_id` is required for padding information.r   r   r   r   )r   r   r   r   longr   r   )r  r;   rI   r<   s       r1   make_default_2d_attention_maskr  	  s     RSSS#|3778LejYY #]%8%;<]EYafak
 
 
 r0   c                        e Zd ZeedZ fdZe	 	 	 	 ddee	j
                 dee	j                 dee	j
                 dee	j                 dee         d	efd
            Z xZS )T5GemmaEncoder)
attentionsr;   c                    t                                                     j        | _        j        | _        t          j        j        j        | j                  | _        t          j        j
                  | _        t                    | _        d| _        t          j        fdt!          j                  D                       | _        t          j        j                  | _        |                                  d S )Nr   ry   Fc                 0    g | ]}t          |          S r/   )r   .0r   ry   s     r1   
<listcomp>z+T5GemmaEncoder.__init__.<locals>.<listcomp>+  $    eee	 33eeer0   )rU   rV   rI   padding_idxrF   ru   	EmbeddingrR   embed_tokensrp   r   normr   
rotary_embgradient_checkpointing
ModuleListrangenum_hidden_layersr   rv   rB   rw   	post_initrx   s    `r1   rV   zT5GemmaEncoder.__init__   s       !. +L):F<NPTP`aa"6#56;NOOO	0???&+#meeeeU6KcEdEdeee
 
 z&"566 	r0   Nr9   r<   r   r:   rY   r   c           	         |d u |d uz  rt          d          |                    dd            ||                     |          }t          j        d|j        d         |j                  }||                    d          }|t          ||| j	        j
                  }t          |x}t                    sa| j	        |||d |d}t          di |dt          |          it          di |t!          | j	        j                  t          |          dd	}|}	|                     |	|          }
t          j        | j	        j        d
z  |	j                  }|	|z  }	|                     |	          }	| j        d | j	        j                 D ]} ||	|
||j                 |fi |}	|                     |	          }	|                     |	          }	t7          |	          S )N:You must specify exactly one of input_ids or inputs_embedsr6   r   r   r   ry   input_embedsr<   r   r6   r   or_mask_function)r6  and_mask_functionfull_attentionsliding_attention      ?r   )last_hidden_stater/   )r   popr)  r   aranger   r   	unsqueezer  ry   rI   rM   rN   r&   r   r'   r   r   r+  tensorrR   r   rw   r   r/  r   r*  r   )rX   r9   r<   r   r:   rY   r   self_attn_mask_mappingmask_kwargsr;   r   
normalizerlayer_modules                r1   r   zT5GemmaEncoder.forward2  sc    -t";< 	[YZZZ 	

$d+++  --i88Ma)<Q)?H\]]])33A66L!;I}VZVaVnooNNB0DII 	+ -"0"0#' , K #5 # #!# #%@%P%P# # # &G & &!&%OPTP[Pj%k%k&A.&Q&Q& & & &
& 
&" &"oom\JJ\$+"93">mFYZZZ
%
2]33 K(G$+*G(GH 	 	L(L#&|'BC	 
  MM 		-00]33+
 
 
 	
r0   NNNN)r,   r-   r.   r   r   _can_record_outputsrV   r   r   r   r   r   r   r   r   r   r   rm   rn   s   @r1   r  r    s        *, 
    $  15153759A
 A
E,-A
 !.A
 u/0	A

   12A
 +,A
 
A
 A
 A
 A
 A
 A
 A
 A
r0   r  c                   l    e Zd Z eed           eed          edZ fdZe		 	 	 	 	 	 	 	 	 dde
ej                 de
ej                 de
ej                 d	e
e         d
e
ej                 de
e         de
ej                 de
ej                 de
ej                 dee         defd            Z xZS )T5GemmaDecoderr   )index)r  cross_attentionsr;   c                     t                                                     t          j        fdt	          j                  D                       | _        |                                  d S )Nc                 0    g | ]}t          |          S r/   )r   r#  s     r1   r%  z+T5GemmaDecoder.__init__.<locals>.<listcomp>  r&  r0   )rU   rV   ru   r-  r.  r/  r   r0  rx   s    `r1   rV   zT5GemmaDecoder.__init__~  si       meeeeU6KcEdEdeee
 
 	r0   Nr9   r<   r   r6   r:   rK   r   r   r   rY   r   c
                    |d u |d uz  rt          d          |t          d          ||                     |          }| j        s:|r8|6t          t	          | j                  t	          | j                            }|B||                                nd}t          j        |||j	        d         z   |j
                  }||                    d          }||t          ||| j        j                  }t          |x}t                    s0| j        |||||j        nd |d}t#          di |t%          di |d}t          |	x}t                    s-| j        ||	|d d d}d	t#          di |d
t'          |	          ii}|}|                     ||          }t          j        | j        j        dz  |j                  }||z  }|                     |          }| j        d | j        j                 D ]$} |||||j                 ||||||d	         f	i |
}%|                     |          }|                     |          }t;          ||          S )Nr2  z0`encoder_hidden_states` must be given in decoderr!  r   r   r3  r4  r8  r9  r6  r;  r   )r<  r6   r/   )r   r)  r   r
   r	   ry   get_seq_lengthr   r>  r   r   r?  r  rI   rM   rN   r   r&   r'   r   r+  r@  rR   r   rw   r   r/  r   r*  r   )rX   r9   r<   r   r6   r:   rK   r   r   r   rY   past_seen_tokensrA  rB  cross_attn_mask_mappingr;   r   rC  rD  s                      r1   r   zT5GemmaDecoder.forward  s%    -t";< 	[YZZZ (OPPP  --i88M} 	v 	v/F1,dk2R2R2RT`hlhsTtTtTtuuO!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L!o&=;I}VZVaVnooNNB0DII 	+ -"0"0KZKf?#G#Glp , K #5"C"C{"C"C%F%U%U%U%U& &"
 5KK1TRR 	+ 5"8"0#' $ K !"4 # #!# #%@AW%X%X# # #'# &"oom\JJ\$+"93">mFYZZZ
%
2]33 K(G$+*G(GH 	 	L(L#&|'BC%'(89   MM 		-00]338++
 
 
 	
r0   )	NNNNNNNNN)r,   r-   r.   r   r   r   r   rF  rV   r   r   r   r   r   r
   r   rj   r   r   r   r   rm   rn   s   @r1   rH  rH  w  sv       $n%9CCC*N+@JJJ,       1515379=59$(598<9=Z
 Z
E,-Z
 !.Z
 u/0	Z

 ""56Z
   12Z
 D>Z
 !!12Z
  (5Z
 !) 6Z
 +,Z
 
3Z
 Z
 Z
 Z
 Z
 Z
 Z
 Z
r0   rH  c                       e Zd Zdef fdZd Zd Zd Zee		 	 	 	 	 	 	 	 	 	 	 	 dde
ej                 de
ej                 d	e
ej                 d
e
ej                 de
ej                 de
ej                 de
e         de
e         de
ej                 de
ej                 de
e         de
ej                 dee         defd                        Z xZS )T5GemmaModelry   c                    t                                          |           |j        st          d          t	          |j                  | _        t          |j                  | _        |                                  d S )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	rU   rV   rA   r   r  r?   rH  r@   r0  rx   s     r1   rV   zT5GemmaModel.__init__  sn       ( 	wuvvv%fn55%fn55r0   c                     | j         S rt   r?   rX   s    r1   get_encoderzT5GemmaModel.get_encoder  s
    |r0   c                 4    | j                                         S rt   r?   get_input_embeddingsrV  s    r1   rZ  z!T5GemmaModel.get_input_embeddings      |00222r0   c                 6    | j                             |          S rt   r?   set_input_embeddingsrX   new_embeddingss     r1   r^  z!T5GemmaModel.set_input_embeddings      |00@@@r0   Nr9   r<   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr6   r:   decoder_inputs_embedsrK   r   rY   r   c                     | | j         d||||	d|}|j        } | j        d||||
|||||d	|}t          |j        |j        |                    dd          r|j        n|j        f|j        |j        |j        |j        |j                  S )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        Nr9   r<   r   r:   )	r9   r<   r   r:   r6   r   r   rK   r   r]   F)r<  r6   decoder_hidden_statesdecoder_attentionsrJ  encoder_last_hidden_stater   encoder_attentionsr/   )	r?   r<  r@   r   r6   rW   r;   r  rJ  )rX   r9   r<   r   rb  rc  rd  re  r6   r:   rf  rK   r   rY   r   decoder_outputss                   r1   r   zT5GemmaModel.forward  s    . "*dl #-)+	 
  O !0 A&$, 
'1-/+"7#1)
 
 
 
 "-?+;zz0%88#6/"?"?!35.9,=&5&G"1"?.9
 
 
 	
r0   )NNNNNNNNNNNN)r,   r-   r.   r4   rV   rW  rZ  r^  r   r   r   r   r   r   
BoolTensorr   r
   r   rj   r   r   r   r   rm   rn   s   @r1   rR  rR    s       	} 	 	 	 	 	 	  3 3 3A A A  156:378<=A;?599=048<$(598
 8
E,-8
 !!238
 u/0	8

 $E$458
 !))9 :8
 'u'788
 "/28
 ""568
  -8
  (58
 D>8
 !!128
 +,8
 
8
 8
 8
 ^ 8
 8
 8
 8
 8
r0   rR  c                        e Zd Zdef fdZd Zd Zee	 	 	 	 dde	e
j                 de	e
j                 de	e
j                 d	e	e
j                 d
ee         defd                        Z xZS )T5GemmaEncoderModelry   c                     t                                          |           |j        rt          d          t	          |j                  | _        |                                  d S )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)rU   rV   rA   r   r  r?   r0  rx   s     r1   rV   zT5GemmaEncoderModel.__init__9  s]       $ 	rpqqq%fn55r0   c                 4    | j                                         S rt   rY  rV  s    r1   rZ  z(T5GemmaEncoderModel.get_input_embeddingsB  r[  r0   c                 6    | j                             |          S rt   r]  r_  s     r1   r^  z(T5GemmaEncoderModel.set_input_embeddingsE  ra  r0   Nr9   r<   r   r:   rY   r   c                 *     | j         d||||d|}|S )Nrh  r/   rU  )rX   r9   r<   r   r:   rY   re  s          r1   r   zT5GemmaEncoderModel.forwardH  s?     '$, 
)%'	
 

 
 
 r0   rE  )r,   r-   r.   r4   rV   rZ  r^  r   r   r   r   r   r   r   r   r   r   r   rm   rn   s   @r1   rp  rp  7  s       }      3 3 3A A A  156:3704 E,- !!23 u/0	
  - +, 
   ^     r0   rp  c            %       J    e Zd ZddgZddiZddgdgfiZdef fdZd	 Zd
 Z	d Z
d Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 d"deej                 deej                 deej                 deej                 deej                 deej                 dee         dee         deej                 deej                 deej                 dee         deej                 deeej        f         dee         deeej                 ef         f d                         Zdej        fd!Z xZ S )#T5GemmaForConditionalGenerationz!model.decoder.embed_tokens.weightzlm_head.out_proj.weightzlm_head.out_projcolwise_repr;   r   ry   c                     d|_         t                                          |           t          |          | _        |j        j        | _        t          |j        j        | j                  | _	        d| _
        |                                  d S )NTForMaskedLM)rA   rU   rV   rR  r   r@   rF   r   rR   lm_head	loss_typer0  rx   s     r1   rV   z(T5GemmaForConditionalGeneration.__init__a  ss    $(!   !&))
 .3$V^%?QQ&r0   c                     || j         _        d S rt   rz  r   r_  s     r1   set_output_embeddingsz5T5GemmaForConditionalGeneration.set_output_embeddingsl  s     .r0   c                     | j         j        S rt   r}  rV  s    r1   get_output_embeddingsz5T5GemmaForConditionalGeneration.get_output_embeddingso  s    |$$r0   c                     | j         j        rF|                     | j        j        |                                                                            d S d S rt   )ry   rE   _tie_or_clone_weightsrz  r   get_decoderrZ  rV  s    r1   _tie_weightsz,T5GemmaForConditionalGeneration._tie_weightsr  sU    ;* 	i&&t|'<d>N>N>P>P>e>e>g>ghhhhh	i 	ir0   c                     | j         j        S rt   )r   r?   rV  s    r1   rW  z+T5GemmaForConditionalGeneration.get_encoderw      z!!r0   c                     | j         j        S rt   )r   r@   rV  s    r1   r  z+T5GemmaForConditionalGeneration.get_decoderz  r  r0   Nr   r9   r<   r   rb  rc  rd  re  r6   r:   rf  labelsrK   r   logits_to_keeprY   r   c                 F   |||
|                      |          } | j        d|||||||||	|
||d|}|j        }t          |t                    rt          | d          n|}|                     |dd|ddf                   }|                                 j        }|j	        (||j	        z  }t          j        |          }||j	        z  }d}| | j        ||| j        fi |}t          |||j        |j        |j        |j        |j        |j        |j        	  	        S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r9   r<   r   rb  rc  rd  re  r6   r:   rf  rK   r   )	lossr   r6   ri  rj  rJ  rk  r   rl  r/   )r  r   r<  rM   rl   slicerz  r  ry   final_logit_softcappingr   tanhloss_functionrF   r   r6   ri  rj  rJ  rk  r   rl  )rX   r9   r<   r   rb  rc  rd  re  r6   r:   rf  r  rK   r   r  rY   rm  r;   slice_indicesr   decoder_configr  s                         r1   r   z'T5GemmaForConditionalGeneration.forward}  s   < "3";@U@] $ 1 1& 9 9.8dj /
)%/#9!5++'"7)/
 /
 /
 /
  (98B>SV8W8Wk~ot444]kmAAA}aaa,?@AA))++21=nDDFZ''FnDDF%4%ffdoPPPPD+;"1"G.A,=&5&O"1"G.A

 

 

 
	
r0   c                 ,    |                      |          S rt   )r  )rX   r  s     r1   %prepare_decoder_input_ids_from_labelszET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      (((r0   )NNNNNNNNNNNNNr   )!r,   r-   r.   _tied_weights_keys_tp_plan_pp_planr4   rV   r~  r  r  rW  r  r   r   r   r   r   r   rn  r   r
   rj   r   rl   r   r   r   r   r   r   r  rm   rn   s   @r1   rv  rv  \  sl       =?XY"M2H"o%6
$CDH	} 	 	 	 	 	 	/ / /% % %i i i
" " "" " "  156:378<=A;?599=59=A-1$(5934I
 I
E,-I
 !!23I
 u/0	I

 $E$45I
 !))9 :I
 'u'78I
 "/2I
 ""56I
   12I
  ((9:I
 )*I
 D>I
 !!12I
 c5</0I
  +,!I
" 
uU&'8	9#I
 I
 I
 ^ I
V)EL ) ) ) ) ) ) ) )r0   rv  c                       e Zd Zddedee         f fdZd Zd Ze	e
	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 d	eej                 d
eej                 deej                 deej                 dee         deej                 deej                 deej                 dee         defd                        Z xZS ) T5GemmaForSequenceClassificationNry   rA   c                    |||_         t                                          |           |j        | _        |j         rt	          |          | _        nt          |          | _        |j        j        }|j         r|j	        j        }t          |dd          }t          || j        |          | _        |                                  dS )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
        NrC   皙?rA   rU   rV   r   rR  r   rp  r?   rR   r@   rT   r   scorer0  rX   ry   rA   rR   classifier_dropoutr[   s        r1   rV   z)T5GemmaForSequenceClassification.__init__  s    
 )(:F%    +$ 	5%f--DJJ,V44DJn0$ 	5 .4K$V-FLL.{DOM_``
r0   c                 4    | j                                         S rt   r   rZ  rV  s    r1   rZ  z5T5GemmaForSequenceClassification.get_input_embeddings      z..000r0   c                 :    | j                             |           d S rt   r   r^  rX   rc   s     r1   r^  z5T5GemmaForSequenceClassification.set_input_embeddings      
''.....r0   r9   r<   r   rb  rc  rd  re  r:   rf  r  rY   r   c                    | j         j        r!||t          d| j        j         d          | j         j        r*|(|	&|t          d          |                     |          }| j         j        r. | j        |f||||||||	dd	|}|j        }|j	        }|j
        }n' | j        |f|||d|}|j        }|j        }|j        }|                     |          }||j        d         }n|j        d         }| j         j        |d	k    rt          d
          | j         j        d}n||| j         j        k                        |j        t$          j                  }t%          j        |j        d         |j        t$          j                  }||z                      d          }| j         j        r)|d	z  }t%          j        ||j        d         d	z
            }n)d}t.                              | j        j         d           |t%          j        ||j                  |f         }d}|
|                     ||
|| j                   }t5          ||||          S )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r<   r   rb  rc  rd  re  r:   rf  rK   r<   r   r:   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r  )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r3  )r   r  pooled_logitsry   r  r   r;   r  )ry   rA   NotImplementedErrorr[   r,   r   r  r   r<  ri  rj  r;   r  r  r   rI   r   r   r   int32r>  argmaxclamploggerwarning_oncer  r   )rX   r9   r<   r   rb  rc  rd  re  r:   rf  r  rY   outputsr<  r;   r  r   
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr  r  s                          r1   r   z(T5GemmaForSequenceClassification.forward  s   2 ;) 	y/@]E^%}4>Kb}}}  
 ;) 	=/@/HMbMj  U  
 !% 1 1) < <;) 	,*4$*+-)"3'=%9 /+&;+ + + +G !( 9#9M 3JJ'1tz(-)+	( (
 ( (G !( 9#1M +J-.. "+JJ&,Q/J;#+
a\]]];#+!#"%)AAEEfmUZU`aaL!L)<V]Z_ZefffM"/,">!F!Fr!J!J{- j"a'"%*[1CIZI`acIdghIh%i%i%i"!#>* Z Z Z  
 u|Jv}MMMOaab%%VFR_hlhs%ttD' '!	
 
 
 	
r0   rt   
NNNNNNNNNN)r,   r-   r.   r4   r   rj   rV   rZ  r^  r   r   r   r   r   r   r   r   r   r   r   rm   rn   s   @r1   r  r    s        } (4.      .1 1 1/ / /  1515378<9=;?5959=A-1i
 i
E,-i
 !.i
 u/0	i

 $E$45i
 !) 6i
 'u'78i
 "/2i
   12i
  ((9:i
 )*i
 +,i
 
"i
 i
 i
 ^ i
 i
 i
 i
 i
r0   r  c                       e Zd Zddedee         f fdZd Zd Ze	e
	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 d	eej                 d
eej                 deej                 deej                 dee         deej                 deej                 deej                 dee         defd                        Z xZS )T5GemmaForTokenClassificationNry   rA   c                    |||_         t                                          |           |j        | _        |j         rt	          |          | _        nt          |          | _        |j        j        }|j         r|j	        j        }t          |dd          }t          || j        |          | _        |                                  dS )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for token classification. When set to False, only encoder is used.
        NrC   r  r  r  s        r1   rV   z&T5GemmaForTokenClassification.__init__]  s    
 )(:F%    +$ 	5%f--DJJ,V44DJn0$ 	5 .4K$V-FLL.{DOM_``
r0   c                 4    | j                                         S rt   r  rV  s    r1   rZ  z2T5GemmaForTokenClassification.get_input_embeddingsu  r  r0   c                 :    | j                             |           d S rt   r  r  s     r1   r^  z2T5GemmaForTokenClassification.set_input_embeddingsx  r  r0   r9   r<   r   rb  rc  rd  re  r:   rf  r  rY   r   c                    | j         j        r!||t          d| j        j         d          | j         j        r*|(|	&|t          d          |                     |          }| j         j        r. | j        |f||||||||	dd	|}|j        }|j	        }|j
        }n' | j        |f|||d|}|j        }|j        }|j        }|                     |          }d}|
|                     ||
| j                   }t          ||||          S )	r  Nr  r  r  Fr  r  r  )ry   rA   r  r[   r,   r   r  r   r<  ri  rj  r;   r  r  r  r   )rX   r9   r<   r   rb  rc  rd  re  r:   rf  r  rY   r  r<  r;   r  r   r  s                     r1   r   z%T5GemmaForTokenClassification.forward{  s   4 ;) 	y/@]E^%}4>Kb}}}   ;) 	=/@/HMbMj  U  
 !% 1 1) < <;) 	,*4$*+-)"3'=%9 /+&;+ + + +G !( 9#9M 3JJ'1tz(-)+	( (
 ( (G !( 9#1M +J-..%%ffdkBBD$'!	
 
 
 	
r0   rt   r  )r,   r-   r.   r4   r   rj   rV   rZ  r^  r   r   r   r   r   r   r   r   r   r   r   rm   rn   s   @r1   r  r  [  s        } (4.      01 1 1/ / /  1515378<9=;?5959=A-1N
 N
E,-N
 !.N
 u/0	N

 $E$45N
 !) 6N
 'u'78N
 "/2N
   12N
  ((9:N
 )*N
 +,N
 
N
 N
 N
 ^ N
 N
 N
 N
 N
r0   r  )r4   r*   rv  rR  rp  r   r  r  )Ttypingr   r   r   r   r   torch.nnru   cache_utilsr   r	   r
   configuration_utilsr   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r   gemma2.configuration_gemma2r    gemma2.modeling_gemma2r!   r"   r#   r$   r%   r&   r'   r(   _CHECKPOINT_FOR_DOC
get_loggerr,   r  r*   r4   rp   rr   r   r   r   r   r   rl   r   r   r   Moduler   r   r   r   r  r  rH  rR  rp  rv  r  r  __all__r/   r0   r1   <module>r     s    2 1 1 1 1 1 1 1 1 1 1 1        C C C C C C C C C C 3 3 3 3 3 3 ) ) ) ) ) ) B B B B B B 9 9 9 9 9 9                G F F F F F F F & & & & & &            1 0 0 0 0 0 ? ? ? ? ? ? ? ? 6 6 6 6 6 6	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 9  
	H	%	%	 	 	 	 	, 	 	 	L( L( L( L( L($ L( L( L(^	 	 	 	 	] 	 	 		 	 	 	 	 	 	 	) ) ) ) )2 ) ) )
+ + + + +? + + +D) D) D) D) D)O D) D) D)N
0F 
8 
 
 
 
s x    1 1 1 1 14 1 1 1h8 8 8 8 8- 8 8 8v    	   	 	 	 	 	BI 	 	 	 /! /! /! /! /!2 /! /! /!d()< 3- \	   "Z
 Z
 Z
 Z
 Z
+ Z
 Z
 Z
zj
 j
 j
 j
 j
^ j
 j
 j
Z O
 O
 O
 O
 O
) O
 O
 O
d ! ! ! ! !0 ! ! !Ho) o) o) o) o)&<o o) o) o)d I
 I
 I
 I
 I
'= I
 I
 I
X o
 o
 o
 o
 o
$: o
 o
 o
d	 	 	r0   