
     `i_              	          d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 ddl
mZmZ ddlmZmZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z: ddl;m<Z<  e'j=        e>          Z? G d de+e          Z@ G d de          ZA G d de:          ZB G d de7          ZC G d de	jD                  ZE G d d e/          ZF G d! d"e2          ZG G d# d$e3          ZH G d% d&e-          ZI G d' d(e          ZJdZK G d) d*e1          ZLd+eMd,eeMeMeMeMgeNf         fd-ZO G d. d/e0          ZP G d0 d1e.          ZQ G d2 d3e	jR                  ZSd4eejT                 d5eejT                 d6eMd,ee         fd7ZU G d8 d9e9          ZV G d: d;e8          ZW G d< d=eL          ZX G d> d?eeL          ZYg d@ZZdS )A    N)Callable)AnyOptionalUnion   )CacheDynamicCache)PretrainedConfiglayer_type_validation)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPast SequenceClassifierOutputWithPast)rope_config_validation)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaligemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPast)SiglipVisionConfigc                   T    e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZdS )Gemma3TextConfiga,!  
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In Gemma3Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.
        use_bidirectional_attention (`bool`, *optional*, defaults to `False`): If True, the model will attend to all
            text tokens instead of using a causal mask. This does not change behavior for vision tokens.

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    gemma3_text@   	   $              gelu_pytorch_tanh   {Gz?ư>Tr      r       .AF           N     @c                 ~    t          j        d||||d| | _        |	 _        | _        | _        | _        | _        | _        | _	        |
 _
        | _        | _        | _        | _        | _        | _        | _        | _        | _        | _        | _        | _        |r j        dz  dz    _        | _        | _        t3                      |                    dd           _         j        % fdt9           j                  D              _        t;           j         j                   d S )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingsr   r;   sliding_window_pattern   c                 L    g | ] }t          |d z   j        z            rdnd!S )r;   sliding_attentionfull_attention)bool_sliding_window_pattern).0iselfs     }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/gemma3/modular_gemma3.py
<listcomp>z-Gemma3TextConfig.__init__.<locals>.<listcomp>   sI           (,QUd6R,R'S'Si##Yi          )r
   __init__
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshead_dimnum_key_value_headsinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_biasattention_dropouthidden_activationquery_pre_attn_scalarsliding_windowfinal_logit_softcappingattn_logit_softcappinglayer_typesuse_bidirectional_attentionrope_local_base_freqrope_scalingr   getrK   ranger   )rN   rT   rV   rW   rX   rY   r[   rZ   rb   rU   r\   r]   r^   rA   rC   rB   rD   r_   r`   ra   rc   rd   rg   re   rf   rj   ri   rh   kwargss   `                            rO   rS   zGemma3TextConfig.__init__   s   > 	! 	
%%% 3		
 	

 	
 	
 	
 %'>$&!2!2#6  #6 !2("$,!2!2%:",'>$&<#&+F(& 	A#'#6!#;q"@D$8!(t$$$ (.zz2JA'N'N$#       t566     D 	d.0FGGGGGrQ   )r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   Tr   r;   r   Tr<   Fr=   r6   r>   NNNNr?   F)__name__
__module____qualname____doc__
model_typerS   rR   rQ   rO   r.   r.   :   s        t tl J - ' ! $#%$)9JH JH JH JH JH JHrQ   r.   c                        e Zd ZdZdZddddZeedZ	 	 	 	 	 	 	 dde	e
eeeef         f                  de	e
eeeef         f                  dededededef fdZ xZS )Gemma3Configa  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configNr6         r9   r|   r}   mm_tokens_per_imager\   c                    |)t                      }t                              d           n!t          |t                    rt          di |}t          |t                    rt          di |}n*|(t                      }t                              d           || _        || _        || _        || _	        || _
        || _        || _         t                      j        di | d S )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.rR   )r.   loggerinfo
isinstancedictr,   r|   r}   r   rw   rx   rv   r\   superrS   )
rN   r|   r}   r   rw   rx   rv   r\   rm   	__class__s
            rO   rS   zGemma3Config.__init__<  s     *,,KKKZ[[[[T** 	:*99[99KmT** 	b.????MM".00MKK`aaa&*#6 ..!2!2""6"""""rQ   )NNr6   r~   r   r   r9   )rn   ro   rp   rq   rr   attribute_mapr.   r,   sub_configsr   r   r   strr   intfloatrS   __classcell__r   s   @rO   rt   rt      s
       . .` J-)) M (+ K JNMQ#&&&!(#'# #e$4d38n$DEF#  &8$sCx.&H IJ# !	#
 # # # !# # # # # # # # # #rQ   rt   c                       e Zd ZdS )Gemma3ModelOutputWithPastNrn   ro   rp   rR   rQ   rO   r   r   ^          DrQ   r   c                       e Zd ZdS )Gemma3CausalLMOutputWithPastNr   rR   rQ   rO   r   r   b  r   rQ   r   c            	       P     e Zd ZdZd
dedededef fdZdej        f fd	Z	 xZ
S )Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?num_embeddingsembedding_dimpadding_idxembed_scalec                     t                                          |||           |                     dt          j        |          d           d S )Nr   F)
persistent)r   rS   register_buffertorchtensor)rN   r   r   r   r   r   s        rO   rS   z&Gemma3TextScaledWordEmbedding.__init__k  sK    DDD]EL,E,ERWXXXXXrQ   	input_idsc                     t                                          |          | j                            | j        j                  z  S N)r   forwardr   toweightdtype)rN   r   r   s     rO   r   z%Gemma3TextScaledWordEmbedding.forwardo  s4    wwy))D,<,?,?@Q,R,RRRrQ   )r   )rn   ro   rp   rq   r   r   rS   r   Tensorr   r   r   s   @rO   r   r   f  s         Y Ys Y3 YS Y_d Y Y Y Y Y YS S S S S S S S S S SrQ   r   c                   $     e Zd Zdef fdZ xZS )	Gemma3MLPconfigc                 J    t                                          |           d S r   r   rS   rN   r   r   s     rO   rS   zGemma3MLP.__init__t  !         rQ   rn   ro   rp   r.   rS   r   r   s   @rO   r   r   s  sE        !/ ! ! ! ! ! ! ! ! ! !rQ   r   c                   *     e Zd Zddedef fdZ xZS )Gemma3RMSNormr:   dimepsc                 N    t                                          ||           d S )Nr   r   r   )rN   r   r   r   s      rO   rS   zGemma3RMSNorm.__init__y  s&    Sc*****rQ   )r:   )rn   ro   rp   r   r   rS   r   r   s   @rO   r   r   x  sP        + +C +e + + + + + + + + + +rQ   r   c                   &     e Zd Zddef fdZ xZS )Gemma3RotaryEmbeddingNr   c                 J    t                                          |           d S r   r   )rN   r   devicer   s      rO   rS   zGemma3RotaryEmbedding.__init__~  r   rQ   r   r   r   s   @rO   r   r   }  sJ        ! !/ ! ! ! ! ! ! ! ! ! !rQ   r   c                   &    e Zd Zdedef fdZ eddd          	 	 dd	ej        d
ej        de	ej                 de	e
         de	ej                 dee         deej        e	ej                 e	eej                          f         fd            Z xZS )Gemma3Attentionr   	layer_idxc                 F   |j         |         dk    | _        t                                          ||           | j        r|j        nd | _        | j        j         | _        t          |j	        |j
                  | _        t          |j	        |j
                  | _        d S )NrH   r   )rg   
is_slidingr   rS   rd   r   rh   	is_causalr   rZ   r]   q_normk_normrN   r   r   r   s      rO   rS   zGemma3Attention.__init__  s     ,Y7;NN+++7;Pf33D![DD#V=PQQQ#V=PQQQrQ   past_key_valuepast_key_values4.58new_nameversionNhidden_statesposition_embeddingsattention_maskcache_positionrm   returnc                    |j         d d         }g |d| j        R }|                     |                              |                              dd          }	|                     |                              |                              dd          }
|                     |                              |                              dd          }|                     |	          }	|                     |
          }
|\  }}t          |	|
||          \  }	}
|&|||d}|
                    |
|| j        |          \  }
}t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j        r| j        nd| j        | j        d|\  }} |j        g |dR                                  }|                     |          }||fS )Nr;   r   )sincosr   eagerr=   )dropoutscalingrd   )shaperZ   q_projview	transposek_projv_projr   r   r&   updater   r'   r   _attn_implementationr   trainingra   r   rd   reshape
contiguouso_proj)rN   r   r   r   r   r   rm   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightss                     rO   r   zGemma3Attention.forward  s    $)#2#.88b8$-88{{=1166|DDNNqRSTT[[//44\BBLLQPQRR
{{=1166|DDNNqRSTT{{<00[[,,
&S#7jRUWZ#[#[ j&#&snUUL'6'='=j,X\Xfht'u'u$J(?;+w66"9$+:Z"[$7$7
%
 /3mDD**L.
%
 
%
 
%
 
%
!\ *k);;;;;;FFHHkk+..L((rQ   )NN)rn   ro   rp   r.   r   rS   r   r   r   r   r   
LongTensorr   r   tupler   r   r   s   @rO   r   r     s       R/ RC R R R R R R _%0A6RRR ,059-) -)|-) #\-) !.	-)
 "%-) !!12-) -.-) 
u|Xel3XeEL>Q5RR	S-) -) -) SR-) -) -) -) -)rQ   r   c                   \    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 dd
ej        dej        dej        de	ej                 de	ej
                 de	e         de	e         de	e         de	ej
                 deej        e	eej        ej        f                  f         fd            Z xZS )Gemma3DecoderLayerr   r   c                    t                                                       || _        |j        | _        || _        |j        |         | _        t          ||          | _        t          |          | _
        t          | j        |j                  | _        t          | j        |j                  | _        t          | j        |j                  | _        t          | j        |j                  | _        d S )N)r   r   r   )r   rS   r   rV   r   rg   attention_typer   	self_attnr   mlpr   r]   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   s      rO   rS   zGemma3DecoderLayer.__init__  s    !-"$0;()LLLV$$,T-=6CVWWW(5d6FFL_(`(`(`%)6t7GVM`)a)a)a&*78HfNa*b*b*b'''rQ   r   r   r   r   NFr   position_embeddings_globalposition_embeddings_localr   position_idsoutput_attentionsr^   r   r   c
                 ^   |}|                      |          }| j        j        r|}n|} | j        d||||||||	d|
\  }}|                     |          }||z   }|}|                     |          }|                     |          }|                     |          }||z   }|f}|r||fz  }|S )N)r   r   r   r   r   r   r^   r   rR   )r   r   r   r   r   r   r   )rN   r   r   r   r   r   r   r   r^   r   rm   residualr   self_attn_weightsoutputss                  rO   r   zGemma3DecoderLayer.forward  s    !,,];; >$ 	=";"<+94> 
,
' 3)%+/)
,
 
,
 
,
 
,
(( 55mDD =0 66}EE//77FF =0 " 	,)++GrQ   )NNNFFN)rn   ro   rp   r.   r   rS   r   r   r   r   r   r   rJ   r   FloatTensorr   r   r   s   @rO   r   r     sO       c/ cC c c c c c c _%0A6RRR 2637+/,1$)590 0|0 %*L0 $)<	0
 !.0 u/00 "%0 $D>0 D>0 !!120 
u (51BEDU1U+V"WW	X0 0 0 SR0 0 0 0 0rQ   r   c                        e Zd ZdZg dZd ZdS )Gemma3PreTrainedModel )r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                     t          j        | |           t          |t                    r |j        j                                         d S d|j        j        v r |j	        j                                         d S d S )NRMSNorm)
r   _init_weightsr   Gemma3MultiModalProjectormm_input_projection_weightdatazero_r   rn   r   )rN   modules     rO   r
  z#Gemma3PreTrainedModel._init_weights  s|    %dF333f788 	'-288:::::&*333M$$&&&&& 43rQ   N)rn   ro   rp   base_model_prefix_no_split_modulesr
  rR   rQ   rO   r  r    s;          ' ' ' ' 'rQ   r  rd   r   c           
      Z     dt           dt           dt           dt           dt          f
 fd}|S )zA
    Enables a bidirectional mask within the sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                 0    t          ||z
            k     S )zA token can attend to any other token if their absolute distance is within
        the (exclusive) sliding window size (distance < sliding_window).)abs)r  r  r  r  rd   s       rO   
inner_maskz1_bidirectional_window_overlay.<locals>.inner_mask  s     56>""^33rQ   r   rJ   )rd   r  s   ` rO   _bidirectional_window_overlayr    sL    
4c 4S 4 4c 4d 4 4 4 4 4 4
 rQ   c                   "    e Zd ZU eed<   def fdZ	 	 	 	 	 	 	 	 	 ddeej                 deej	                 deej                 dee
         deej                 d	ee         d
ee         dee         deej                 dee         defdZ xZS )Gemma3TextModelr   c                 .   t                                          |           t          |j        |j        | j        | j        j        dz            | _        t          j	        |          }|j
        |_        ddi|_        t          |          | _        d S )N      ?)r   	rope_typedefaultr   )r   rS   r   rT   rV   r   r   embed_tokenscopydeepcopyri   r_   rj   r   rotary_emb_localr   s     rO   rS   zGemma3TextModel.__init__&  s        :v143CQUQ\QhjmQm
 
 
 v&&"7*I6 5V D D DrQ   Nr   r   r   r   inputs_embedsr^   r   output_hidden_statesr   rm   r   c
                    ||n| j         j        }||n| j         j        }||n| j         j        }|d u |d uz  rt	          d          | j        r%| j        r|rt                              d           d}|| 	                    |          }|r|| j        st          | j                   }|	B||                                nd}t          j        |||j        d         z   |j                  }	||	                    d          }t#          |x}t$                    si| j         |||	||d}|                                }| j         j        r"d	 |d
<   t+          | j         j                  |d
<   t/          di |t1          di |d}|}|                     ||          }|                     ||          }|rdnd }|rdnd }| j        d | j         j                 D ]=}|r||fz  } ||f||||j                 |||||	d|
}|d         }|r||d         fz  }>|                     |          }|r||fz  }t?          ||||          S )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr"  r   r;   r   r   input_embedsr   r   r   r   c                  B    t          j        dt           j                  S )NT)r   )r   r   rJ   )argss    rO   <lambda>z)Gemma3TextModel.forward.<locals>.<lambda>p  s    TY^Yc@d@d@d rQ   or_mask_functionrI   rH   rR   )r   r   r   r   r   r   r^   r   )last_hidden_stater   r   
attentions) r   r   r(  r^   
ValueErrorgradient_checkpointingr   r   warning_oncer#  r	   get_seq_lengthr   aranger   r   	unsqueezer   r   r$  rh   r  rd   r   r   
rotary_embr&  layersrX   r   normr   )rN   r   r   r   r   r'  r^   r   r(  r   rm   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr   r   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                         rO   r   zGemma3TextModel.forward5  sU    2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	-t";< 	[YZZZ& 	4= 	Y 	j   I  --i88M 	?00*$+>>>O!CRC^==???de"\  =#6q#99$+  N )33A66L ?-FF 	 + -"0"0#2 , K #."2"2"4"4{6 t2d2d./:WX\XcXr:s:s#$67 #5"C"C{"C"C%F%]%]I\%]%]# # & &*__]L%Q%Q"$($9$9-$V$V! #7@BBD0:d![)H4;+H)HI 	6 	6M# 6!m%55!)M+E*C2=3OP) /"3#-   M *!,M  6=#3"55		-00 	2-!11&+++%	
 
 
 	
rQ   	NNNNNNNNN)rn   ro   rp   r.   __annotations__rS   r   r   r   r   r   r  rJ   r   r   r   r   r   r   s   @rO   r  r  #  sM        E/ E E E E E E" 151537+/59$(,0/359o
 o
E,-o
 !.o
 u/0	o

 "%o
   12o
 D>o
 $D>o
 'tno
 !!12o
 +,o
 
!o
 o
 o
 o
 o
 o
 o
 o
rQ   r  c                   4     e Zd ZU eed<   dZdef fdZ xZS )Gemma3ForCausalLMr   language_modelc                 r    t                                          |           t          |          | _        d S r   )r   rS   r  modelr   s     rO   rS   zGemma3ForCausalLM.__init__  s.       $V,,


rQ   )rn   ro   rp   r.   rG  r  rS   r   r   s   @rO   rI  rI    sY         (-/ - - - - - - - - - -rQ   rI  c                   :     e Zd Zdef fdZdej        fdZ xZS )r  r   c                    t                                                       t          j        t	          j        |j        j        |j        j                            | _	        t          |j        j        |j        j                  | _        t          |j        j        |j        j        z            | _        t          |j        dz            | _        | j        | j        z  | _        t          j        | j        | j                  | _        d S )Nr   r  )kernel_sizestride)r   rS   nn	Parameterr   zerosr}   rV   r|   r  r   layer_norm_epsmm_soft_emb_normr   
image_size
patch_sizepatches_per_imager   tokens_per_siderO  	AvgPool2davg_poolr   s     rO   rS   z"Gemma3MultiModalProjector.__init__  s    *,,K,8&:L:XYY+
 +
' !. ,&2F2U!
 !
 !
 "%V%9%DH\Hg%g!h!h"6#=s#BCC1T5II1A$JZ[[[rQ   vision_outputsc                    |j         \  }}}|                    dd          }|                    ||| j        | j                  }|                                }|                     |          }|                    d          }|                    dd          }|                     |          }t          j	        || j
                  }|                    |          S )Nr;   r   )r   r   r   rX  r   r[  flattenrU  r   matmulr  type_as)	rN   r\  
batch_size_
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            rO   r   z!Gemma3MultiModalProjector.forward  s    $2$8!
Az"0":":1a"@"@"9"A"A
D$:D<R#
 #
 #:"D"D"F"F $.E F F 5 = =a @ @ 5 ? ?1 E E $ 5 56K L L#(<0EtGf#g#g '//???rQ   )	rn   ro   rp   rt   rS   r   r   r   r   r   s   @rO   r  r    sq        \| \ \ \ \ \ \ @el @ @ @ @ @ @ @ @rQ   r  token_type_idsimage_group_idstokens_per_imagec           
      f      dS dt           dt           dt           dt           dt          f
 fd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    Nr  r  r  r  r   c                 Z   t          j        |
j        d         k     |d          }
| |f         }t          j        |
j        d         k     |d          }	| |f         }t          j        |	j        d         k     |d          }
| |f         dk    |dk    z  }	| |f         |k    }||z  S )Nr;   r   r   )r   wherer   )r  r  r  r  safe_idxtoken_type_ids_at_kv_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockri  rh  s            rO   r  z0token_type_ids_mask_function.<locals>.inner_mask  s     ;v(<Q(??KK#1)X2E#F #(;v8LQ8O/OQikl#m#m $3Ix4G$H!$)K9Nq9Q0QSlnp$q$q!(E)9:a?D\`aDab*9e+;<@YY  000rQ   r  )rh  ri  rj  r  s   ``  rO   token_type_ids_mask_functionrs    s_     t1c 1S 1 1c 1d 1 1 1 1 1 1 1" rQ   c            !           e Zd ZdZdef fdZdej        dej        fdZd Z	e
e	 	 	 	 	 	 	 	 	 	 	 	 	 dd	eej                 deej                 d
eej                 deej                 dee         deej                 deej                 deej                 deej                 dee         dee         dee         dee         deeef         fd                        Z xZS )Gemma3ModelFr   c                 N    t                                          |           | `d S r   )r   rS   text_config_dtyper   s     rO   rS   zGemma3Model.__init__  s'       """rQ   pixel_valuesr   c                 f    |                      |          j        }|                     |          }|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )rx  )vision_towerr3  multi_modal_projector)rN   rx  r\  image_featuress       rO   get_image_featureszGemma3Model.get_image_features  s6     ***EEW33NCCrQ   c                      t          d          NzWe don't want to inherit itAttributeErrorrN   super_kwargss     rO   _update_causal_maskzGemma3Model._update_causal_mask      :;;;rQ   Nr   r   r   r   rh  r   r'  labelsr^   r   r(  return_dictc                    |d u |d uz  rt          d          ||n| j        j        }||n| j        j        }||n| j        j        }|?| j        j        | j        k    r*|| j        j        k    }|                                }d||<   n|}| |                                 |          }|B||	                                nd}t          j        |||j        d         z   |j                  }|c|                     |          }|                    |j        |j                  }|                     |||          }|                    ||          }t'          |x}t(                    s1| j                                        |||||d}|
 p|d u p|j         p|d u}||r|dk                        |j                  }|t.          j                            |dd          d d d d	f          z  }t          j        |                                d
          dz
  }t          j        ||t          j        |d	|j                            }t=          |                    |j                  || j        j                  |d<   tA          di |tC          di |d} | j"        d|||||
||d|d	|}tG          |j$        |
r|j%        nd |j&        |j'        ||nd           S )Nr*  r   r;   r+  )r'  r|  r,  r;   r   valuer   r   r1  r2  T)	r   r   r   r'  r^   r   r(  r  r   )r3  r   r   r4  image_hidden_statesrR   )(r5  r   r   r(  use_return_dictry   rT   cloneget_input_embeddingsr8  r   r9  r   r   r}  r   r   get_placeholder_maskmasked_scatterr   r   get_text_configis_initializedrQ  
functionalpadcumsumr   rm  	full_likers  r   r   r   rJ  r   r3  r   r   r4  )rN   r   rx  r   r   r   rh  r   r'  r  r^   r   r(  r  	lm_kwargsspecial_image_maskllm_input_idsr>  r|  r?  r@  
is_prefillis_imagenew_image_startri  r   s                             rO   r   zGemma3Model.forward  s   & -t";< 	[YZZZ1B1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]  T[%?4?%R%R!*dk.H!H%OO--M01M,--%M 7D5577FFM!CRC^==???de"\ "2]5H5K"KTaTh  N
 #!44\BBN+..}/C]EXYYN!%!:!:~ "; " " *889K^\\M ?-FF '	 +5577 -"0"0#2 , K  ,"d*,&55,  t+	  )j) +a/33N4IJJ"*bm.?.?&XY.?.Z.Z[\[\[\^a_a^a[a.b-b"b"',/B/B/D/D!"L"L"Lq"P"'+ou~rZbZi/j/j/j# # 3O"%%n&;<<ot{On3 3./ #5"C"C{"C"C%F%U%U%U%U# #
 &$% 
.%+'/!5)
 
 
 
 )%77@JG33d!/)2>2JPT
 
 
 	
rQ   )NNNNNNNNNNNNN)rn   ro   rp   accepts_loss_kwargsrt   rS   r   r   r}  r  r   r   r   r   r  r   rJ   r   r   r   r   r   r   s   @rO   ru  ru    s       #| # # # # # #u|     < < <  15481537+/595959-1$(,0/3&*q
 q
E,-q
 u01q
 !.	q

 u/0q
 "%q
 !!12q
 !!12q
   12q
 )*q
 D>q
 $D>q
 'tnq
 d^q
  
u//	0!q
 q
 q
 ^ q
 q
 q
 q
 q
rQ   ru  c            "       d    e Zd ZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej	                 deej                 dee
         d	eej                 d
eej                 deej                 deej                 dee         dee         dee         dee         deeej	        f         deeef         fd            Z	 	 	 	 	 	 	 	 	 	 d fd	Zd Ze	 ddedej	        deej	                 d
ej	        dee
         deej	                 d	eej	                 defd            Z xZS )Gemma3ForConditionalGenerationFNr   r   rx  r   r   r   rh  r   r'  r  r^   r   r(  r  logits_to_keepr   c                    ||n| j         j        }||n| j         j        }||n| j         j        } | j        d||||||||
|	||||d|}|d         }t          |t                    rt          | d          n|}|                     |dd|ddf                   }d}|	i|	                                }|dddddf         }|	dddf         }||dd|j
        d          df                             |j                  }||                    |j                  dk                                             }||                    |j                  dk                                             }n(|                                }|                                }t          j                    }|                    d| j         j        j                  }|                    d                              |j                  } |||          }|s|f|dd         z   }||f|z   n|S t'          |||j        |j        |j        |j                  S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        N)r   rx  rh  r   r   r   r'  r^   r  r   r(  r  r   r   .r   r;   )losslogitsr   r   r4  r  rR   )r   r   r(  r  rL  r   r   slicelm_headr   r   r   r   r   rQ  CrossEntropyLossr   r|   rT   r   r   r   r4  r  )rN   r   rx  r   r   r   rh  r   r'  r  r^   r   r(  r  r  r  r   r   slice_indicesr  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsoutputs                               rO   r   z&Gemma3ForConditionalGeneration.forward  s   @ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]$* 
%))%+'/!5#)
 
 
 
"  
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA\\^^F!#ssAAA+.L!#qrr'?L) (6aaa,:LQ:O9O9Q9Q6Q'R'U'UV\Vc'd'd$+,@,C,CFM,R,RVW,WXccee+,@,C,CLDW,X,X\],]^iikk+6688+6688*,,H&++B0G0RSSK&++B//22<3FGGK8K55D 	DY,F'+'7D7V##VC+#3!/) ' ;
 
 
 	
rQ   Tc                 p     t                      j        |f||||||	|
|d|}|d         dk    r||d<   |S )N)r   r'  r   r   r   r^   r  rh  r   rx  )r   prepare_inputs_for_generation)rN   r   r   r'  r   r   rx  r   rh  r^   r  r  rm   model_inputsr   s                 rO   r  z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation  sn      =uww<
+')%)))
 
 
 
 !!!+7L(rQ   c                      t          d          r  r  r  s     rO   5_prepare_4d_causal_attention_mask_with_cache_positionzTGemma3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position,  r  rQ   r   r-  c                     |                                  |||||d}||j        d         dk    r|dk                        |j                  }	|	t          j                            |	dd          d d d df          z  }
t          j        |
	                                d          dz
  }t          j
        |	|t          j        |d                    }t          |                    |j                  || j                  |d<   t          d	i |S )
Nr,  r;   r  r   r  r   r  r1  rR   )r  r   r   r   rQ  r  r  r   r  r   rm  r  rs  r   r   )r   r-  r   r   r   r   rh  rm   r@  r  r  ri  s               rO   r   z8Gemma3ForConditionalGeneration.create_masks_for_generate/  s.    ,,..(,,.(
 
 %,*<Q*?1*D*D
 '!+//0EFFH&"-*;*;HfTU*;*V*VWXWXWXZ][]Z]W]*^)^^O#l?+>+>+@+@aHHH1LO#k(OU_UcegEhEhiiO.J!!."788/6Ke/ /K*+ )77;777rQ   )NNNNNNNNNNNNNr   )
NNNNNNNTNNr   )rn   ro   rp   r  r   r   r   r   r  r   r   rJ   r   r   r   r   r   r  r  staticmethodr
   r   r   r   r   s   @rO   r  r    sj          15481537+/595959-1$(,0/3&*34|
 |
E,-|
 u01|
 !.	|

 u/0|
 "%|
 !!12|
 !!12|
   12|
 )*|
 D>|
 $D>|
 'tn|
 d^|
 c5</0|
" 
u22	3#|
 |
 |
 ^|
B " " " " " "H< < <  26!8 !8 !8l!8 !.!8 	!8
 "%!8 u|,!8 !.!8 
!8 !8 !8 \!8 !8 !8 !8 !8rQ   r  c                   \    e Zd ZddddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 dd	e	e
j                 d
e	e
j                 de	e
j                 de	e
j                 de	e         de	e
j                 de	e
j                 de	e
j                 de	e         dee         defd                        Z xZS )Gemma3ForSequenceClassificationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projector)z^language_model.modelz^vision_towerz^multi_modal_projectorc                    t                                          |           |j        | _        t          |          | _        t          j        |j        j        | j        d          | _	        | 
                                 d S )NF)bias)r   rS   
num_labelsru  rL  rQ  Linearr|   rV   score	post_initr   s     rO   rS   z(Gemma3ForSequenceClassification.__init__[  sm        + ((
Yv1=tUZ[[[
 	rQ   c                 4    | j                                         S r   )rL  r  )rN   s    rO   r  z4Gemma3ForSequenceClassification.get_input_embeddingsd  s    z..000rQ   c                 :    | j                             |           d S r   )rL  set_input_embeddings)rN   r  s     rO   r  z4Gemma3ForSequenceClassification.set_input_embeddingsg  s    
''.....rQ   Nr   rx  r   r   r   r'  rh  r  r^   rm   r   c
                 >    | j         |f|||||||	d|
}|j        }|                     |          }||j        d         }n|j        d         }| j        j        j        |dk    rt          d          | j        j        j        d}n||| j        j        j        k                        |j	        t          j                  }t          j        |j        d         |j	        t          j                  }||z                      d          }n)d}t                              | j        j         d           |t          j        ||j	        	          |f         }d}||                     |||| j        
          }t'          |||j        |j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   rx  r   r   r'  rh  r^   Nr   r;   z=Cannot handle batch sizes > 1 if no padding token is defined.r   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r+  )r  r  pooled_logitsr   )r  r  r   r   r4  )rL  r3  r  r   r   r|   rA   r5  r   r   r   int32r9  argmaxr   r7  r   rn   loss_functionr   r   r   r4  )rN   r   rx  r   r   r   r'  rh  r  r^   rm   transformer_outputsr   r  ra  last_non_pad_tokennon_pad_masktoken_indicesr  r  s                       rO   r   z'Gemma3ForSequenceClassification.forwardj  s   , )dj

)%%+')

 

 

 

 ,=M** "+JJ&,Q/J;"/7J!OO\]]];"/7!#"%)@)MMQQRXR_afalmmL!L)<V]Z_ZefffM"/,">!F!Fr!J!J!#>* Z Z Z  
 u|Jv}MMMOaab%%VFR_hlhs%ttD/ /?-;*5
 
 
 	
rQ   rF  )rn   ro   rp   _checkpoint_conversion_mappingrS   r  r  r   r   r   r   r   r  r   r   rJ   r   r   r   r   r   r   s   @rO   r  r  T  s~       !7-"?& &"    1 1 1/ / /  15481537+/5959-1$(C
 C
E,-C
 u01C
 !.	C

 u/0C
 "%C
   12C
 !!12C
 )*C
 D>C
 +,C
 
*C
 C
 C
 ^ C
 C
 C
 C
 C
rQ   r  c                       e Zd ZU dZeed<   dS )#Gemma3TextForSequenceClassificationz
    Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
    It uses the generic sequence classification implementation for efficiency and consistency.
    r   N)rn   ro   rp   rq   r.   rG  rR   rQ   rO   r  r    s*          
 rQ   r  )	rt   r.   r  r  rI  r  ru  r  r  )[r$  collections.abcr   typingr   r   r   r   torch.nnrQ  cache_utilsr   r	   configuration_utilsr
   r   masking_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   r   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   gemma2.configuration_gemma2r   gemma2.modeling_gemma2r   r    r!   r"   r#   r$   r%   r&   r'   paligemma.modeling_paligemmar(   r)   r*   r+   siglipr,   
get_loggerrn   r   r.   rt   r   r   	Embeddingr   r   r   r   r   r   GEMMA3_START_DOCSTRINGr  r   rJ   r  r  rI  Moduler  r   rs  ru  r  r  r  __all__rR   rQ   rO   <module>r     s:     $ $ $ $ $ $ ' ' ' ' ' ' ' ' ' '        . . . . . . . . J J J J J J J J m m m m m m m m m m B B B B B B [ [ [ [ [ [ [ [ Y Y Y Y Y Y Y Y 9 9 9 9 9 9 F F F F F F F F & & & & & & R R R R R R R R R R R R 0 0 0 0 0 0 6 6 6 6 6 6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
            ( ' ' ' ' ' 
	H	%	%CH CH CH CH CH|%5 CH CH CHL[# [# [# [# [## [# [# [#|	 	 	 	 	 < 	 	 		 	 	 	 	#B 	 	 	
S 
S 
S 
S 
SBL 
S 
S 
S! ! ! ! !	 ! ! !
+ + + + +M + + +
! ! ! ! !1 ! ! !9) 9) 9) 9) 9)o 9) 9) 9)x? ? ? ? ?3 ? ? ?D  ' ' ' ' '1 ' ' '$
# 
(CcSVCWY]C]:^ 
 
 
 
A
 A
 A
 A
 A
k A
 A
 A
H- - - - -) - - -!@ !@ !@ !@ !@	 !@ !@ !@HU\*el+  h	   BL
 L
 L
 L
 L
. L
 L
 L
^M8 M8 M8 M8 M8%F M8 M8 M8`[
 [
 [
 [
 [
&; [
 [
 [
|    *JLa   
 
 
rQ   