
     `iӷ                    (   d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlmZ d dlmc mZ ddlmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7  e.j8        e9          Z:e e,d           G d de                                  Z;e e,d           G d de*                                  Z< G d dej=                  Z> G d d ej=                  Z? G d! d"ej=                  Z@ G d# d$ej=                  ZA G d% d&ej=                  ZB G d' d(ej=                  ZC G d) d*ej=                  ZD G d+ d,ej=                  ZE G d- d.ej=                  ZF G d/ d0ej=                  ZG G d1 d2e&          ZH G d3 d4ejI                  ZJ G d5 d6ej=                  ZK G d7 d8ej=                  ZL G d9 d:ej=                  ZM G d; d<ej=                  ZNd= ZOd>e
jP        d?eQd@e
jP        fdAZR	 	 	 dgdCej=        dDe
jP        dEe
jP        dFe
jP        dGee
jP                 dHeSdIeeS         dJeeS         d@eTe
jP        e
jP        f         fdKZU	 	 dhdLe
jP        dMe
jP        dNe
jP        dOee
jP                 dPeQf
dQZV G dR dSej=                  ZW G dT dUe          ZXe, G dV dWe&                      ZY e,dX           G dY dZeY                      ZZ e,d[           G d\ d]eYe                      Z[ G d^ d_ej=                  Z\ e,d`           G da dbeY                      Z] e,dc           G dd deeYe                      Z^g dfZ_dS )i    N)CallableSequence)	dataclass)OptionalUnion   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg   )	AutoModel   )Gemma3nAudioConfigGemma3nConfigGemma3nTextConfigGemma3nVisionConfigzL
    Base class for Gemma3n outputs, with hidden states and attentions.
    )custom_introc                   \    e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dS )Gemma3nModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nimage_hidden_statesaudio_hidden_states)
__name__
__module____qualname____doc__r(   r   torchFloatTensor__annotations__r)        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/gemma3n/modeling_gemma3n.pyr'   r'   3   sP           8<%"34;;;7;%"34;;;;;r2   r'   zS
    Base class for Gemma3n causal language model (or autoregressive) outputs.
    c                      e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dZeeej                          ed<   dZeej                 ed<   dZeej                 ed	<   dS )
Gemma3nCausalLMOutputWithPastaF  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr(   r)   )r*   r+   r,   r-   r6   r   r.   r/   r0   r7   r8   r
   r9   tupler:   r(   r)   r1   r2   r3   r5   r5   M   s          $ )-D(5$
%,,,*.FHU&'...'+OXe_+++8<M8E%"345<<<59Ju0129997;%"34;;;7;%"34;;;;;r2   r5   c                   ^     e Zd Zddededef fdZd Zdej	        d	ej	        fd
Z
d Z xZS )Gemma3nRMSNormư>Tdimeps
with_scalec                     t                                                       || _        || _        | j        r-t	          j        t          j        |                    | _        d S | 	                    dt          j
        d          d           d S )Nweight      ?F
persistent)super__init__r@   rA   nn	Parameterr.   onesrC   register_buffertensor)selfr?   r@   rA   	__class__s       r3   rH   zGemma3nRMSNorm.__init__q   sz    $? 	P,uz#77DKKK  5<+<+< OOOOOr2   c                     |t          j        |                    d                              dd          | j        z             z  S )Nr   T)keepdim)r.   sqrtpowmeanr@   )rN   xs     r3   _normzGemma3nRMSNorm._norm{   s8    5:aeeAhhmmBm==HIIIIr2   rV   returnc                     |                      |                                          | j                                        z  }|                    |          S N)rW   floatrC   type_as)rN   rV   outputs      r3   forwardzGemma3nRMSNorm.forward~   sB     AGGII&&):):)<)<<~~a   r2   c                 H    t          | j        j                   d| j         S )Nz, eps=)r;   rC   shaper@   rN   s    r3   
extra_reprzGemma3nRMSNorm.extra_repr   s%    )**<<$(<<<r2   )r>   T)r*   r+   r,   intr[   boolrH   rW   r.   Tensorr^   rb   __classcell__rO   s   @r3   r=   r=   p   s        P PC Pe P P P P P P PJ J J! !%, ! ! ! != = = = = = =r2   r=   c                        e Zd Zdef fdZdej        dej        dej        fdZdej        de	d	e	d
e	de	de	de	dej        fdZ
dej        dej        dej        fdZ xZS )%Gemma3nAudioRelativePositionEmbeddingconfigc                 $   t                                                       || _        | j        j        | _        | j        j        | _        | j        | j        z  | _        t          d| j        j	        dz
            | _
        | j        j        | _        t          j        | j        | j        | j        z  d          | _        d}d}| j        dz  }t!          j        t%          |          t%          |          z            t          |dz
  d          z  }|t'          j        t'          j        |          | z            z  }|                     d|                                                    d                              d          d	           d S )
Nr   r    FbiasrD   g     @r   inv_timescalesrE   )rG   rH   rj   conf_num_attention_heads	num_headshidden_sizechannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardrI   Linearpos_projmathlogr[   r.   exparangerL   	unsqueeze)rN   rj   min_timescalemax_timescalenum_timescaleslog_timescale_incrementrn   rO   s          r3   rH   z.Gemma3nAudioRelativePositionEmbedding.__init__   sg   =/74;#JQ#NOO;C	$-$-1OV[\\\!+"&(5+?+?%BVBV+V"W"WZ]^lop^prsZtZt"t&5<3O3OSjRj3j)k)kk  "",,Q//99!<< 	 	
 	
 	
 	
 	
r2   positiondtyperX   c                 N   |                                                     d          }|| j                            |j        t
          j                  z  }t          j        t          j        |          t          j	        |          gd          }|
                    |          S )NrQ   )devicer   r?   )r[   r   rn   tor   r.   float32catsincostype)rN   r   r   scaled_timetiming_signals        r3   _get_timing_signal_1d_posz?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_pos   s    >>##--b11!4!7!7xV[Vc!7!d!dd	59[#9#959[;Q;Q"RXZ[[[!!%(((r2   term_bd_before_shift
batch_sizerp   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                     |dz   |z
  }d|f}	t           j                            ||	          }
|
                    |||||dz   z  f          }|ddddddd||z  f         }|                    |||||f          }|S )aZ  Performs the relative shift.

        Args:
          term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
            (B), num_heads (N), num_query_blocks (U), query_block_size (W),
            key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

        Returns:
          Tensor of shape [B, N, U, W, C].
        r    r   N)rI   
functionalpadreshape)rN   r   r   rp   r   r   r   r   pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shifteds                 r3   _relative_shiftz5Gemma3nAudioRelativePositionEmbedding._relative_shift   s    4  0!3F /0**+?OO
 *11  $4q$89	
 
 *!!!QQQ3X5EHX5X3X*XY )00   
 
 r2   querieskeysc           	      X   |j         \  }}}}}|j         \  }}}	}}t          j        | j        | j         dz
  d|j                                      d          }
|
j         d         }|                     |
|j                  }| 	                    |          }|
                    d|| j        | j                                      d          }|                    ddddd          }|                    ddddd          }t          j        ||          }|                    ddddd          }|                    ddd          }|
                    ||||z  |          }t          j        ||          }|
                    |||||          }|                     ||||||	|          }||z   S )	Nr    rQ   r   r   r   r   r      )r`   r.   r~   rv   rx   r   r   r   r   rz   r   rp   rs   squeezepermutematmulr   )rN   r   r   r   r   r   rp   rs   _r   pos_indicesr   sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr   s                           r3   r^   z-Gemma3nAudioRelativePositionEmbedding.forward   s    OVmK
$&6	8'+z$11 l4#4t7G6G!6KRX_Xfgggqq
 
 &+A. $ > >w} !? !
 !

 !MM*?@@#++APTP]^^ff
 
 OOAq!Q22	<<1aA..,y(33 __Q1a33
 __Q1--
  ''
I?ORb?bdlmm

 #(,z:"F"F 3::
 
 ..
 
 ((r2   )r*   r+   r,   r!   rH   r.   re   r   r   rc   r   r^   rf   rg   s   @r3   ri   ri      s       
1 
 
 
 
 
 
.)%, )u{ )W\Wc ) ) ) );#l; ; 	;
 ; ; ; ; 
; ; ; ;zL)u| L)5< L)EL L) L) L) L) L) L) L) L)r2   ri   c                        e Zd Zdef fdZdej        dededej        fdZdej        dej        fd	Z	dej        dej        fd
Z
dej        dej        dej        fdZ xZS )Gemma3nAudioAttentionrj   c                    t                                                       || _        | j        j        | _        | j        j        | _        | j        | j        z  | _        | j        j        | _        | j        j	        | _
        t          d| j        j        dz
            | _        | j        j        | _        | j        | j        z   | j
        z   | _        t#          |          | _        t'          j        t+          j        | j        f                    | _        t'          j        | j        | j        | j        z  d          | _        t'          j        | j        | j        | j        z  d          | _        t'          j        | j        | j        | j        z  d          | _        | j        dz  }dt*          j        j                            t+          j        d                    z  }|                     d||z                                   !                                d	           t+          j"        t+          j#        | j        | j        ft*          j$        
          d          j%        }t+          j"        t+          j#        | j        | j        ft*          j$        
          | j        | j
        z             }t+          j#        | j        | j        ft*          j$        
          }||z  |z  }|                     d|d	           |                     dt+          j        | j                  &                                d	           d S )Nr   r    Frl         rD           q_scalerE   r   )diagonallocal_causal_valid_masksoftcap)'rG   rH   rj   ro   rp   rq   rs   conf_attention_chunk_size
chunk_sizerw   max_future_horizonrt   ru   max_past_horizonconf_attention_logit_capattention_logits_soft_capcontext_sizeri   relative_position_embeddingrI   rJ   r.   zerosper_dim_scalery   q_projk_projv_projr   softplusrM   rL   clonedetachtrilrK   rd   Tr[   )rN   rj   r   r_softplus_0lower_causal_maskupper_causal_maskr   rO   s          r3   rH   zGemma3nAudioAttention.__init__6  s   =;2(DN:+?"&+"J #At{'NQR'R S S)-)M& Od.CCdF]]+PQW+X+X(\%+t}6F*G*GHHi 0$.4=2PW\]]]i 0$.4=2PW\]]]i 0$.4=2PW\]]]-%UX099%,s:K:KLLLY<)?(F(F(H(H(O(O(Q(Q^cddd!JJ)4?;5:NNN
 
 
  	 "JJ):;5:NNN*T-DD
 
 
 #(*dot?P-QY^Yc"d"d"d"9<M"MPa"a68O\abbbL788>>@@ 	 	
 	
 	
 	
 	
r2   rV   pad_left	pad_rightrX   c                     |j         ^}}}|                    ||g|R           }|                    ||g|R           }t          j        |||gd          }|S )Nr    r   )r`   	new_zerosr.   r   )	rN   rV   r   r   batchr   
tail_shapeleftrights	            r3   	_pad_dim1zGemma3nAudioAttention._pad_dim1a  sl     !q:{{E89j99::UI;
;;<<ItQ&A...r2   r9   c                 $   |j         }|dd         \  }}|| j        z   dz
  | j        z  }|| j        z  |z
  x}dk    r|                     |d|          }||| j        f|dd         z   }|                    |                                          }|S )aE  Turns a sequence to non overlapping blocks.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, block_size, ...], with necessary
            paddings,
            where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
        Nr   r    r   )r`   r   r   r   
contiguous)rN   r9   r`   bt
num_blockspadding_lenpermute_dimss           r3   _convert_to_blockz'Gemma3nAudioAttention._convert_to_blockh  s     #RaRy1$/)A-$/A
%7!;;Kq@@ NN=![IIM:t7%)C%--l;;FFHHr2   c                 0   | j         }| j        | j        z   dz
  }|                     |||          }| j        }| j        }|                    d||          }|j        dk    r"|j        dk    rt          j        |dd          }|	                                S )a  Extracts temporal context for every block.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, context_size, ...], with necessary
            paddings,
            where context_size = block_size + left_context + right_context,
            and output[:, i, ...] are x[:, start-left_context:end+right_context,
            ...],
            start = i * block_size, end = (i + 1) * block_size.
        r    )	dimensionsizestepr   r   rQ   )sourcedestination)
r   r   r   r   r   unfoldndimr.   movedimr   )rN   r9   r   r   	frame_len
frame_step
x_unfoldeds          r3   _extract_block_contextz,Gemma3nAudioAttention._extract_block_context~  s     ( +do=A	}h	JJ%	_
 #))AIJ)WW
 !!jo&9&9 z"!LLLJ$$&&&r2   maskc                    g |j         d d         | j        | j        R }|                     |                              |                                          }|                     |                              |                                          }|                     |                              |                                          }t          j	        j
                            | j                  }ddd| j        f}|                    |          }	|| j        z  |	z  }|j         d d         \  }
}|                     |          }|                     |          }|                     |          }|j         d         }| }|                     |          }|j        dk    r@|j         d         |j         d         z  | j        k    r|                    |
|| j                  }|j         |
|| j        fk    r&t'          d|j          d|
 d| d| j         d		          |                    d                              d
          }| j                            d                              d                              d          }t          j        ||                    |j                            }|                     ||          }| j                            |j                  }||z  }t          j        |          }||z  }t          j        ||t          j        |j                  j                  }t          j	        j
                             |dt          j!                                      |j                  }|j         \  }}}}}|j         d         }|"                    ddddd                              d||          }|"                    ddddd                              d||          }t          j#        ||          } |                     |||||          "                    ddddd          }!|!                    |
|| j$        z  | j        | j        f          }!|!d d d |f         }!|!S )NrQ   r    r   r   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   r?   r   r   )%r`   rp   rs   r   r   r   r   r   r.   rI   r   r   r   viewr   r   r   r   r   
ValueErrorr   r   logical_andr   r   r   r   tanhwherefinfor   minsoftmaxr   r   bmmr   )"rN   r9   r   	qkv_shapequery_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr   q_timequery_blocks
key_blocksvalue_blocksr   original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_wherer7   softcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorss"                                     r3   r^   zGemma3nAudioAttention.forward  s   Nm)#2#.NNNN	{{=1199)DDOOQQ[[//77	BBMMOO
{{=1199)DDOOQQ 8.778JKKaDM2%5%:%:?%K%K"#dl25OO)/3
F--l;;00<<
22<@@'-a0  $e '+&A&ABU&V&V# (,11+1!47R7XYZ7[[_c_ppp*E*M*M,d.?+ +' ',1
 
 

 V/5V V@JV V$V V(,(9V V V   )D(M(Ma(P(P(Z(Z[](^(^% $(#?#I#I!#L#L#V#VWX#Y#Y#c#cde#f#f 
 %*$5)$''(E(LMM%
 %
! 11,
KK loofm44+%F##+% 6FL@Y@Y@]^^+33F%-3XX[[bnbt[uu -:,?)ueUE"2& ((Aq!Q77??E5QQ$$Q1a33;;BuMMYx//
$,,UE5%OOWWXY[\^_abdeff)11 4?2	
 
 *!!!WfW*5r2   )r*   r+   r,   r!   rH   r.   re   rc   r   r   r   
BoolTensorr^   rf   rg   s   @r3   r   r   5  s       )
1 )
 )
 )
 )
 )
 )
V5< 3 3 5<    u|     ,.'EL .'U\ .' .' .' .'`dU\ d9I del d d d d d d d dr2   r   c                   d     e Zd ZdZ	 d
dedee         def fdZdej	        dej	        fd	Z
 xZS )Gemma3nAudioCumulativeGroupNorma  Applies Group Normalization cumulatively over the time dimension.

    This layer normalizes the input by calculating the mean and variance
    cumulatively over the time dimension (dim 1). The statistics are computed
    over all feature dimensions (specified by `feature_dims` and `num_channels`)
    for elements marked as valid by the optional `mask`.

    If a `mask` is provided (True for valid, False for invalid/padded),
    invalid time steps do not contribute to the statistics calculation, and
    their corresponding output values are zeroed out.

    Scale and bias, if enabled, are applied per-channel (last dimension).
    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
    and `cumulative=True`.
    MbP?num_channelsfeature_dimsr@   c           	      V   t                                                       || _        t          |          | _        || _        t          j        t          j	        |                    | _
        t          t          ddt          | j                  z   dz                       | _        d S )Nr   r    )rG   rH   r"  r;   r#  r@   rI   rJ   r.   rK   rC   rangelenreduction_axes)rN   r"  r#  r@   rO   s       r3   rH   z(Gemma3nAudioCumulativeGroupNorm.__init__&  s     	(!,// l5:l#;#;<< $E!QT5F1G1G-G!-K$L$LMMr2   r9   rX   c                    | j         | j        fz   }|j        dd         |k    r"t          d|j        dd          d|           |j        }t
          j        }|                    |          }t          j        ||          }t          j	        || j
        d          }t          j        |d	          }t          j	        || j
        d          }	t          j        |	d	          }
t          j        |
d
          }||z  }||z
                      d          }t          j	        || j
        d          }t          j        |d	          }||z  }||z
  t          j        || j        z             z  }| j                            |          }dg|                                dz
  z  | j        gz   }||                    |          z  }||z  }|                    |          S )zApplies cumulative group norm, optionally using a mask.

        Args:
          hidden_states: Input tensor, shape [B, T, *feature_dims, C].

        Returns:
          Normalized tensor with the same shape as x.
        r   NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r   Tr?   rR   r    r   rD   )r   )r#  r"  r`   r   r   r.   r   r   	ones_likesumr'  cumsumclamprT   rsqrtr@   rC   r?   r   )rN   r9   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputs                        r3   r^   z'Gemma3nAudioCumulativeGroupNorm.forward8  s     !% 1T5F4H Hqrr"&;;;Q]-@-D Q Q9NQ Q  
 $)]
!!*-- OF*===	  )F0CTRRRo1=== "'9$:MW[!\!\!\"\*@aHHH"'+.@c"J"J"J "$;;
 #)8"3!8!8!;!; 9%;AT^bccc  ,'7Q??? ')@@ )U[9P-Q-QQ z**3-"3"3"5"5"9:d>O=PP#ejj1A&B&BB $i/{+++r2   )r!  )r*   r+   r,   r-   rc   r   r[   rH   r.   re   r^   rf   rg   s   @r3   r   r     s         ( 	N NN smN 	N N N N N N$G,U\ G,el G, G, G, G, G, G, G, G,r2   r   c                   p     e Zd ZdZ	 ddedededeeeeef         f fdZdej	        d	ej	        fd
Z
 xZS )Gemma3nAudioSSCPConvBlockzA single convolution block for the SubSampleConvProjection.

    This block consists of a 2D convolution, followed by CumulativeGroupNorm,
    and a ReLU activation. It handles manual padding for the convolution.
    r   r   r   r   rj   idxinput_freq_dimmanual_paddingc                 "   t                                                       || _        || _        |dk    rdn| j        j        |dz
           }| j        j        |         }| j        j        |         \  }}| j        j        |         \  }	}
t          j        ||||f|	|
fdd          | _	        || j        d         z   | j        d         z   }||z
  |
z  dz   }t          ||f| j        j                  | _        t          j                    | _        d S )Nr   r    )r   r   F)in_channelsout_channelskernel_sizestridepaddingrm   )r"  r#  r@   )rG   rH   rj   rG  sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizerI   Conv2dconvr   sscp_conv_group_norm_epsnormReLU
activation)rN   rj   rE  rF  rG  rI  rJ  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convrO   s                r3   rH   z"Gemma3nAudioSSCPConvBlock.__init__  s+    	, !88aa)KCRSG)T{9#>![>sC(![>sC(I#% h'

 

 

	 %t':1'==@STU@VV!H,9A=
3%$4
 
 
	 '))r2   audio_encodingsrX   c                    t          j        || j        dd                              | j        j        j                  }|                     |          }|                    dddd                                          }| 	                    |          }|                    dddd                                          }| 
                    |          S )Nconstantr   )modevaluer   r   r   r    )Fr   rG  r   rR  rC   r   r   r   rT  rV  )rN   r]  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normeds          r3   r^   z!Gemma3nAudioSSCPConvBlock.forward  s     "#8KR\dg!h!h!h!k!kI""
 "

  $yy)?@@ *11!Q1==HHJJ
99Z((!)!1!1!Q1!=!=!H!H!J!J5666r2   )rD  )r*   r+   r,   r-   r!   rc   r;   rH   r.   re   r^   rf   rg   s   @r3   rC  rC    s          5A)$ )$")$ )$ 	)$
 c3S01)$ )$ )$ )$ )$ )$V7u| 7 7 7 7 7 7 7 7 7r2   rC  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )#Gemma3nAudioSubSampleConvProjectionrj   c                    t                                                       || _        |j        }g }g }t	          d          D ]r}|j        |         \  }}|j        |         \  }}	d}
|dz
  }d}d}|||
|f}|                    |           ||z   |z   }||z
  |	z  dz   }|                    |           |}st          d|j        ||d                   | _	        t          d|d         ||d                   | _
        |j        d         }|d         }||z  | _        t          j        | j        | j        j        d          | _        d S )Nr   r   r    )rE  rF  rj   rG  rQ   Frl   )rG   rH   rj   input_feat_sizer%  rO  rP  appendrC  conv_0conv_1rN  input_proj_in_featuresrI   ry   rq   input_proj_linear)rN   rj   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsirW  rX  rY  rZ  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tupler[  f_out_after_convfinal_c_outfinal_f_outrO   s                      r3   rH   z,Gemma3nAudioSubSampleConvProjection.__init__  s   $*$:!#%  "q 	9 	9A!'!=a!@Hh!'!=a!@Hh I#a<L JK 	$  %++,@AAA 4j@;NK +h 68CaG!(()9:::(8%%/!13A6	
 
 
 0033A6	
 
 
 3B7+B/&1K&?#!#4+FH_fk!l!l!lr2   r]  rX   c                 N   |                     d          }|                     |          }|                     |          }|j        \  }}}}|                    dddd                                          }|                    ||||z            }	|                     |	          }
|
S )Nr    r   r   r   )r   rm  rn  r`   r   r   r   rp  )rN   r]  audio_encodings_reshapedrV   r   c_outt_outf_out
x_permutedoutput_flattenedr]   s              r3   r^   z+Gemma3nAudioSubSampleConvProjection.forward  s     $3#<#<Q#?#? KK011KKNN!"5%YYq!Q**5577
%??1eUU]CC''(899r2   	r*   r+   r,   r!   rH   r.   re   r^   rf   rg   s   @r3   ri  ri    ss        7m1 7m 7m 7m 7m 7m 7mru|         r2   ri  c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZ	S )Gemma3nAudioConformerAttentionrj   c                    t                                                       || _        | j        j        | _        |                     dt          j        | j        j                  d           t          | j        j                  | _
        t          |          | _        t          j        | j        | j        j        d          | _        t          | j        j                  | _        d S )Ngradient_clippingFrE   rl   )rG   rH   rj   rq   post_in_featuresrL   r.   rM   r  r=   pre_attn_normr   attnrI   ry   post	post_normrN   rj   rO   s     r3   rH   z'Gemma3nAudioConformerAttention.__init__  s     $ 70%,t{?\2]2]joppp+DK,CDD)&11	Id3T[5LSXYYY	'(?@@r2   r]  audio_mel_maskrX   c                    |}t          j        || j         | j                  }|                     |          }|                     ||          }|j        \  }}}}	|                    ||||	z            }
|                     |
          }t          j        || j         | j                  }||                     |          z   S rZ   )	r.   r-  r  r  r  r`   r   r  r  )rN   r]  r  audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr   r   rp   rs   r~  s              r3   r^   z&Gemma3nAudioConformerAttention.forward  s    (7%+o8N7NPTPfgg#11/BB#'99-A>#R#R  %=$B!1i#;#C#CAq)V^J^#_#_ ))$<==+o8N7NPTPfgg,t~~o/N/NNNr2   
r*   r+   r,   r!   rH   r.   re   r  r^   rf   rg   s   @r3   r  r    s        A1 A A A A A AOu| OUEU OZ_Zf O O O O O O O Or2   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS ) Gemma3nAudioConformerFeedForwardrj   c                 H   t                                                       || _        |                     dt	          j        | j        j                  d           t          | j        j                  | _	        t          j        | j        j        | j        j        dz  d          | _        t          j        | j        j        dz  | j        j        d          | _        t          | j        j                  | _        t	          j        | j        j                  | _        d S )Nr  FrE   r   rl   )rG   rH   rj   rL   r.   rM   r  r=   rq   pre_layer_normrI   ry   ffw_layer_1ffw_layer_2post_layer_normconf_residual_weightpost_layer_scaler  s     r3   rH   z)Gemma3nAudioConformerFeedForward.__init__-  s    0%,t{?\2]2]joppp,T[-DEE9T[%<dk>UXY>Y`efff9T[%<q%@$+BY`efff-dk.EFF %T[-M N Nr2   r]  rX   c                    |}t          j        || j         | j                  }|                     |          }|                     |          }t
          j                            |          }|                     |          }t          j        || j         | j                  }| 	                    |          }||| j
        z  z   S rZ   )r.   r-  r  r  r  rI   r   silur  r  r  )rN   r]  residuals      r3   r^   z(Gemma3nAudioConformerFeedForward.forward9  s    "+o8N7NPTPfgg--o>>(,(8(8(I(I-,,_==(,(8(8(I(I+o8N7NPTPfgg..???T-BBCCr2   r  rg   s   @r3   r  r  ,  s|        
O1 
O 
O 
O 
O 
O 
O	Du| 	D 	D 	D 	D 	D 	D 	D 	D 	Dr2   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS ) Gemma3nAudioConformerLightConv1drj   c           	         t                                                       || _        t          | j        j        | j        j                  | _        t          j        | j        j        | j        j        dz  d          | _	        t          j
        | j        j        | j        j        | j        j        dd| j        j        d          | _        |                     dt          j        | j        j                  d	           t          | j        j        | j        j                  | _        t          j        | j        j        | j        j        d          | _        | j        j        dz
  | _        d S )
Nr@   r   Frl   r    r   )rI  rJ  rK  rL  rM  groupsrm   r  rE   )rG   rH   rj   r=   rq   rms_norm_epsr  rI   ry   linear_startConv1dconf_conv_kernel_sizedepthwise_conv1drL   r.   rM   r  	conv_norm
linear_endcausal_paddingr  s     r3   rH   z)Gemma3nAudioConformerLightConv1d.__init__F  s2   ,T[-D$+JbcccIdk&=t{?VYZ?Zafggg "	/09;*!
 !
 !
 	0%,t{?\2]2]joppp'(?T[E]^^^)DK$;T[=T[`aaa"k?!Cr2   r]  rX   c                 F   |}|                      |          }|                     |          }t          j        j                            |d          }|                    ddd          }t          j        || j	        df          }| 
                    |          }|                    ddd          }t          j        || j         | j                  }|                     |          }t          j                            |          }|                     |          }||z   }|S )NrQ   r   r   r   r    )r  r  r.   rI   r   glur   rb  r   r  r  r-  r  r  r  r  )rN   r]  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedr]   s         r3   r^   z(Gemma3nAudioConformerLightConv1d.forward[  s   #2 --o>>++O<<(-11/r1JJ#2#:#:1a#C#C *+%0H4K^`aJb*c*c'//0OPP)11!Q::+o8N7NPTPfgg..99-,,_==///:: #;;r2   r  rg   s   @r3   r  r  E  sr        D1 D D D D D D*u|         r2   r  c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZ	S )Gemma3nAudioConformerBlockrj   c                    t                                                       || _        t          | j                  | _        t          | j                  | _        t          | j                  | _        t          | j                  | _	        | 
                    dt          j        | j        j                  d           t          | j        j                  | _        d S )Nr  FrE   )rG   rH   rj   r  ffw_layer_startr  	attentionr  lconv1dffw_layer_endrL   r.   rM   r  r=   rq   rT  r  s     r3   rH   z#Gemma3nAudioConformerBlock.__init__q  s    ?LL7DD7DD=dkJJ0%,t{?\2]2]joppp"4;#:;;			r2   r]  r  rX   c                    |                      |          }|                     ||          }| }||                    d                              |j                  z  }|                     |          }|                     |          }t          j        || j	         | j	                  }| 
                    |          }|S )NrQ   )r  r  r   r   r   r  r  r.   r-  r  rT  )rN   r]  r  validity_mask_for_lconvaudio_encodings_for_lconv_inputr]   s         r3   r^   z"Gemma3nAudioConformerBlock.forward|  s    ..??...II#1/*9<S<]<]^`<a<a<d<d!=
 =
 +
' ,,'FGG,,_==+o8N7NPTPfgg?++r2   r  rg   s   @r3   r  r  p  sw        	<1 	< 	< 	< 	< 	< 	<u| UEU Z_Zf        r2   r  c                        e Zd ZU dZeed<   dZdef fdZdej	        dej
        deej	        ej
        f         fdZ xZS )Gemma3nAudioEncoderzx
    An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
    rj   	audio_melc                     t                                                     | _        t                    | _        t          j        fdt          j                  D                       | _	        d S )Nc                 .    g | ]}t                    S r1   )r  ).0r   rj   s     r3   
<listcomp>z0Gemma3nAudioEncoder.__init__.<locals>.<listcomp>  s"    ^^^A'//^^^r2   )
rG   rH   rj   ri  subsample_conv_projectionrI   
ModuleListr%  conf_num_hidden_layers	conformerr  s    `r3   rH   zGemma3nAudioEncoder.__init__  sl       )LV)T)T&^^^^v?\9]9]^^^
 
r2   r  rX   c                    |                      |          }|j        d         }d}t          t          | j        j                            D ]}|| j        j        |         d         z  }t          j        ||j                  |z  }t          j	        ||j        d         dz
            }|j
        dk    r@|j
        dk    r5|                    d                              |j        d         d          }nX|j
        |j
        k    rH|j        d         dk    r7|j        d         dk    r&||j        d         k    r|                    d          }t          j        |d|          }| j        D ]}	 |	||          }| j        j        dk    r2|dddd| j        j        f         }|dddd| j        j        f         }|                    |                    d          d          }||fS )a  Encodes a batch of MELs.

        Args:
            audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
              mel_bins].

        Returns:
            audio_encodings: a torch.Tensor of shape
                `[batch_size, self.config.audio_soft_tokens_per_image,
                self.config.audio_config.hidden_size]`
            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
        r    r   r   )rt   rQ   Nr   )r  r`   r%  r&  rj   rP  r.   r~   r   r-  r   r   expandgatherr  conf_reduction_factormasked_fill)
rN   r  r  r]  t_subtime_stride_productstride_pair_idxindicescurrent_maskblocks
             r3   r^   zGemma3nAudioEncoder.forward  s    88CC  %a($S)J%K%KLL 	Y 	YO4;#D_#UVW#XX
 ,u^-BCCCFYY+g>+?+BQ+FGGG ""w|q'8'8''**11.2Fq2I2NNGG7<//$Q'1,,a A%%q))) ''**G|NAw??^ 	C 	CE#eO\BBOO;,q00-aaa1U1UDK4U1U.UVO'+O+Odk.O+O(OPL)55l6L6LR6P6PRUVV,,r2   )r*   r+   r,   r-   r!   r0   main_input_namerH   r.   re   r  r;   r^   rf   rg   s   @r3   r  r    s           !O
1 
 
 
 
 
 
5-5-7<7G5-	u|U--	.5- 5- 5- 5- 5- 5- 5- 5-r2   r  c            	       P     e Zd ZdZd
dedededef fdZdej        f fd	Z	 xZ
S )Gemma3nTextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    rD   num_embeddingsembedding_dimpadding_idxembed_scalec                     t                                          |||           |                     dt          j        |          d           d S )Nr  FrE   )rG   rH   rL   r.   rM   )rN   r  r  r  r  rO   s        r3   rH   z'Gemma3nTextScaledWordEmbedding.__init__  sK    DDD]EL,E,ERWXXXXXr2   	input_idsc                     t                                          |          | j                            | j        j                  z  S rZ   )rG   r^   r  r   rC   r   )rN   r  rO   s     r3   r^   z&Gemma3nTextScaledWordEmbedding.forward  s4    wwy))D,<,?,?@Q,R,RRRr2   )rD   )r*   r+   r,   r-   rc   r[   rH   r.   re   r^   rf   rg   s   @r3   r  r    s         Y Ys Y3 YS Y_d Y Y Y Y Y YS S S S S S S S S S Sr2   r  c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )Gemma3nTextLaurelBlockz Learned Augmented Residual Layerrj   c                 j   t                                                       || _        t          j        | j        j        | j        j        d          | _        t          j        | j        j        | j        j        d          | _        t          | j        j        | j        j
                  | _        d S )NFrl   r  )rG   rH   rj   rI   ry   rq   laurel_ranklinear_leftlinear_rightr=   r  post_laurel_normr  s     r3   rH   zGemma3nTextLaurelBlock.__init__  s    9T[%<dk>U\abbbIdk&=t{?V]bccc .t{/FDKLd e e er2   r9   rX   c                     |                      |          }|                     |          }|                     |          }||z   S rZ   )r  r  r  )rN   r9   laurel_hidden_statesnormed_laurel_hidden_statess       r3   r^   zGemma3nTextLaurelBlock.forward  sL    -1-=-=m-L-L-1->->?S-T-T&*&;&;<P&Q&Q#:::r2   )
r*   r+   r,   r-   r#   rH   r.   re   r^   rf   rg   s   @r3   r  r    sx        **f0 f f f f f f;U\ ;el ; ; ; ; ; ; ; ;r2   r  c                   r     e Zd Zd
dedef fdZdej        dej        fdZdej        dej        fd	Z	 xZ
S )Gemma3nTextMLPr   rj   	layer_idxc                    t                                                       || _        |j        | _        |j        |         | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _	        t          |j                 | _        |j        |         | _        d S NFrl   )rG   rH   rj   rq   intermediate_sizerI   ry   	gate_projup_proj	down_projr	   hidden_activationact_fnactivation_sparsity_patternactivation_sparsityrN   rj   r  rO   s      r3   rH   zGemma3nTextMLP.__init__  s    !-!'!9)!D4#3T5KRWXXXy!143IPUVVV4#94;KRWXXXV56#)#Ei#P   r2   r9   rX   c                     |                      |          }| j        dk    r|                     |          }|                     |          }|                     |          }|                     ||z            }|S )Nr   )r  r  _gaussian_topkr  r  r  )rN   r9   r  activationsr  r  s         r3   r^   zGemma3nTextMLP.forward   sr    NN=11	#c))++I66Ikk),,,,}--NN;#899	r2   inputsc                    t          j        | j        t           j        |j                  }t           j        j                            dd          }|                    |          }|	                    |j
                  }t          j        |dd          }t          j        |ddd          }|||z  z   }t          j                            ||z
            S )	Nr   r   r   r    rQ   Tr)  F)r?   rR   unbiased)r.   rM   r  r   r   distributionsnormalNormalicdfr   r   rU   stdrI   r   relu)rN   r  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xs           r3   r  zGemma3nTextMLP._gaussian_topk	  s    !&d.Femdjdq!r!r!r )0771=='2'7'78N'O'O',,V\::jR>>>Yv2teLLL
n!<<}!!&8"3444r2   )r   )r*   r+   r,   r#   rc   rH   r.   re   r^   r  rf   rg   s   @r3   r  r    s        	Q 	Q0 	QS 	Q 	Q 	Q 	Q 	Q 	QU\ el    5U\ 5el 5 5 5 5 5 5 5 5r2   r  c                        e Zd ZdZdef fdZdej        dej        fdZdej        dej        fdZ	d	ej        d
ej        dej        fdZ
dej        dej        fdZdej        dej        fdZ xZS )Gemma3nTextAltUpa  Alternating Updates (AltUp)

    The AltUp module wraps transformer layers. The `predict` step modifies the
    input to the transformer layer, and the `correct` step propagates the output
    of the transformer layer to the sparsely updated dimensions.

    See more in the research paper:

    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
    rj   c                    t                                                       || _        t          j        t          j        | j        j                            | _        t          j	        | j        j
        | j        j
        d          | _        t          j	        | j        j
        | j        j
        dz  d          | _        t          j	        | j        j        | j        j
        d          | _        t          | j        j        | j        j                  | _        |                     dt          j        | j        j        dz            d           d S )NFrl   r   r  router_input_scaleg      rE   )rG   rH   rj   rI   rJ   r.   r   rq   correct_output_scalery   altup_num_inputscorrection_coefsprediction_coefsmodality_routerr=   r  router_normrL   rM   r  s     r3   rH   zGemma3nTextAltUp.__init__&  s
   $&LT[=T1U1U$V$V! "	$+*FHdkp q q q "	$+*FHdfgHgns t t t!y)@$+B^ejkkk)$+*At{G_```15<@WY]@]3^3^kpqqqqqr2   rV   rX   c                     |                      |          | j        z  }|                     |          }t          j        |                                                              |          S rZ   )r  r  r  r.   r   r[   r\   )rN   rV   router_inputsrouteds       r3   compute_router_modalitiesz*Gemma3nTextAltUp.compute_router_modalities0  sV    ((++d.EE%%m44z&,,..))11!444r2   r9   c                 z   |                      || j        j                           }| j        rF| j        j        :| j        j        j                            | j        j         | j        j                    |                     |          j	        g |j
        dd         | j        j        | j        j        R                      dddd          }t          j        |                    dddd          |          }|                    dddd          }||z  }|                                                    |          S )a  Predicts the output of a layer using a trainable map.

        Args:
            hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
        NrQ   r   r    r   r   )r  rj   altup_active_idxtrainingaltup_coef_clipr  rC   dataclamp_r   r`   r  r   r.   r   r   r\   )rN   r9   
modalities	all_coefspredictionss        r3   predictzGemma3nTextAltUp.predict5  sA    33M$+B^4_``
= 	pT[8D!(-44dk6Q5QSWS^SnoooD!!*--i &ss+i-1[-IiKO;Kgi i iWQ1a   	 l=#8#8Aq!#D#DiPP!))!Q155}$%%''//>>>r2   r  	activatedc                 ,   |                      |          }||| j        j                 z
  }|                    | j        j        ddd          }| j        j        :| j        j        j        	                    | j        j         | j        j                   |                     |          dz   }|
                    ddd                              d          }t          j        ||          }||z  }|                                                    |          S )a_  Corrects the predictions relative to the

        Args:
            predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
            activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
                predictions relative to the activated input embeddings.
        r    NrD   r   r   rQ   )r  rj   r  repeatr  r  r  rC   r  r  r   r   r.   mulr   r\   )rN   r  r   r  
innovationr  	correcteds          r3   correctzGemma3nTextAltUp.correctQ  s     33I>>
T[-I!JJ
&&t{'CQ1MM
;&2!(-44dk6Q5QSWS^Snooo
 #'"7"7
"C"Cc"I	%%aA..88<<	Ij)44	[ 	##%%--i888r2   r%  c                 l    |                     | j                  | j        z                       |          S )a	  
        This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
        (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
        `scale_corrected_output`
        )r\   r  rN   r%  s     r3   r^   zGemma3nTextAltUp.forwardn  s2     !!$";<<t?XXaabklllr2   c                 ,    |                      |          S )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)r^   r(  s     r3   scale_corrected_outputz'Gemma3nTextAltUp.scale_corrected_outputv  s    ||I&&&r2   )r*   r+   r,   r-   r#   rH   r.   re   r  r  r&  r^   r*  rf   rg   s   @r3   r	  r	    s#       	 	r0 r r r r r r55< 5EL 5 5 5 5
?U\ ?el ? ? ? ?895< 9EL 9U\ 9 9 9 9:m m%, m m m m' ' ' ' ' ' ' ' ' 'r2   r	  c                   |     e Zd ZU ej        ed<   ddef fdZ ej                    e	d                         Z
 xZS )Gemma3nTextRotaryEmbeddinginv_freqNrj   c                    t                                                       t          |d          rSt          |j        t
                    r9|j                            d|j                            d                    | _        nd| _        |j        | _	        |j        | _
        || _        t          | j                 | _        |                     | j        |          \  }| _        |                     d|d           | j        | _        d S )Nrope_scaling	rope_typer   defaultr-  FrE   )rG   rH   hasattr
isinstancer/  dictgetr0  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrj   r   rope_init_fnattention_scalingrL   r-  original_inv_freq)rN   rj   r   r-  rO   s       r3   rH   z#Gemma3nTextRotaryEmbedding.__init__~  s    6>** 	'z&:Mt/T/T 	'#044[&BUBYBYZ`BaBabbDNN&DN"("@$*$B!/?+/+<+<T[&+Q+Q($(ZeDDD!%r2   c                 X   | j         d d d d f                                                             |j        d         dd                              |j                  }|d d d d d f                                         }t          |j        j        t                    r|j        j        dk    r|j        j        nd}t          j
        |d          5  |                                |                                z                      dd          }t          j        ||fd	          }|                                | j        z  }|                                | j        z  }	d d d            n# 1 swxY w Y   |                    |j        
          |	                    |j        
          fS )Nr   rQ   r    mpscpuF)device_typeenabledr   r   r   )r-  r[   r  r`   r   r   r3  r   strr.   autocast	transposer   r   r:  r   r   )
rN   rV   position_idsinv_freq_expandedposition_ids_expandedr?  freqsembr   r   s
             r3   r^   z"Gemma3nTextRotaryEmbedding.forward  s    !M$4-8>>@@GGHZ[\H]_acdeehhijiqrr ,QQQaaaZ 8 > > @ @'1!(-'E'Ek!(-[`J`J`ahmmfk^UCCC 	5 	5&,,..1F1L1L1N1NNYYZ[]^__E)UEN333C''))d44C''))d44C		5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 vvAGv$$cff17f&;&;;;s   BE++E/2E/rZ   )r*   r+   r,   r.   re   r0   r#   rH   no_gradr   r^   rf   rg   s   @r3   r,  r,  {  s         l/ /0 / / / / / /" U]__< <  _< < < < <r2   r,  c                     | dd| j         d         dz  f         }| d| j         d         dz  df         }t          j        | |fd          S )z*Rotates half the hidden dims of the input..NrQ   r   r   )r`   r.   r   )rV   x1x2s      r3   rotate_halfrM    s]    	
3"!'"+"""	#B	
3q """	#B9rc2YB''''r2   r9   n_reprX   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)r`   r  r   )r9   rN  r   num_key_value_headsslenrs   s         r3   	repeat_kvrR    s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr2   r   modulequerykeyra  attention_maskdropoutscalingr   c                    |
| j         dz  }t          || j                  }	t          || j                  }
t          j        ||	                    dd                    |z  }|||z  }t          j        |          }||z  }|$|d d d d d d d |	j        d         f         }||z   }t          j	        
                    |dt          j                                      |j                  }t          j	                            ||| j                  }t          j        ||
          }|                    dd                                          }||fS )	Nr   r   r   r   rQ   r   )pr  r    )rs   rR  num_key_value_groupsr.   r   rC  r   r`   rI   r   r   r   r   r   rW  r  r   )rS  rT  rU  ra  rV  rW  rX  r   kwargsr  r  attn_weightscausal_maskattn_outputs                 r3   eager_attention_forwardr`    sR    /4'3 ;<<JUF$?@@L<z';';Aq'A'ABBWLL#g-z,//#g-!$QQQ111.D
0@0D.D%DE#k1 =((2U](SSVVW\WbccL=((6?([[L,|\::K''1--88::K$$r2   rV   r   r   rD  unsqueeze_dimc                     |                     |          }|                     |          }| |z  t          |           |z  z   S )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r   rM  )rV   r   r   rD  ra  s        r3   apply_rotary_pos_embrc    s@    2 --
&
&C
--
&
&CGA,--r2   c                   *    e Zd ZdZdedef fdZ eddd          	 	 dd
ej	        dej	        de
ej	                 de
e         de
ej                 dee         deej	        e
ej	                 e
eej	                          f         fd            Z xZS )Gemma3nTextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrj   r  c                 f   t                                                       |j        |         dk    | _        || _        || _        t          |d|j        |j        z            | _	        |j        |j
        z  | _        | j        j        | _        d| _        t          j        |j        |j        | j	        z  |j                  | _        t          j        |j        |j
        | j	        z  |j                  | _        t          j        |j        |j
        | j	        z  |j                  | _        t          j        |j        | j	        z  |j        |j                  | _        | j        r|j        nd | _        t-          |j	        |j                  | _        t-          |j	        |j                  | _        t-          |j	        |j        d          | _        | j        j        | j        j        z
  }||cxk    odk    nc | _        |j        d |         }| j        rJt=          |          d	z
  |d d d
                             |j        |                   z
  | _         d| _!        d S d | _         |t=          |          d	z
  |d d d
                             |j        |                   z
  k    | _!        d S )Nsliding_attentionrs   Trl   )r?   r@   F)r?   r@   rA   r   r    rQ   )"rG   rH   layer_types
is_slidingrj   r  getattrrq   num_attention_headsrs   rP  r[  attention_dropout	is_causalrI   ry   attention_biasr   r   r   o_projsliding_windowr=   r  q_normk_normv_normnum_hidden_layersnum_kv_shared_layersis_kv_shared_layerr&  indexkv_shared_layer_indexstore_full_length_kv)rN   rj   r  first_kv_shared_layer_idxprev_layersrO   s        r3   rH   zGemma3nTextAttention.__init__  s    ,Y7;NN"
F4F&Jd4dee$*$>&B\$\!!%!>i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i :T] JQWQf
 
 
 i&68JQWQf
 
 
 8<Pf33D$f>QRRR$f>QRRR$f>Q^cddd$(K$ADKDd$d!"+/H"L"L"L"L1"L"L"L"L()C*C)CD" 		),[)9)9A)=DDbD@Q@W@WX^XjktXu@v@v)vD&(-D%%%)-D&(1S5E5E5IKX\X\Z\X\L]LcLc"9-M M 6 )D%%%r2   past_key_valuer8   4.58new_nameversionNr9   position_embeddingsrV  cache_positionr\  rX   c                    |j         d d         }g |d| j        j        R }|\  }	}
|                     |                              |          }|                     |          }t          ||	|
d          }|                    dd          }| j        rL|J|j	        | j
                 \  }}|                    |j                  }|                    |j                  }n|                     |                              |          }|                     |          }t          ||	|
d          }|                    dd          }|                     |                              |          }|                     |          }|                    dd          }|b|
|	|| j        d}| j        s |                    ||| j        |          \  }}| j        r(t+          |d          si |_	        ||f|j	        | j        <   t,          }| j        j        dk    rt0          | j        j                 } || ||||f| j        r| j        ndd	| j        d
|\  }} |j        g |dR                                  }|                     |          }||fS )NrQ   r   )ra  r    )r   r   r  rp  shared_layerseagerr   rD   )rW  rX  rp  )r`   rj   rs   r   r   rq  rc  rC  rv  r  rx  r   r   r   rr  r   rs  rp  updater  ry  r2  r`  _attn_implementationr   r  rl  r   r   ro  )rN   r9   r  rV  r8   r  r\  input_shapehidden_shaper   r   r  r  r  cache_kwargsattention_interfacer_  r]  s                     r3   r^   zGemma3nTextAttention.forward   s    $)#2#.??b?$+*>??&S{{=1166|DD{{<00+L#sRSTTT#--a33 " 	8'B'6'DTE_'`$J#|':;;J'??<+>??LL]3388FFJZ00J-j#sRSTTTJ#--a33J;;}55::<HHL;;|44L'11!Q77L& "0"&"5	 L * +:+A+Adnl, ,(
L ( Y@@ 746O1@JL@X-dn=(?;+w66"9$+:Z"[$7$7
%
 /3mDD**.
%
 
%
 
%
 
%
!\ *k);;;;;;FFHHkk+..L((r2   NN)r*   r+   r,   r-   r#   rc   rH   r   r.   re   r   r
   
LongTensorr   r   r;   r^   rf   rg   s   @r3   re  re    s'       GG(0 (S ( ( ( ( ( (T _%0A6RRR ,059F) F)|F) #\F) !.	F)
 "%F) !!12F) -.F) 
u|Xel3XeEL>Q5RR	SF) F) F) SRF) F) F) F) F)r2   re  c                   j    e Zd Zdedef fdZ eddd          	 	 	 	 	 	 dd
ej        dej        dej        dej        de	ej                 de	ej
                 de	e         de	e         de	e         de	ej
                 deej        e	eej        ej        f                  f         fd            Z xZS )Gemma3nTextDecoderLayerrj   r  c                 d   t                                                       || _        |j        | _        || _        |j        |         | _        t          ||          | _        t          ||          | _
        t          | j        |j                  | _        t          | j        |j                  | _        t          | j        |j                  | _        t          | j        |j                  | _        |j        | _        t$          |j                 | _        t+          |          | _        t/          |          | _        t3          j        | j        | j        d          | _        t3          j        | j        | j        d          | _        t          | j        |j                  | _        d S )N)r  r  Frl   )rG   rH   rj   rq   r  rh  attention_typere  	self_attnr  mlpr=   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormhidden_size_per_layer_inputr	   r  r  r	  altupr  laurelrI   ry   per_layer_input_gateper_layer_projectionpost_per_layer_input_normr  s      r3   rH   z Gemma3nTextDecoderLayer.__init__k  sm   !-"$0;-fi@@!&I>>>-d.>FDWXXX(6t7GVM`(a(a(a%)78HfNa)b)b)b&*89IvOb*c*c*c'+1+M(V56%f--
,V44$&Id.>@`gl$m$m$m!$&Id.NPTP`gl$m$m$m!)78HfNa)b)b)b&&&r2   r|  r8   r}  r~  NFr9   position_embeddings_globalposition_embeddings_localper_layer_inputrV  rD  output_attentions	use_cacher  rX   c                    | j                             |          }|| j        j                 }|                     |          }|                     |          }| j        j        r|}n|} | j        d|||||||	|
d|\  }}|                     |          }||z   }||z   t          j
        d          z  }|                     |          }|                     |          }|                     |          }||z   }| j                             ||          }|| j        j                                                 }| j        j        r| j                             |          }|                     |          }|                     |          }t)          j        ||          }|                     |          }|                     |          }|dd xx         |z  cc<   |f}|r||fz  }|S )N)r9   r  rV  rD  r8   r  r  r  r   r    r1   )r  r  rj   r  r  r  r  ri  r  r{   rS   r  r  r  r&  r   altup_correct_scaler*  r  r  r.   multiplyr  r  )rN   r9   r  r  r  rV  rD  r8   r  r  r  r\  r  active_predictionactive_prediction_normedlaurel_outputr  r  self_attn_weights
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictionoutputss                               r3   r^   zGemma3nTextDecoderLayer.forward  s2    j((77'(DE#'#7#78I#J#J $<== >$ 	=";"<"0$. 
#
2 3)%+/)
#
 
#
 
#
 
#
 ,,T22&-
!M1TYq\\A22;??	88I&&77AA +m ; $
 2 2;@U V V01MNTTVV;* 	S#z@@AQRR  445EFF;;'788 >*:OLL  445EFF99:JKKabb!!!%55!!!(* 	,)++Gr2   )NNNFFN)r*   r+   r,   r#   rc   rH   r   r.   re   r   r  r
   rd   r;   r/   r^   rf   rg   s   @r3   r  r  j  sn       c0 cS c c c c c c, _%0A6RRR 2637+/,1$)59C C|C %*LC $)<	C
 C !.C u/0C "%C $D>C D>C !!12C 
u|XeE,=u?P,P&QRR	SC C C SRC C C C Cr2   r  c                   \     e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZ fdZ xZS )Gemma3nPreTrainedModelrj    Tr  r8   )r9   r:   c                    t                                          |           t          |t                    r!|j        j                            d           d S t          |t                    r |j        j        	                                 d S t          |t                    r |j        j        	                                 d S d S )NrD   )rG   _init_weightsr3  r   rC   r  fill_r   r   zero_r	  r  )rN   rS  rO   s     r3   r  z$Gemma3nPreTrainedModel._init_weights  s    f%%%f=>> 	5M$$S))))) 566 	5 %++----- 011 	5',2244444	5 	5r2   )r*   r+   r,   r"   r0   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  re  _can_record_outputsr  rf   rg   s   @r3   r  r    s         &*#23#4"5N!"&0* 
5 5 5 5 5 5 5 5 5r2   r  zBThe base Gemma 3n language model without a language modeling head.c                       e Zd ZU eed<   def fdZee	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j                 dee	j
                 dee         d	ee	j                 d
ee         dee         dee         dee	j
                 dee         defd                        Zde	j
        de	j        fdZ	 dd	e	j        dee	j                 de	j        fdZ xZS )Gemma3nTextModelrj   c                 z    t                                                     j         _        j         _        t          j        j         j         j        j        dz             _        t          j
        fdt          j                  D                        _        t          j        j                   _        t#                     _        d _        t)          j                  j        _        ddi_        t#                     _        j         _        j         _        t          j        j        j        z   j        j        dz             _        t          j         j        j        j        z  d	           _        t          j        j                   _        t          j
         fd
t          d j        j                   D                        _!        t          j
         fdt          d j        j                   D                        _"         #                    dtI          j%         j        dz            d            #                    dtI          j&        tI          j%        d                    d            '                                 d S )N      ?)r  c                 0    g | ]}t          |          S r1   )r  )r  r  rj   s     r3   r  z-Gemma3nTextModel.__init__.<locals>.<listcomp>  s$    iiiI$VY77iiir2   r  rj   Fr0  r1  rl   c                 R    g | ]#}t          j        j        j        d           $S Frl   rI   ry   rq   r  r   rN   s     r3   r  z-Gemma3nTextModel.__init__.<locals>.<listcomp>  0    www1RYt')9FFFwwwr2   r    c                 R    g | ]#}t          j        j        j        d           $S r  r  r  s     r3   r  z-Gemma3nTextModel.__init__.<locals>.<listcomp>  r  r2   per_layer_projection_scaler   rE   per_layer_input_scaleg       @)(rG   rH   pad_token_idr  
vocab_sizer  rq   rj   embed_tokensrI   r  r%  rt  layersr=   r  rT  r,  
rotary_embgradient_checkpointingcopydeepcopyrope_local_base_freq
rope_thetar/  rotary_emb_localr  vocab_size_per_layer_inputembed_tokens_per_layerry   per_layer_model_projectionper_layer_projection_normr  altup_projectionsaltup_unembed_projectionsrL   r.   rM   r.  	post_initr  s   ``r3   rH   zGemma3nTextModel.__init__  s      !. + ;v143CQUQ\QhjmQm
 
 
 miiiivOgIhIhiii
 
 #6#56;NOOO	4FCCC&+#
 v&&"7*I6 :& I I I!-+1+M(&D-$v'II:C?	'
 '
 '
# +-)$v'II+
 +
 +
' *88Z`f`s)t)t)t&!#wwwwPUVWY]YdYuPvPvwww"
 "
 *,wwwwPUVWY]YdYuPvPvwww*
 *
& 	95<HXZ^H^;_;_lqrrr4ek%,sBSBS6T6Tafggg 	r2   Nr  per_layer_inputsrV  rD  r8   inputs_embedsr  r  output_hidden_statesr  r\  rX   c                 	   ||n| j         j        }|	|	n| j         j        }	||n| j         j        }|du |duz  rt	          d          | j        r%| j        r|rt                              d           d}|*| 	                    |          }| 
                    |          }|                     ||          }|r|| j        st          | j                   }|
B||                                nd}t          j        |||j        d         z   |j                  }
||
                    d          }t'          |x}t(                    s'| j         |||
||d	}t+          di |t-          di |d
}|}|                     ||          }|                     ||          }t          j        |dz  dd          dz  }t          j        d          }|g}t7          d| j         j                  D ]} | j        |dz
           |          }|                    |j        |j                  }t          j        |dz  dd          }t          j         t          j!        ||                    |j                                      }||z  |z  }|"                    |           t          j#        |d          }|	rdnd}|rdnd}| j$        d| j         j%                 D ]W}|	r||fz  }||j&                 }|dddd|j'        ddf         } |||||f||||||
d|}|d         }|r||d         fz  }X|	r||fz  }t          j        |d         dz  dd          dz  }|d         g}t7          d| j         j                  D ]} | j(        |dz
           ||                   } |                     |j        |j                  }t          j        |dz  dd          }t          j         t          j!        ||                    |j                                      }||z  |z  }|"                    |           t          j#        |          }t          j        |d          }| )                    |          }tU          ||||          S )z
        per_layer_inputs (torch.Tensor, *optional*, defaults to None):
            Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
        N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr  r   r    r   )rj   input_embedsrV  r  r8   rD  )full_attentionrg  r   rQ   Tr)  r  gh㈵>r  r   r1   )rV  rD  r8   r  r  r  )last_hidden_stater8   r9   r:   )+rj   r  r  r  r   r  r  loggerwarning_oncer  get_per_layer_inputsproject_per_layer_inputsr   get_seq_lengthr.   r~   r`   r   r   r3  r4  r   r   r  r  rU   rM   r%  r  r  r   r   rS   maximumrl  stackr  rt  r  r  r  rT  r   )!rN   r  r  rV  rD  r8   r  r  r  r  r  r\  past_seen_tokenscausal_mask_mappingmask_kwargshidden_states_0r  r  target_magnitudeepsilon_tensortemp_hidden_statesrt  
altup_projcurrent_hidden_statenew_magnituder9   all_hidden_statesall_self_attnsdecoder_layerr^  r  layer_outputsaltup_unemb_projs!                                    r3   r^   zGemma3nTextModel.forward!  s   ( 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	-t";< 	[YZZZ& 	4= 	Y 	j   I  --i88M#88CC88HXYY 	?00*$+>>>O!CRC^==???de"\  =#6q#99$+  N )33A66L ?-FF 	 + -"0"0#2 , K #5"C"C{"C"C%F%U%U%U%U# # ( &*___l%S%S"$($9$9/<$X$X! !:oq&8b$OOOSVVd++-.q$+677 	< 	<A6/A6GGJ#-==7LUeUl=#m#m !J';Q'>BPTUUUM!Ju}]NDUDUVfVmDnDn'o'oppM#7:J#J]#Z %%&:;;;;$6A>>> #7@BBD0:d![)H4;+H)HI 	6 	6M# 6!m%55!-m.JKK.qqq!!!]5Laaa/OPO)M*)	
  +) /"3#-   M *!,M  6=#3"55   	2-!11 !:mA&6!&;TRRRVYY+A./q$+677 	< 	<A-RT-KAPQE-RS`abSc-d-d#3#6#6_=R[k[r#6#s#s !J';Q'>BPTUUUM!Ju}]NDUDUVfVmDnDn'o'oppM#7:J#J]#Z %%&:;;;;$677
=a888		-00&+++%	
 
 
 	
r2   c                 r     |                      |          j        g |j        | j        j        | j        R  S rZ   )r  r   r`   rj   rt  r  )rN   r  s     r3   r  z%Gemma3nTextModel.get_per_layer_inputs  sP    =t**955= 
_
K)
 ,
 
 
 	
r2   c                    |                      |          }|| j                            |j        |j                  z  } |j        g |j        d d         | j        j        | j	        R  }| 
                    |          }||S |j        |j        k    r|dd | j        j        d d f         }||z   | j                            |j        |j                  z  S )Nr  rQ   .)r  r  r   r   r   r   r`   rj   rt  r  r  r  )rN   r  r  r  s       r3   r  z)Gemma3nTextModel.project_per_layer_inputs  s(   
 .2-L-L]-[-[ ? B B%.B.I !C !
 !
 	
  <3;  
 "% 
K) 
 , 
  
  

  $==>RSS#''%)9)???/5Tt{7T5TVWVWVW0WX$'774;U;X;X%.B.I <Y <
 <
 
 	
r2   )
NNNNNNNNNNrZ   )r*   r+   r,   r#   r0   rH   r   r   r   r.   r  re   r
   r/   rd   r   r   r   r^   r  r  rf   rg   s   @r3   r  r    s        70 7 7 7 7 7 7r  15371537+/59$(,0/359T
 T
E,-T
 #5<0T
 !.	T

 u/0T
 "%T
   12T
 D>T
 $D>T
 'tnT
 !!12T
 +,T
 
!T
 T
 T
 ^ T
l
e.> 
5< 
 
 
 
 48
 
|
 #5<0
 
	
 
 
 
 
 
 
 
r2   r  z?The base Gemma 3n language model with a language modeling head.c                       e Zd ZU dgZddiZddgdgfiZeed<   dZddiZ	def fd	Z
ee	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee         deej                 deej                 dee         dee         dee         deej                 deeej        f         defd                        Z xZS )Gemma3nForCausalLMlm_head.weightlm_headcolwise_repr9   r7   rj   modelzmodel.language_modelc                     t                                          |           t          |          | _        |j        | _        t          j        |j        |j        d          | _        | 	                                 d S r  )
rG   rH   r  r	  r  rI   ry   rq   r  r  r  s     r3   rH   zGemma3nForCausalLM.__init__  sj       %f--
 +y!3V5FUSSS 	r2   Nr   r  rV  rD  r8   r  labelsr  r  r  r  logits_to_keeprX   c                    ||n| j         j        }|	|	n| j         j        }	 | j        d||||||||	|
d	|}|j        }t          |t                    rt          | d          n|}|                     |dd|ddf                   }| j         j	        2|| j         j	        z  }t          j        |          }|| j         j	        z  }d}| | j        ||| j        fi |}t          |||j        |j        |j                  S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3nForCausalLM

        >>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```N)	r  rV  rD  r8   r  r  r  r  r  )r6   r7   r8   r9   r:   r1   )rj   r  r  r	  r  r3  rc   slicer  final_logit_softcappingr.   r   loss_functionr  r   r8   r9   r:   )rN   r  rV  rD  r8   r  r  r  r  r  r  r  r\  r  r9   slice_indicesr7   r6   s                     r3   r^   zGemma3nForCausalLM.forward  sb   F 2C1N--TXT_Tq$8$D  $+Jj 	 ,64: ,
)%+'/!5),
 ,
 ,
 ,
  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA;.:dkAAFZ''FdkAAF%4%ffdoPPPPD%#3!/)
 
 
 	
r2   )NNNNNNNNNNr   )r*   r+   r,   _tied_weights_keys_tp_plan_pp_planr#   r0   r  _checkpoint_conversion_mappingrH   r   r   r   r.   r  re   r
   r/   rd   r   rc   r   r^   rf   rg   s   @r3   r  r    s        *+=)H_-z:;H&<g%F"0        151537+/59-1$(,0/35934F
 F
E,-F
 !.F
 u/0	F

 "%F
   12F
 )*F
 D>F
 $D>F
 'tnF
 !!12F
 c5</0F
 
 F
 F
 F
 ^ F
 F
 F
 F
 F
r2   r  c                        e Zd ZdZdeeef         def fdZ	 	 d
de	e
j                 de	e
j                 de
j        fd	Z xZS )Gemma3nMultimodalEmbedderzQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        t          j
        | j        | j                  | _        t          | j        | j                  | _        t          | j        | j                  | _        t          j        | j        | j        d          | _        t          | j        | j        d          | _        d S )Nr  Frl   )r@   rA   )rG   rH   rq   multimodal_hidden_sizer  r@   vocab_offsetr  text_hidden_sizerI   	Embedding	embeddingr=   hard_embedding_normsoft_embedding_normry   embedding_projectionembedding_post_projection_norm)rN   r  r  rO   s      r3   rH   z"Gemma3nMultimodalEmbedder.__init__<  s    
 	&7&C#$1-:+6 + 7dot7RSS#1$2MSWS[#\#\#\ #1$2MSWS[#\#\#\ $&Id.I4K`gl$m$m$m!.<T=RX\X`mr.s.s.s+++r2   Nr  r  rX   c                    |du |duz  rt          d          ||                     |          }n2|                     || j        z
            }|                     |          }|                     |          }|                     |          S )a  Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        Nr  )r   r!  r  r  r   r"  r#  )rN   r  r  emb_normhard_embemb_norm_projs         r3   r^   z!Gemma3nMultimodalEmbedder.forwardO  s     -t";< 	[YZZZ$//>>HH~~i$2C&CDDH//99H11(;;22=AAAr2   r  )r*   r+   r,   r-   r   r!   r$   r#   rH   r   r.   r  re   r^   rf   rg   s   @r3   r  r  9  s        [[t !35H!HIt 't t t t t t* 1504B BE,-B  -B 
	B B B B B B B Br2   r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c                        e Zd Zi ZdZdef fdZd Zd Zd Z	d Z
dej        d	ej        fd
Z	 	 	 	 ddeej                 deej                 deej                 deej                 fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 dee         deej                 deej                 deej                 deej                 dee         dee         dee         d	efd            Zdej        dej        d	eej        ej        f         fdZ xZS ) Gemma3nModelFrj   c                 ,   t                                          |           t          j        |j                  | _        |j        j        | _        t          j        |j                  }|| _        | j	        j
        | j	        j
        nd| _
        |j        j        | _        t          j        |j                  | _        t          |j        |j                  | _        t          |j        |j                  | _        |                                  d S )Nr  rQ   )rG   rH   r   from_configvision_configvision_towerr  r  language_modelrj   r  r  audio_configaudio_towerr  embed_visionembed_audior  )rN   rj   r.  rO   s      r3   rH   zGemma3nModel.__init__v  s       %19MNNN ,7".f6HIII,8<8P8\DK44bd*0*<*W'$01DEE5f6JFL^__4V5H&J\]]r2   c                 4    | j                                         S rZ   )r.  get_input_embeddingsra   s    r3   r4  z!Gemma3nModel.get_input_embeddings  s    "77999r2   c                 :    | j                             |           d S rZ   )r.  set_input_embeddingsrN   ra  s     r3   r6  z!Gemma3nModel.set_input_embeddings  s    0077777r2   c                     || _         d S rZ   r.  rN   decoders     r3   set_decoderzGemma3nModel.set_decoder  s    %r2   c                     | j         S rZ   r9  ra   s    r3   get_decoderzGemma3nModel.get_decoder  s    ""r2   pixel_valuesrX   c                 6   |                      |dd          j        }|                    |j        d         | j        j        j        | j        j                                      ddd          }|| j        j        j        dz  z  }| 	                    |          S )	a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        FT)r?  
do_poolingreturn_dictr   r   r    r  r  )
r-  r  r   r`   rj   r,  rq   vision_soft_tokens_per_imager   r1  )rN   r?  vision_outputss      r3   get_image_featureszGemma3nModel.get_image_features  s     **%%T + 
 

 	
 (// #K%1K4
 
 '!Q

	 	 	$+3?DD  ~ >>>r2   Nr  r  image_featuresaudio_featuresc                    || |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }| |                                  t          j        | j        j        t          j        |j                            k                        d          }n || j        j        k    }|| j        j        k    }|	                                }|
                    d                              |                              |j                  }|^||                                         |                                k    r.t          d| d|j        d         |j        d         z             |	                                }|
                    d                              |                              |j                  }|^||                                         |                                k    r.t          d| d|j        d         |j        d         z             ||fS )	z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr  rQ   z6Image features and image tokens do not match: tokens: z, features r   r    z6Audio features and image tokens do not match: tokens: )r4  r.   rM   rj   image_token_idlongr   allaudio_token_idr+  r   	expand_asr   numelr   r`   )	rN   r  r  rG  rH  special_image_maskspecial_audio_maskn_image_tokensn_audio_tokenss	            r3   get_placeholder_maskz!Gemma3nModel.get_placeholder_mask  s    !.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;.4,,..L!;5:VcVjkkk  c"gg  "+dk.H!H!*dk.H!H+//11/99"==GGVVYYZgZnoo%-8J*K*Q*Q*S*SWeWkWkWmWm*m*m X  X  Xdrdxyzd{  M  S  TU  V  eV  X  X   ,//11/99"==GGVVYYZgZnoo%-8J*K*Q*Q*S*SWeWkWkWmWm*m*m X  X  Xdrdxyzd{  M  S  TU  V  eV  X  X   "#555r2   input_featuresrV  input_features_maskrD  r8   token_type_idsr  r  r  r  r  c                 8   |du |
duz  rt          d          ||n| j        j        }||n| j        j        }| |                                 |          }
t          j        |dk    || j        k               }t          j        ||t          j	        |                    }| j
                            |          }t          j        || j        j        k    || j        j        k               }| j        j        | j        j        z   dz
  }t          j        |||                              |
j                  }|                     |          }|                    d                              |
          }t          j        |||
          }
|| j        j        k    }| j        j        | j        j        z   dz
  }t          j        |||                              |
j                  }|                     |          }|                    d                              |
          }t          j        |||
          }
nd}|f|                     |          }|                    |
j        |
j                  }|                     ||
|          \  }}|
                    ||          }
|&|#|                     ||           \  } }t          j        | j        dz
  ggt
          j        | j                  }!|                     |!          }"t          j        |                    d          |"|           } | j        \  }#}$}%| j        j        |$z
  }&|"                    |#|&|%          }'t          j        | |'fd	          } |                     |
j        |
j                  } |                     ||
| 
          \  }}(|
                    |(|           }
 | j
        dd|||||
|||d|	d|})t?          |)j         |r|)j!        nd|)j"        |)j#        ||nd|| nd          S )aj  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

        >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```
        Nr  r   r    )r  rQ   )r  rG  r  r   )r  rH  T)r  r  rV  rD  r8   r  r  r  r  rB  r  )r  r8   r9   r:   r(   r)   r1   )$r   rj   r  r  r4  r.   r   r  r   
zeros_liker.  r  r1  r  r2  r  r   r   r   rN  rF  r   rT  masked_scatterget_audio_featuresrM   rK  r`   audio_soft_tokens_per_imager  r   r'   r  r8   r9   r:   )*rN   r  r?  rU  rV  rV  rD  r8   rW  r  r  r  r  r  r  	lm_kwargsper_layer_inputs_maskper_layer_inputs_tokensr  vision_maskdummy_vision_token_idvision_input_idsvision_embedsexpanded_vision_mask
audio_maskdummy_audio_token_idaudio_input_idsaudio_embedsexpanded_audio_maskrG  rP  r   rH  audio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresrQ  r  s*                                             r3   r^   zGemma3nModel.forward  s   ^ -t";< 	[YZZZ1B1N--TXT_Tq$8$D  $+Jj 	  7D5577	BBM %*$5i1niRVRqFq$r$r!&+k2GTYTdenToTo&p&p##2GGH_``  +T.;;YIYIf=f K %)$5$BTEVEa$ade$e!${;	CXYY\\]j]qrr --8H-IIM#.#8#8#<#<#F#F}#U#U !K(<m][[M #d&6&CCJ#'#3#@4CSC^#^ab#b #k*iAUVVYYZgZnooO++o+FFL","6"6r":":"D"D]"S"S!K(;\=YYMM# #!44\BBN+..}/C]EXYYN$($=$=~ %> % %! *889K^\\M %*=*I)-)@)@ReQe)f)f&NJ "'!0C/D.EUZ`n`u!v!v!v!%!1!1<N!1!O!O"[)=)=b)A)ACUWeffN?M?S<m_#';#J]#Z %7%>%>?OQegv%w%w""Y8N'OUVWWWN+..}/C]EXYYN$($=$=~ %> % %!A! *889K^\\M%$% 
-)%+'/!5)
 
 
 
 *%77@JG33d!/)2>2JPT2@2LRV
 
 
 	
r2   c                 d    |                      ||          \  }}|                     |          |fS )a-  
        Projects the last hidden state from the audio encoder into language model space.

        Args:
            input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
               The tensors corresponding to the input audio.
            input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
               The attention mask for the input audio.

        Returns:
            audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_images, audio_length, embed_dim)`).
        rC  )r0  r2  )rN   rU  rV  audio_outputsre  s        r3   r[  zGemma3nModel.get_audio_features`  s;     %)$4$4^EX$Y$Y!zm<<jHHr2   )NNNN)NNNNNNNNNNNNNN)r*   r+   r,   r  accepts_loss_kwargsr"   rH   r4  r6  r<  r>  r.   re   rF  r   r  r/   rT  r   r
   rd   r5   r^   r;   r[  rf   rg   s   @r3   r)  r)  k  s        &("}      : : :8 8 8& & &# # #?u| ? ? ? ? ?6 15596:6:(6 (6E,-(6   12(6 !!23	(6
 !!23(6 (6 (6 (6T  15486:156:37+/595959-1$(,0/3I
 I
E,-I
 u01I
 !!23	I

 !.I
 &el3I
 u/0I
 "%I
 !!12I
 !!12I
   12I
 )*I
 D>I
 $D>I
 'tnI
" 
'#I
 I
 I
 I
VI#lIAFI	u|U\)	*I I I I I I I Ir2   r)  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c            #       z    e Zd Zi ZdgZdZdef fdZd Zd Z	d Z
d Zd	 Zed
             Zed             Zed             Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#deej                 deej                 deej                 deej                 deej                 deej                 dee         deej                 deej                 deej                 deej                 dee         dee         dee         deeej        f         def d                        Z	 	 	 	 	 	 	 	 	 	 	 	 d$ fd!	Zed"             Z xZ S )%Gemma3nForConditionalGenerationr  r	  rj   c                     t                                          |           t          |          | _        t	          j        |j        j        |j        j        d          | _	        | 
                                 d S r  )rG   rH   r)  r	  rI   ry   r  rq   r  r  r  r  s     r3   rH   z(Gemma3nForConditionalGeneration.__init__~  se       !&))
y!3!?ASA^ejkkkr2   c                 4    | j                                         S rZ   )r	  r4  ra   s    r3   r4  z4Gemma3nForConditionalGeneration.get_input_embeddings  s    z..000r2   c                 :    | j                             |           d S rZ   )r	  r6  r7  s     r3   r6  z4Gemma3nForConditionalGeneration.set_input_embeddings  s    
''.....r2   c                 :    | j                             |           d S rZ   )r	  r<  r:  s     r3   r<  z+Gemma3nForConditionalGeneration.set_decoder  s    
w'''''r2   c                 4    | j                                         S rZ   )r	  r>  ra   s    r3   r>  z+Gemma3nForConditionalGeneration.get_decoder  s    z%%'''r2   c                 6    | j                             |          S rZ   )r	  rF  )rN   r?  s     r3   rF  z2Gemma3nForConditionalGeneration.get_image_features  s    z,,\:::r2   c                     | j         j        S rZ   )r	  r.  ra   s    r3   r.  z.Gemma3nForConditionalGeneration.language_model  s    z((r2   c                     | j         j        S rZ   )r	  r-  ra   s    r3   r-  z,Gemma3nForConditionalGeneration.vision_tower  s    z&&r2   c                      t          d          )Nz2Use embed_vision instead of multi_modal_projector.)AttributeErrorra   s    r3   multi_modal_projectorz5Gemma3nForConditionalGeneration.multi_modal_projector  s    QRRRr2   Nr   r  r?  rU  rV  rV  rD  r8   rW  r  r  r  r  r  r  r  rX   c                    ||n| j         j        }||n| j         j        } | j        d	|||||||||	|
||||dd|}|j        }t          |t                    rt          | d          n|}|                     |dd|ddf                   }| j         	                                j
        x}||z  }t          j        |          }||z  }d}|i|                                }|dddddf         }|dddf         }||dd|j        d          df                             |j                  }||                    |j                  dk                                             }||                    |j                  dk                                             }n(|                                }|                                }t%          j                    }|                    d| j         j        j                  }|                    d                              |j                  } |||          }t/          |||j        |j        |j        |j        |j                  S )
al  
        input_features_mask (torch.Tensor, *optional*, defaults to None):
            The attention mask for the input audio.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenizer=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        NT)r  r?  rU  rV  rV  rD  r8   rW  r  r  r  r  r  r  rB  .rQ   r    r   )r6   r7   r8   r9   r:   r(   r)   r1   )rj   r  r  r	  r  r3  rc   r  r  get_text_configr  r.   r   r[   r`   r   r   r   rI   CrossEntropyLossr   r  r  r5   r8   r9   r:   r(   r)   )rN   r  r?  rU  rV  rV  rD  r8   rW  r  r  r  r  r  r  r  r]  r  r9   r  r7   r  r6   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                                r3   r^   z'Gemma3nForConditionalGeneration.forward  s   H 2C1N--TXT_Tq$8$D  $+Jj 	 $* 
%)) 3%+))'/!5
 
  !
 
&  18B>SV8W8Wk~ot444]kmAAA}aaa,?@AA'+{'B'B'D'D'\\#i55FZ''F55F\\^^F!#ssAAA+.L!#qrr'?L) (6aaa,:LQ:O9O9Q9Q6Q'R'U'UV\Vc'd'd$+,@,C,CFM,R,RVW,WXccee+,@,C,CLDW,X,X\],]^iikk+6688+6688*,,H&++B0G0RSSK&++B//22<3FGGK8K55D,#3!/) ' ; ' ;
 
 
 	
r2   Tc                      t                      j        |f||||||||
d|}|d         dk    r||d<   ||d<   |	|d<   |S )N)r8   r  rV  rD  r  r  r  rW  r   r?  rU  rV  )rG   prepare_inputs_for_generation)rN   r  r8   r  r  rD  r?  rU  rV  rV  rW  r  r  r  r\  model_inputsrO   s                   r3   r  z=Gemma3nForConditionalGeneration.prepare_inputs_for_generation%	  s    $ =uww<
+')%)))
 
 
 
  !!!+7L(-;L)*2EL./r2   c                     | j         j        S rZ   )r	  r0  ra   s    r3   r0  z+Gemma3nForConditionalGeneration.audio_towerN	  s    z%%r2   )NNNNNNNNNNNNNNr   )NNNNNNNNNTNN)!r*   r+   r,   r  r  r  r"   rH   r4  r6  r<  r>  rF  propertyr.  r-  r  r   r   r   r.   r  r/   re   r
   rd   r   rc   r5   r^   r  r0  rf   rg   s   @r3   ru  ru  s  s        &("*+}      1 1 1/ / /( ( (( ( (; ; ; ) ) X) ' ' X' S S XS  15486:156:37+/595959-1$(,0/334!A
 A
E,-A
 u01A
 !!23	A

 !.A
 &el3A
 u/0A
 "%A
 !!12A
 !!12A
   12A
 )*A
 D>A
 $D>A
 'tnA
  c5</0!A
$ 
'%A
 A
 A
 ^ A
L  ' ' ' ' ' 'R & & X& & & & &r2   ru  )r  r  ru  r)  r  r  )r   NN)Nr    )`r  r{   collections.abcr   r   dataclassesr   typingr   r   r.   torch.nnrI   torch.nn.functionalr   rb  r  r	   cache_utilsr
   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr   autor   configuration_gemma3nr!   r"   r#   r$   
get_loggerr*   r  r'   r5   Moduler=   ri   r   r   rC  ri  r  r  r  r  r  r  r  r  r  r	  r,  rM  re   rc   rR  r[   r;   r`  rc  re  r  r  r  r  r  r)  ru  __all__r1   r2   r3   <module>r     s:	  ,   . . . . . . . . ! ! ! ! ! ! " " " " " " " "                 ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) R R R R R R R R B B B B B B 9 9 9 9 9 9 O O O O O O O O K K K K K K K K F F F F F F F F & & & & & & _ _ _ _ _ _ _ _ _ _ _ _ _ _ 0 0 0 0 0 0       l l l l l l l l l l l l 
	H	%	%   
< < < < <!8 < <  <(   
< < < < <K < <  <:= = = = =RY = = =6g) g) g) g) g)BI g) g) g)T] ] ] ] ]BI ] ] ]@j, j, j, j, j,bi j, j, j,ZB7 B7 B7 B7 B7	 B7 B7 B7JF F F F F") F F FRO O O O ORY O O O8D D D D Dry D D D2( ( ( ( (ry ( ( (V       6G- G- G- G- G-/ G- G- G-T
S 
S 
S 
S 
SR\ 
S 
S 
S; ; ; ; ;RY ; ; ;$#5 #5 #5 #5 #5RY #5 #5 #5L^' ^' ^' ^' ^'ry ^' ^' ^'B!< !< !< !< !< !< !< !<H( ( (	UU\ 	U# 	U%, 	U 	U 	U 	U$ ## %  %I %< % 
 % <	 %
 U\* %  % e_ % e_ % 5<%& %  %  %  %N ,0. .|.	. 
. 5<(	.
 . . . .<t) t) t) t) t)29 t) t) t)n[ [ [ [ [8 [ [ [| 5 5 5 5 5_ 5 5 56 abbbt
 t
 t
 t
 t
- t
 t
 cbt
n ^___Y
 Y
 Y
 Y
 Y
/ Y
 Y
 `_Y
x/B /B /B /B /B	 /B /B /Bd   I I I I I) I I ID   W& W& W& W& W&&<o W& W& W&t  r2   