
     `i                        d dl mZmZmZ d dlZd dlmZ d dlmZm	Z	 ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;  e.j<        e=          Z> G d de          Z? G d dej@                  ZA G d dej@                  ZB G d de2          ZC G d d e3          ZD G d! d"e6          ZE G d# d$e          ZFe, G d% d&e'                      ZG G d' d(eG          ZH G d) d*e7          ZI G d+ d,e:          ZJ e,d-.           G d/ d0eGe                      ZKg d1ZLdS )2    )CallableOptionalUnionN)OutputRecordercheck_model_inputs   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)PretrainedConfig)GenerationMixin)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)rope_config_validation)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg   )GlmAttentionGlmRotaryEmbeddingapply_rotary_pos_emb)LlamaDecoderLayer
LlamaModeleager_attention_forward)WhisperModelshift_tokens_rightc                   j     e Zd ZdZdZdgZddddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )MoonshineConfiga"  
    This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Moonshine
    [UsefulSensors/moonshine-tiny](https://huggingface.co/UsefulSensors/moonshine-tiny).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Moonshine model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MoonshineModel`].
        hidden_size (`int`, *optional*, defaults to 288):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        encoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        decoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer decoder.
        encoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        decoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `decoder_num_attention_heads`.
        pad_head_dim_to_multiple_of (`int`, *optional*):
            Pad head dimension in encoder and decoder to the next multiple of this value. Necessary for using certain
            optimized attention implementations.
        encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        decoder_start_token_id (`int`, *optional*, defaults to 1):
            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
            are provided to the `generate` function. It is used to guide the model`s generation process depending on
            the task.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.9):
            Percentage of the query and keys which will have rotary embedding.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        bos_token_id (`int`, *optional*, defaults to 1):
            Denotes beginning of sequences token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            Denotes end of sequences token id.

    Example:

    ```python
    >>> from transformers import MoonshineModel, MoonshineConfig

    >>> # Initializing a Moonshine style configuration
    >>> configuration = MoonshineConfig().from_pretrained("UsefulSensors/moonshine-tiny")

    >>> # Initializing a model from the configuration
    >>> model = MoonshineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```	moonshinepast_key_valuesencoder_num_key_value_headsencoder_num_attention_headsencoder_num_hidden_layers)num_key_value_headsnum_attention_headsnum_hidden_layers              Ngelusilu   {Gz?   T     @?F        r"   c                    || _         || _        || _        || _        || _        || _        || _        ||}|| _        |	|}	|	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        t-          |             t/                      j        d||||d| d S )N)bos_token_ideos_token_idis_encoder_decoderdecoder_start_token_id )
vocab_sizehidden_sizeintermediate_sizer1   decoder_num_hidden_layersr0   decoder_num_attention_headsr/   decoder_num_key_value_headspad_head_dim_to_multiple_ofencoder_hidden_actdecoder_hidden_actmax_position_embeddingsinitializer_rangerF   	use_cache
rope_thetarope_scalingpartial_rotary_factorrE   attention_biasattention_dropoutr   super__init__)selfrH   rI   rJ   r1   rK   r0   rL   r/   rM   rN   rO   rP   rQ   rR   rF   rS   rT   rU   rV   rE   rW   rX   rC   rD   kwargs	__class__s                             /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/moonshine/modular_moonshine.pyrZ   zMoonshineConfig.__init__   s   8 %&!2)B&)B&+F(+F(&.*E'+F(&.*E'+F(+F("4"4'>$!2&<#"$(%:""4,!2 	t$$$ 	
%%1#9		
 	

 	
 	
 	
 	
 	
    )r5   r6   r7   r8   r8   r9   r9   NNNr:   r;   r<   r=   r>   Tr?   Nr@   TFrA   r>   r"   )	__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_maprZ   __classcell__r]   s   @r^   r,   r,   2   s        { {z J#4"5<<8 M "#"#$%$%$($($(!! # !3D
 D
 D
 D
 D
 D
 D
 D
 D
 D
r_   r,   c                   B     e Zd Z fdZdej        dej        fdZ xZS )MoonshineEncoderMLPc                 
   t                                                       || _        t          |         | _        t          j        |j        |j                  | _	        t          j        |j        |j                  | _
        d S NrY   rZ   configr	   activation_fnnnLinearrI   rJ   fc1fc2r[   rn   
hidden_actr]   s      r^   rZ   zMoonshineEncoderMLP.__init__   sc    #J/9V/1IJJ9V5v7IJJr_   hidden_statesreturnc                     |                      |          }|                     |          }|                     |          }|S rl   )rr   ro   rs   )r[   rv   s     r^   forwardzMoonshineEncoderMLP.forward  s=    //**=99//r_   r`   ra   rb   rZ   torchTensorry   rg   rh   s   @r^   rj   rj      sc        K K K K KU\ el        r_   rj   c                   B     e Zd Z fdZdej        dej        fdZ xZS )MoonshineDecoderMLPc                    t                                                       || _        t          |         | _        t          j        |j        |j        dz            | _	        t          j        |j        |j                  | _
        d S )Nr"   rm   rt   s      r^   rZ   zMoonshineDecoderMLP.__init__  sh    #J/9V/1IA1MNN9V5v7IJJr_   rv   rw   c                     |                      |          }|                    dd          \  }}|                     |          |z  }|                     |          }|S )Nr"   )dim)rr   chunkro   rs   )r[   rv   gates      r^   ry   zMoonshineDecoderMLP.forward  s_    //+11!1<<t**400=@//r_   rz   rh   s   @r^   r~   r~     sc        K K K K KU\ el        r_   r~   c                   x    e Zd Zdededededef
 fdZ eddd	
          	 	 	 	 	 ddej	        de
eej	        ej	        f                  de
ej	                 de
e         de
ej                 de
ej	                 dee         deej	        e
ej	                 e
eej	                          f         fd            Z xZS )MoonshineAttentionrn   	layer_idx	is_causalr3   r2   c                 V   |                     ||d           t                                          ||           || _        t	          |d|j        |j        z            | _        | j        j	        0| j        j	        }|| j        |z   dz
  |z  z  }|| j        z
  | _
        d S d| _
        d S )N)r3   r2   head_dimr>   r   )updaterY   rZ   r   getattrrI   r3   r   rn   rN   head_dim_padding)	r[   rn   r   r   r3   r2   target_multipletarget_head_dimr]   s	           r^   rZ   zMoonshineAttention.__init__  s     	.AZmnnooo+++"
F4F&Jd4dee ;2>"kEO-$-/2QTU2UZi1ijO$3dm$CD!!!$%D!!!r_   past_key_valuer.   4.58new_nameversionNrv   position_embeddingsattention_maskcache_positionkey_value_statesr\   rw   c                 v   |j         d d         \  }}	|                     |                              ||	| j        j        | j                                      dd          }
|d u}|?|j                            | j	                  }|rd|j        | j	        <   |j
        }n|j        }||n|}|r3|r1|r/|j        | j	                 j        }|j        | j	                 j        }n|                     |                              |d| j        j        | j                                      dd          }|                     |                              |d| j        j        | j                                      dd          }|r$|"|                    ||| j	        d|i          \  }}|sB|\  }}t%          |
|||          \  }
}|&|||d}|                    ||| j	        |          \  }}t&          }| j        j        dk    rt*          | j        j                 }| j        o	|d u o|	dk    }| j        dk    rt0          j        j                            |
d| j        f          }
t0          j        j                            |d| j        f          }t0          j        j                            |d| j        f          } || |
|||f| j        sd	n| j        | j        |d
|\  }}| j        dk    r|dd | j         f         }|                    ||	d                                           }| !                    |          }||fS )Nr   r>   r"   Tr   )sincosr   eagerr   rA   )dropoutscalingr   .)"shapeq_projviewrn   r2   r   	transpose
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesk_projv_projr   r%   r(   _attn_implementationr   r   r   r{   rp   
functionalpadtrainingrX   r   reshape
contiguouso_proj)r[   rv   r   r   r.   r   r   r\   bszq_lenquery_statesis_cross_attentionr   current_states
key_statesvalue_statesr   r   cache_kwargsattention_interfacer   attn_outputattn_weightss                          r^   ry   zMoonshineAttention.forward4  s    #("-
U KK&&++C8WY]Yfggqqrsuvww 	 .T9&(377GGJ! G=A*4>:"1"G"1"F .>-I))} 	/ 	j 	(/?DJ*1$.AHLL N++c2t{>NN1a  N++c2t{>NN1a 
 " o&A+:+A+Adn?OQ_>`, ,(
L " 	*HC';L*VY[^'_'_$L**'*3.YY+:+A+Adnl, ,(
L )@;+w66"9$+:Z"[NK~'=K%!)	 1$$ 8.22<!TEZA[\\L,00aAV=WXXJ 8.22<!TEZA[\\L$7$7
%
  $}HCC$2HL
%
 
%
 
%
 
%
!\  1$$%c+Cd.C-C+C&CDK!))#ub99DDFFkk+..L((r_   )NNNNN)r`   ra   rb   r,   intboolrZ   r!   r{   r|   r   tupler
   
LongTensorr   r   ry   rg   rh   s   @r^   r   r     so       && & 	&
 !& !& & & & & &* _%0A6RRR LP15+/5937U) U)|U) &eEL%,,F&GHU) !.	U)
 "%U) !!12U) #5<0U) -.U) 
u|Xel3XeEL>Q5RR	SU) U) U) SRU) U) U) U) U)r_   r   c                       e Zd ZdS )MoonshineRotaryEmbeddingN)r`   ra   rb   rG   r_   r^   r   r     s        Dr_   r   c                   (     e Zd Zdedef fdZ xZS )MoonshineEncoderLayerrn   r   c                 F   t                                          ||           t          ||d|j        |j                  | _        t          ||j                  | _        t          j
        |j        d          | _        t          j
        |j        d          | _        d S )NFrn   r   r   r3   r2   bias)rY   rZ   r   r0   r/   	self_attnrj   rO   mlprp   	LayerNormrI   input_layernormpost_attention_layernormr[   rn   r   r]   s      r^   rZ   zMoonshineEncoderLayer.__init__  s    ++++ & B & B
 
 
 'vv/HII!|F,>UKKK(*V5Ge(T(T(T%%%r_   )r`   ra   rb   r,   r   rZ   rg   rh   s   @r^   r   r     sW        U U3 U U U U U U U U U Ur_   r   c            !           e Zd Zddedee         f fdZ eddd          	 	 	 	 	 	 	 	 	 	 dd
ej	        deej	                 deej	                 deej	                 deej
                 deej
                 dee         dee         deej
                 deeej	        ej	        f                  deeej	        ej	        f                  dee         deej        eeej        ej        f                  f         fd            Z xZS )MoonshineDecoderLayerNrn   r   c                    t                                                       |j        | _        t          ||d|j        |j                  | _        t          ||d|j        |j                  | _        t          ||j	                  | _
        t          j        |j        d          | _        t          j        |j        d          | _        t          j        |j        d          | _        d S )NTr   Fr   )rY   rZ   rI   r   rL   rM   r   encoder_attnr~   rP   r   rp   r   r   r   final_layernormr   s      r^   rZ   zMoonshineDecoderLayer.__init__  s    !-+ & B & B
 
 
 / & B & B
 
 
 'vv/HII!|F,>UKKK(*V5Ge(T(T(T%!|F,>UKKKr_   r   r.   r   r   Frv   r   encoder_hidden_statesencoder_attention_maskposition_idsencoder_position_idsrS   r   r   encoder_position_embeddingsr\   rw   c                 F   |}|                      |          } | j        d||||||	|
d|\  }}||z   }|9|}|                     |          }|                     |||||          \  }}||z   }|}|                     |          }|                     |          }||z   }|S )N)rv   r   r   r.   rS   r   r   )rv   r   r   r.   rS   rG   )r   r   r   r   r   r   )r[   rv   r   r   r   r   r   r.   rS   r   r   r   r\   residual_s                  r^   ry   zMoonshineDecoderLayer.forward  s      !,,];;)4> 	
')%+) 3	
 	
 	
 	
q !=0 ,$H 99-HHM#00+!65 /#  1    M1 %}4M ,,];;// =0r_   rl   )
NNNNNNFNNN)r`   ra   rb   r,   r   r   rZ   r!   r{   r|   r   r
   r   r   r   r   FloatTensorry   rg   rh   s   @r^   r   r     s       L L L8C= L L L L L L0 _%0A6RRR 268<9=37;?+/$)59KOSW. .|. !..  (5	.
 !) 6. u/0. 'u'78. "%. D>. !!12. &eEL%,,F&GH. &.eEL%,4N.O%P. +,. 
u (51BEDU1U+V"WW	X. . . SR. . . . .r_   r   c                   P    e Zd ZU eed<   dZdZdZddgZdZ	dZ
dZdej        fdZd	S )
MoonshinePreTrainedModelrn   modelinput_valuesTr   r   input_lengthsc                     t          |dz
  dz  dz             }t          |dz
  dz  dz             }t          |dz
  dz  dz             }|S )zH
        Computes the output length of the convolutional layers
           @   r>      r   r"   )r   )r[   r   output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        r^    _get_feat_extract_output_lengthsz9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sc     "=3#6""<q"@AA!#6#:a"?!"CDD!#6#:a"?!"CDD""r_   N)r`   ra   rb   r,   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphr{   r   r   rG   r_   r^   r   r     sn         $O&*#02IJN!#e>N # # # # # #r_   r   c            
            e Zd ZdZdZeedZdef fdZ	de
j        fdZde
j        fd	Ze	 ddej        deej                 dee         defd            Z xZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r   )
attentionsrv   rn   c                 `   t                                                     | _        j        }t	          j        d|ddd          | _        t	          j        |d|z  dd	          | _        t	          j        d|z  |dd	          | _        t	          j	        d|d
          | _
        t                    | _        t	          j        fdt          j                  D                       | _        t	          j        |d          | _        d| _        |                                  d S )Nr>   r   r   F)kernel_sizestrider   r"   r   r   )r   r   gh㈵>)
num_groupsnum_channelsepsrn   c                 0    g | ]}t          |          S rG   )r   .0idxrn   s     r^   
<listcomp>z-MoonshineEncoder.__init__.<locals>.<listcomp>  $    cccC"63//cccr_   r   )rY   rZ   rn   rI   rp   Conv1dconv1conv2conv3	GroupNorm	groupnormr   
rotary_emb
ModuleListranger1   r   r   
layer_normgradient_checkpointing	post_init)r[   rn   	embed_dimr]   s    ` r^   rZ   zMoonshineEncoder.__init__  s      &	Yq)ReTTT
Yy!i-QqQQQ
Yq9}iQqQQQ
PTUUU2&AAAmcccc5Aa;b;bccc
 
 ,yu===&+#r_   rw   c                     | j         S rl   r  r[   s    r^   get_input_embeddingsz%MoonshineEncoder.get_input_embeddings%  s
    zr_   valuec                     || _         d S rl   r  )r[   r  s     r^   set_input_embeddingsz%MoonshineEncoder.set_input_embeddings(  s    


r_   Nr   r\   c                    |                     d          }t          j                            |                     |                    }|                     |          }t          j                            |                     |                    }t          j                            |                     |                    }|	                    ddd          }|| 
                    |j        d                   }d}|ddd|f         dd|f         }| j        j        dk    r|d	k                                    r|nd}n;| j        j        d
k    rt          ||j                  }nt#          ||j                  }t%          j        d|j        d         |j                                       d          }|                     ||          }| j        D ]}	 |	|f|||d|}|                     |          }t1          |          S )a.  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        r>   r   r"   Nr     .flash_attention_2rA   sdpadevice)r   r   r   )last_hidden_state)	unsqueezerp   r   tanhr  r  r:   r  r  permuter   r   rn   r   anyr   dtyper   r{   aranger  r	  r   r  r   )
r[   r   r   r\   rv   mask_lendownsample_strider   r   encoder_layers
             r^   ry   zMoonshineEncoder.forward+  s   , $--a00**4::l+C+CDD}55**4::m+D+DEE**4::m+D+DEE%--aA66 %<<^=QRT=UVVH *+C1D1D3D1D,DEc9H9nUN{/3FFF4Bc4I3N3N3P3P!ZVZ1V;;!D^UbUh!i!i!;NML_!`!`|A}':1'=mFZ[[[eefghh"oom\JJ![ 	 	M)M-)$7	 
  MM 66&+
 
 
 	
r_   rl   )r`   ra   rb   rc   r   r   r   _can_record_outputsr,   rZ   rp   Moduler  r  r   r{   r   r   r|   r   r   r   ry   rg   rh   s   @r^   r   r     s         %O(. 
      $bi    ")      268
 8
'8
 !.8
 +,	8

 
!8
 8
 8
 8
 8
 8
 8
 8
r_   r   c                       e Zd ZdZ eedd          e eedd          dZdef fdZ	e
	 	 	 	 	 	 	 	 	 ddeej                 d
eej                 deej                 dee         deej                 dee         deej                 deej                 deej                 dee         deeef         fd            Z xZS )MoonshineDecoder	input_idsr>   r   )index
layer_namer   )r   rv   cross_attentionsrn   c                     t                                                     t          j        j        d          | _        t          j        fdt          j                  D                       | _	        d S )NFr   c                 0    g | ]}t          |          S rG   )r   r   s     r^   r  z-MoonshineDecoder.__init__.<locals>.<listcomp>s  r  r_   )
rY   rZ   rp   r   rI   normr
  r  rK   r   r[   rn   r]   s    `r^   rZ   zMoonshineDecoder.__init__o  sn       L!3%@@@	mcccc5Aa;b;bccc
 
r_   Nr   r   r.   inputs_embedsrS   r   r   r   r\   rw   c
                    |du |duz  rt          d          ||                     |          }|r8|6t          t          | j                  t          | j                            }|B||                                nd}t          j        |||j        d         z   |j	                  }||
                    d          }t          | j        |||||          }|}|                     ||          }|	|j        d         }d	}|	d
dd|f         d
d|f         }	| j        j        dk    r|	dk                                    r|	nd}	nS| j        j        dk    r"t          |	|j        |j        d                   }	n!t#          |	|j        |j        d                   }	| j        D ]} ||||f|	|||||d|
}|                     |          }t)          ||r|nd          S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r>   r  )rn   input_embedsr   r   r.   r   r  .r  rA   r  )r   r   r.   rS   r   r   )r  r.   )
ValueErrorembed_tokensr   r   rn   get_seq_lengthr{   r#  r   r  r  r   r	  r   r!  r   r"  r   r   r1  r   )r[   r+  r   r   r.   r3  rS   r   r   r   r\   past_seen_tokenscausal_maskrv   r   r$  r%  decoder_layers                     r^   ry   zMoonshineDecoder.forwardv  s   0 -t";< 	[YZZZ  --i88M 	v01,dk2R2R2RT`hlhsTtTtTtuuO!CRC^==???de"\ "2]5H5K"KTaTh  N )33A66L(;&))+%
 
 
 &"oom\JJ!-,226H *%;CATATCTAT<T%UVY[d\d[dVd%e"{/3FFFDZ^aDaCfCfChCh)r)?)?nr&&1V;;)L*M,?ATUWAX* *&& *D*M,?ATUWAX* *& "[ 	 	M)M% (>) /#-$7   MM 		-008+/8BOOd
 
 
 	
r_   )	NNNNNNNNN)r`   ra   rb   r   r   r   r   r'  r,   rZ   r   r   r{   r   r|   r
   r   r   r   r   r   r   r   ry   rg   rh   s   @r^   r*  r*  g  s       !O$n%7q[YYY.*N+=QSabbb 
 
 
 
 
 
 
  151537+/59$(59=A9=W
 W
E,-W
 !.W
 u/0	W

 "%W
   12W
 D>W
 !!12W
  ((9:W
 !) 6W
 +,W
 
u--	.W
 W
 W
 W
 W
 W
 W
 W
r_   r*  c                      e Zd Zee	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 dee	e	ej                                   dee
ee	ej                 f                  dee	ej                          d	ee	ej                          d
ee         deej                 dee         defd                        ZdS )MoonshineModelNr   r   decoder_input_idsdecoder_attention_maskencoder_outputsr.   decoder_inputs_embedsdecoder_position_idsrS   r   r\   rw   c                     | | j         |fd|i|} | j        d||||j        ||||	|
d	|}t          |j        |j        |j        |j        |j        |j        |j        |j                  S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        Nr   )	r+  r   r   r   r.   r3  r   rS   r   )r  r.   decoder_hidden_statesdecoder_attentionsr.  encoder_last_hidden_stater   encoder_attentionsrG   )encoderdecoderr  r   r.   rv   r   r.  )r[   r   r   r?  r@  rA  r.   rB  rC  rS   r   r\   decoder_outputss                r^   ry   zMoonshineModel.forward  s    \ "/;t|L/r/rYg/rkq/r/rOEQT\ F
'1#1"1"C+/-)F
 F
 F
 F
 "-?+;"1"?.9,=&5&G"1"?.9	
 	
 	
 		
r_   )
NNNNNNNNNN)r`   ra   rb   r   r   r   r{   r   r   r   r   r   r   r   r   r   ry   rG   r_   r^   r>  r>    sa        59598<=AEIZ^DHBF$(59E
 E
u01E
 !!12E
 $E$45	E

 !))9 :E
 "%e.?(@"ABE
 "%(;U5CT=U(U"VWE
  (e.?(@AE
 'uU-='>?E
 D>E
 !!12E
 +,E
 
E
 E
 E
 ^ E
 E
 E
r_   r>  zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                       e Zd ZdgZdef fdZd Zd Zd Zd Z	de
j        fd	Zee	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deeeej                                   deeeeej                 f                  deeej                          deeej                          dee         deej                 deej                 dee         defd                        Z xZS )!MoonshineForConditionalGenerationzproj_out.weightrn   c                     t                                          |           t          |          | _        t	          j        |j        |j        d          | _        | 	                                 d S )NFr   )
rY   rZ   r>  r   rp   rq   rI   rH   proj_outr  r2  s     r^   rZ   z*MoonshineForConditionalGeneration.__init__$  s`       #F++
	&"4f6GeTTT 	r_   c                 4    | j                                         S rl   )r   get_encoderr  s    r^   rR  z-MoonshineForConditionalGeneration.get_encoder,      z%%'''r_   c                 4    | j                                         S rl   )r   get_decoderr  s    r^   rU  z-MoonshineForConditionalGeneration.get_decoder/  rS  r_   c                     | j         S rl   rP  r  s    r^   get_output_embeddingsz7MoonshineForConditionalGeneration.get_output_embeddings2  s
    }r_   c                     || _         d S rl   rW  )r[   new_embeddingss     r^   set_output_embeddingsz7MoonshineForConditionalGeneration.set_output_embeddings5  s    &r_   rw   c                 4    | j                                         S rl   )r   r  r  s    r^   r  z6MoonshineForConditionalGeneration.get_input_embeddings8  s    z..000r_   Nr   r   r?  r@  rA  r.   rB  rC  rS   r   labelsr\   c                 ~   |)|'|%t          || j        j        | j        j                  } | j        |f||||||||	|
d	|}|                     |j                  }d}|"|                     ||| j        j                  }t          |||j
        |j        |j        |j        |j        |j        |j        	  	        S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	r   r?  rA  r@  r.   rB  rC  rS   r   )logitsr]  rH   )	lossr_  r.   rE  rF  r.  rG  r   rH  )r*   rn   pad_token_idrF   r   rP  r  loss_functionrH   r   r.   rE  rF  r.  rG  r   rH  )r[   r   r   r?  r@  rA  r.   rB  rC  rS   r   r]  r\   outputsr_  r`  s                   r^   ry   z)MoonshineForConditionalGeneration.forward;  s   f  (-B-J$6DK4dk6X% %! '1dj'
)/+#9+"7!5)'
 '
 '
 '
 w899%%VFt{Oe%ffD#3")"?&9$5&-&G")"?&9

 

 

 
	
r_   )NNNNNNNNNNN)r`   ra   rb   _tied_weights_keysr,   rZ   rR  rU  rX  r[  rp   r(  r  r   r   r   r{   r   r   r   r   r   r   r   r   r   ry   rg   rh   s   @r^   rN  rN    s        ,,      ( ( (( ( (  ' ' '1bi 1 1 1 1  59598<=AEIZ^DHBF$(59-1T
 T
u01T
 !!12T
 $E$45	T

 !))9 :T
 "%e.?(@"ABT
 "%(;U5CT=U(U"VWT
  (e.?(@AT
 'uU-='>?T
 D>T
 !!12T
 )*T
 +,T
 
T
 T
 T
 ^ T
 T
 T
 T
 T
r_   rN  )r,   r>  r   rN  )Mtypingr   r   r   r{   torch.nnrp   transformers.utils.genericr   r   activationsr	   cache_utilsr
   r   r   configuration_utilsr   
generationr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r    utils.deprecationr!   glm.modeling_glmr#   r$   r%   llama.modeling_llamar&   r'   r(   whisper.modeling_whisperr)   r*   
get_loggerr`   loggerr,   r(  rj   r~   r   r   r   r   r   r   r*  r>  rN  __all__rG   r_   r^   <module>r|     s   - , , , , , , , , ,        I I I I I I I I ! ! ! ! ! ! C C C C C C C C C C 3 3 3 3 3 3 ) ) ) ) ) ) / / / / / / g g g g g g g g B B B B B B 9 9 9 9 9 9              : 9 9 9 9 9 F F F F F F F F & & & & & & R R R R R R R R R R R R 0 0 0 0 0 0 U U U U U U U U U U Y Y Y Y Y Y Y Y Y Y G G G G G G G G 
	H	%	%J
 J
 J
 J
 J
& J
 J
 J
Z    ")       ")    l) l) l) l) l) l) l) l)^	 	 	 	 	1 	 	 	U U U U U- U U U"H H H H H6 H H HV # # # # # # # #._
 _
 _
 _
 _
/ _
 _
 _
Dg
 g
 g
 g
 g
z g
 g
 g
TH
 H
 H
 H
 H
\ H
 H
 H
V   
p
 p
 p
 p
 p
(@/ p
 p
 
p
f  r_   