
     `i/                        d dl Z d dlmZmZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddl m!Z!  G d de          Z" G d de          Z# G d de          Z$ ed           G d de                      Z% G d dej&                  Z' ed           G d de$e                      Z(g d Z)dS )!    N)OptionalUnion)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs   )	AutoModelAutoModelForCausalLM)Qwen2AudioAttentionQwen2AudioEncoderQwen2AudioEncoderLayerQwen2AudioPreTrainedModel   )VoxtralConfigc                       e Zd ZdS )VoxtralAttentionN__name__
__module____qualname__     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/voxtral/modular_voxtral.pyr   r   '           Dr"   r   c                       e Zd ZdS )VoxtralEncoderLayerNr   r!   r"   r#   r&   r&   +   r$   r"   r&   c                   &    e Zd ZdZdZdZdZdZdZdS )VoxtralPreTrainedModelTN)r   r   r    _supports_flex_attn_supports_cache_class_supports_attention_backend_can_compile_fullgraph_no_split_modulesr!   r"   r#   r(   r(   /   s4         "&!"&r"   r(   z:
    The Voxtral encoder, which is a Whisper encoder.
    )custom_introc                   D    e Zd ZeedZe	 ddee         fd            Z	dS )VoxtralEncoder)
attentionshidden_statesNkwargsc           	         | j         j        | j        j        d         z  | j        j        d         z  }|j        d         |k    r$t          d| d|j        d          d| d          |                    | j        j        j	        | j        j        j
                  }t          j                            |                     |                    }t          j                            |                     |                    }|                    ddd	          }| j        j        }||z                       |j	                  }t          j                            || j        | j        
          }t%          | j                  D ]\  }}	 |	||d          }
|
d         }|                     |          }t+          |          S )a  
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram are ignored.
        r   z:Qwen2Audio expects the mel input features to be of length z, but found z-. Make sure to pad the input mel features to .)dtypedevicer   r   )ptrainingN)attention_masklayer_head_mask)last_hidden_state)configmax_source_positionsconv1strideconv2shape
ValueErrortoweightr7   r8   r   
functionalgelupermuteembed_positionsdropoutr:   	enumeratelayers
layer_normr
   )selfinput_featuresr;   r3   expected_seq_lengthinputs_embeds	embed_posr2   idxencoder_layerlayer_outputss              r#   forwardzVoxtralEncoder.forwardD   s   & #k>ARSTAUUX\XbXijkXll#'::: LM`  L  Ln|  oC  DF  oG  L  L  vI  L  L  L   (**1B1HQUQ[QbQi*jj**4::n+E+EFF**4::m+D+DEE%--aA66(/	&266}7JKK--mt|VZVc-dd"+DK"8"8 	- 	-C)M- $  M
 *!,MM66+
 
 
 	
r"   N)
r   r   r    r   r&   _can_record_outputsr   r   r   rW   r!   r"   r#   r0   r0   9   sd         ', 
  -
 -
 +,	-
 -
 -
 -
 -
 -
r"   r0   c                   *     e Zd Zdef fdZd Z xZS )VoxtralMultiModalProjectorr>   c                 6   t                                                       t          j        |j        j        |j        j        d          | _        t          |j
                 | _        t          j        |j        j        |j        j        d          | _        d S )NF)bias)super__init__r   Linearaudio_configintermediate_sizetext_confighidden_sizelinear_1r   projector_hidden_actactlinear_2rO   r>   	__class__s     r#   r_   z#VoxtralMultiModalProjector.__init__v   sv    	&"5"GI[Ignsttt&56	&"4"@&BTB`glmmmr"   c                     |                      |          }|                     |          }|                     |          }|S rX   )re   rg   rh   )rO   audio_featuresr2   s      r#   rW   z"VoxtralMultiModalProjector.forward|   s;    n55//m44r"   )r   r   r    r   r_   rW   __classcell__rj   s   @r#   r[   r[   u   sZ        n} n n n n n n      r"   r[   zs
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    c                       e Zd ZdgZddiZddgdgfiZdgZ fdZd Zd	 Z	d
 Z
d Zd Zd Zdej        fdZdej        fdZee	 	 	 	 	 	 	 	 	 	 d deej                 deej                 deej                 deej                 dee         deej                 deej                 dee         deej                 deeej        f         dee         defd                        Z fdZ xZ S )!VoxtralForConditionalGenerationzlm_head.weightlm_headcolwise_repr2   logitsrJ   c                 4   t                                          |           |j        j        | _        t	          j        |j                  | _        t          j        |j                  | _	        t          |          | _        |                                  d S rX   )r^   r_   rc   
vocab_sizer   from_configra   audio_towerr   language_modelr[   multi_modal_projector	post_initri   s     r#   r_   z(VoxtralForConditionalGeneration.__init__   s|        ,7$01DEE2>v?QRR%?%G%G" 	r"   c                 4    | j                                         S rX   )rx   get_input_embeddingsrO   s    r#   r|   z4VoxtralForConditionalGeneration.get_input_embeddings   s    "77999r"   c                 :    | j                             |           d S rX   )rx   set_input_embeddings)rO   values     r#   r   z4VoxtralForConditionalGeneration.set_input_embeddings   s    0077777r"   c                 4    | j                                         S rX   )rx   get_output_embeddingsr}   s    r#   r   z5VoxtralForConditionalGeneration.get_output_embeddings   s    "88:::r"   c                 :    | j                             |           d S rX   )rx   set_output_embeddings)rO   new_embeddingss     r#   r   z5VoxtralForConditionalGeneration.set_output_embeddings   s    11.AAAAAr"   c                 :    | j                             |           d S rX   )rx   set_decoder)rO   decoders     r#   r   z+VoxtralForConditionalGeneration.set_decoder   s    ''00000r"   c                 4    | j                                         S rX   )rx   get_decoderr}   s    r#   r   z+VoxtralForConditionalGeneration.get_decoder   s    "..000r"   rP   c                     |                      |          }|j        }|                    d| j        j        j                  }|                     |          }|S )a  
        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
        Args:
            input_features (`torch.FloatTensor`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]

        Returns:
            `torch.FloatTensor`:
                The audio embeddings.
        r5   )rw   r=   reshaper>   ra   rb   ry   )rO   rP   audio_outputsaudio_hidden_statesaudio_embedss        r#   get_audio_featuresz2VoxtralForConditionalGeneration.get_audio_features   sX     ((88+=199"dk>V>hii112EFFr"   c                 `    t          j        dt                     |                     |          S )NzUThe method `get_audio_embeds` is deprecated. Please use `get_audio_features` instead.)warningswarnFutureWarningr   )rO   rP   s     r#   get_audio_embedsz0VoxtralForConditionalGeneration.get_audio_embeds   s2    cer	
 	
 	
 &&~666r"   Nr   	input_idsr;   position_idspast_key_valuesrR   labels	use_cachecache_positionlogits_to_keepr3   returnc                 t   | |                                  |          }||~|                     |          }|| j        j        k                        d          }|                    |                    |j                  |                    |j                            } | j        d|||||||	|
d|}|S )aj  
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```Nr5   )r;   r   r   rR   r   r   r   r   r!   )	r|   r   r>   audio_token_id	unsqueezemasked_scatterrE   r8   rx   )rO   r   rP   r;   r   r   rR   r   r   r   r   r3   r   audio_token_maskoutputss                  r#   rW   z'VoxtralForConditionalGeneration.forward   s    b  7D5577	BBM%)*?22>BBL !*T[-G GRRSUVV)88 ##M$899<??=K_;`;` M ,?4+> 
,
)%+'))
,
 
,
 
,
 
,
 r"   c                     |                     dd           }|                    d          } t                      j        |i |}||d         dk    r||d<   |S )NrP   r   r   )popgetr^   prepare_inputs_for_generation)rO   argsr3   rP   r   model_inputsrj   s         r#   r   z=VoxtralForConditionalGeneration.prepare_inputs_for_generation  sl      $4d;;$455<uww<dMfMM%.*;q*@*@-;L)*r"   )
NNNNNNNNNr   )!r   r   r    _tied_weights_keys_tp_plan_pp_plan_keep_in_fp32_modules_strictr_   r|   r   r   r   r   r   torchFloatTensorr   r   r   r   r   
LongTensorTensorr   boolr   intr   r   r   rW   r   rm   rn   s   @r#   rp   rp      sE        ++=)H_-z:;H$5#6     : : :8 8 8; ; ;B B B1 1 11 1 11B    *7u/@ 7 7 7 7  156:1537+/59-1$(5934F FE,-F !!23F !.	F
 u/0F "%F   12F )*F D>F !!12F c5</0F +,F 
 F F F ^ FP        r"   rp   )r(   r0   rp   )*r   typingr   r   r   r   activationsr   cache_utilsr   
generationr	   modeling_outputsr
   r   r   processing_utilsr   utilsr   r   r   utils.genericr   autor   r    qwen2_audio.modeling_qwen2_audior   r   r   r   configuration_voxtralr   r   r&   r(   r0   Moduler[   rp   __all__r!   r"   r#   <module>r      s     " " " " " " " "        ! ! ! ! ! !             ) ) ) ) ) ) ` ` ` ` ` ` ` ` ` ` & & & & & & I I I I I I I I I I / / / / / / 2 2 2 2 2 2 2 2            1 0 0 0 0 0	 	 	 	 	* 	 	 		 	 	 	 	0 	 	 	    6      
4
 4
 4
 4
 4
& 4
 4
 
4
n          
S S S S S&<o S S 
Sl Z
Y
Yr"   