
     `i[                        d dl Z d dlZd dlmZmZmZ d dlZd dlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$m%Z%  ej&        e'          Z(	 	 	 d,dej)        dej*        dej*        dej*        deej*                 dee+         de+deej*                 fdZ, G d dej)                  Z- G d de          Z.e G d  d!e                      Z/ ed"#           G d$ d%e/                      Z0 G d& d'ej)                  Z1 ed(#           G d) d*e/e                      Z2g d+Z3dS )-    N)CallableOptionalUnion)nn   )ACT2FN)Cache)GenerationMixin)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)check_model_inputs   )	AutoModelAutoModelForCausalLM   )VoxtralConfigVoxtralEncoderConfig        modulequerykeyvalueattention_maskscalingdropout	head_maskc                 8   ||                     d          dz  }t          j        ||                    dd                    |z  }	|-|j        dk    r"|	|d d d d d d d |j        d         f         z   }	t          j                            |	d          }	||	|	                    dddd          z  }	t          j        
                    |	|| j        	          }	t          j        |	|          }
|
                    dd                                          }
|
|	fS )
N      r   r      )dimr   ptraining)sizetorchmatmul	transposendimshaper   
functionalsoftmaxviewr$   r.   
contiguous)r   r   r    r!   r"   r#   r$   r%   kwargsattn_weightsattn_outputs              /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/voxtral/modeling_voxtral.pyeager_attention_forwardr=   -   s    **R..D(<s}}Q':':;;gEL!n&9Q&>&>#nQQQ111o	"o5M&NN=((2(>>L#innQAq&A&AA=((6?([[L,|U33K''1--88::K$$    c                   6    e Zd ZdZ	 	 	 	 	 	 ddededed	ed
ededee         dee         f fdZ	de
j        dedefdZ	 	 	 dde
j        dee
j                 dee
j                 dedee
j        ee
j                 eee
j                          f         f
dZ xZS )VoxtralAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr$   
is_decoderbias	is_causal	layer_idxconfigc	                 p   t                                                       || _        || _        || _        ||z  | _        || _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _	        || _
        |*|r(t                              d| j        j         d           || _        t!          j        ||d          | _        t!          j        |||          | _        t!          j        |||          | _        t!          j        |||          | _        d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r(   zInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrD   )super__init__rA   rB   r$   head_dimrG   
ValueErrorr#   rC   rE   loggerwarning_once	__class____name__rF   r   Lineark_projv_projq_projout_proj)
selfrA   rB   r$   rC   rD   rE   rF   rG   rP   s
            r<   rK   zVoxtralAttention.__init__N   sW    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$",4>+B , , ,  
 #i	95AAAi	94@@@i	94@@@	)YTBBBr>   tensorseq_lenbszc                     |                     ||| j        | j                                      dd                                          S )Nr   r   )r7   rB   rL   r2   r8   )rW   rX   rY   rZ   s       r<   _shapezVoxtralAttention._shapev   s<    {{3GGQQRSUVWWbbdddr>   hidden_statesr"   layer_head_maskoutput_attentionsreturnc                 d   |                                 \  }}}|                     |                     |          | j        z  ||          }	|                     |                     |          d|          }
|                     |                     |          d|          }t          }| j        j        dk    rt          | j        j                 } || |	|
||f| j
        sdn| j        d||d|\  }}|                    ||d                                          }|                     |          }||fS )z#Input shape: Batch x Time x Channelr'   eagerr         ?)r$   r#   r_   r%   )r/   r\   rU   r#   rS   rT   r=   rG   _attn_implementationr   r.   r$   reshaper8   rV   )rW   r]   r"   r^   r_   r9   rZ   tgt_len_query_states
key_statesvalue_statesattention_interfacer;   r:   s                  r<   forwardzVoxtralAttention.forwardy   sD    (,,..Wa {{4;;}#=#=#LgWZ[[[[]!;!;REE
{{4;;}#=#=r3GG(?;+w66"9$+:Z"[$7$7%
  $}>CC$,/%%
 %
 %
 %
!\ "))#w;;FFHHmmK00L((r>   )r   FTFNN)NNF)rQ   
__module____qualname____doc__intfloatboolr   r   rK   r0   Tensorr\   tuplerl   __classcell__rP   s   @r<   r@   r@   K   sz       GG  #'*.&C &C&C &C 	&C
 &C &C &C C=&C '&C &C &C &C &C &CPeU\ eC ec e e e e 2626"')) ))|)) !.)) "%,/	))
  )) 
u|Xel3XeEL>Q5RR	S)) )) )) )) )) )) )) ))r>   r@   c                   l     e Zd Zdef fdZ	 d
dej        dej        dej        dedej        f
d	Z xZ	S )VoxtralEncoderLayerrG   c                    t                                                       |j        | _        t	          | j        |j        |j        |          | _        t          j	        | j                  | _
        |j        | _        t          |j                 | _        |j        | _        t          j        | j        |j                  | _        t          j        |j        | j                  | _        t          j	        | j                  | _        d S )N)rA   rB   r$   rG   )rJ   rK   d_modelrA   r@   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normr$   r   activation_functionactivation_fnactivation_dropoutrR   encoder_ffn_dimfc1fc2final_layer_normrW   rG   rP   s     r<   rK   zVoxtralEncoderLayer.__init__   s    )n4,	
 
 
 %'L$@$@!~#F$>?"(";9T^V-CDD9V3T^DD "T^ < <r>   Fr]   r"   r^   r_   r`   c                    |}|                      |          }|                     ||||          \  }}t          j                            || j        | j                  }||z   }|}|                     |          }|                     |                     |                    }t          j                            || j	        | j                  }| 
                    |          }t          j                            || j        | j                  }||z   }|j        t          j        k    r9t          j        |j                  j        dz
  }t          j        || |          }||fS )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r]   r"   r^   r_   r,   i  )minmax)r   r}   r   r5   r$   r.   r   r   r   r   r   dtyper0   float16finfor   clamp)rW   r]   r"   r^   r_   residualr:   clamp_values           r<   rl   zVoxtralEncoderLayer.forward   sW   $ !11-@@&*nn')+/	 '5 '
 '
#| --mt|VZVc-dd =0 --m<<**488M+B+BCC--mt?Vaean-oo//--mt|VZVc-dd =0%-//+m&9::>EK!KK<[YYYMl**r>   )F)
rQ   rm   rn   r   rK   r0   rs   rr   rl   ru   rv   s   @r<   rx   rx      s        =} = = = = = =. #()+ )+|)+ )+ 	)+
  )+ 
)+ )+ )+ )+ )+ )+ )+ )+r>   rx   c                   H    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZdZdZd ZdS )VoxtralPreTrainedModelrG   modelTNpast_key_valuesc                    t          | j        d          r| j        j        n| j        j        j        }t	          |t
          j        t
          j        f          rJ|j        j	        
                    d|           |j         |j        j	                                         d S d S t	          |t
          j                  r?|j        j	                            d           |j        j	                                         d S t	          |t
          j                  rS|j        j	        
                    d|           |j        -|j        j	        |j                                                  d S d S d S )Ninitializer_ranger   )meanstdrc   )hasattrrG   r   audio_config
isinstancer   rR   Conv1dweightdatanormal_rD   zero_r~   fill_	Embeddingpadding_idx)rW   r   r   s      r<   _init_weightsz$VoxtralPreTrainedModel._init_weights   sU   
 t{$788<DK))); 	 fry")455 
	?M&&CS&999{& &&((((( '&-- 	?M$$S)))K""$$$$$-- 	?M&&CS&999!-"6#56<<>>>>>	? 	?--r>   )rQ   rm   rn   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_cache_class_supports_attention_backend_can_compile_fullgraphr    r>   r<   r   r      sl         &*#"3N "&!? ? ? ? ?r>   r   z:
    The Voxtral encoder, which is a Whisper encoder.
    )custom_introc                        e Zd ZU dZeed<   dZdgZee	dZ
def fdZd Zdej        fd	Zd
ej        fdZe	 ddee         fd            Zdej        fdZ xZS )VoxtralEncoderz
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`VoxtralEncoderLayer`].

    Args:
        config: VoxtralEncoderConfig
    rG   input_featuresrx   )
attentionsr]   c                    t                                                     j        | _        j        | _        j        }j        | _        j        | _        j	        | _	        j
        rt          j        |          nd| _        t          j        | j        |dd          | _        t          j        ||ddd          | _        t          j        | j	        |          | _        | j                            d           t          j        fdt-          j                  D                       | _        t          j        j                  | _        t          j        dd	          | _        d| _        |                                  d S )
Nrc   r   r   )kernel_sizepaddingr   )r   strider   Fc                 .    g | ]}t                    S r   )rx   ).0rg   rG   s     r<   
<listcomp>z+VoxtralEncoder.__init__.<locals>.<listcomp>0  s"    $g$g$gQ%8%@%@$g$g$gr>   )r   )rJ   rK   r$   encoder_layerdrop	layerdroprz   num_mel_binspad_token_idr   max_source_positionsscale_embeddingmathsqrtembed_scaler   r   conv1conv2r   embed_positionsrequires_grad_
ModuleListrangeencoder_layerslayersr~   
layer_norm	AvgPool1d
avg_poolergradient_checkpointing	post_init)rW   rG   rA   rP   s    ` r<   rK   zVoxtralEncoder.__init__  sX      ~1N	"/!.$*$?!393IR49Y///sYt0)TUVVV
Yy)1VWXXX
!|D,EyQQ++E222m$g$g$g$g%PVPeJfJf$g$g$ghh,v~66,q333&+#r>   c                 P    |                                  D ]	}d|_        
d| _        d S )NF)
parametersrequires_grad_requires_grad)rW   params     r<   _freeze_parametersz!VoxtralEncoder._freeze_parameters9  s4    __&& 	( 	(E"'E#r>   r`   c                     | j         S Nr   rW   s    r<   get_input_embeddingsz#VoxtralEncoder.get_input_embeddings>  s
    zr>   r!   c                     || _         d S r   r   rW   r!   s     r<   set_input_embeddingsz#VoxtralEncoder.set_input_embeddingsA  s    


r>   Nr9   c           	         | j         j        | j        j        d         z  | j        j        d         z  }|j        d         |k    r$t          d| d|j        d          d| d          |                    | j        j        j	        | j        j        j
                  }t          j                            |                     |                    }t          j                            |                     |                    }|                    ddd	          }| j        j        }||z                       |j	                  }t          j                            || j        | j        
          }t%          | j                  D ]\  }}	 |	||d          }
|
d         }|                     |          }t+          |          S )a  
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram are ignored.
        r   r'   z:Qwen2Audio expects the mel input features to be of length z, but found z-. Make sure to pad the input mel features to .)r   devicer   r   r,   N)r"   r^   )last_hidden_state)rG   r   r   r   r   r4   rM   tor   r   r   r   r5   gelupermuter   r$   r.   	enumerater   r   r   )rW   r   r"   r9   expected_seq_lengthinputs_embeds	embed_posr]   idxencoder_layerlayer_outputss              r<   rl   zVoxtralEncoder.forwardD  s   & #k>ARSTAUUX\XbXijkXll#'::: LM`  L  Ln|  oC  DF  oG  L  L  vI  L  L  L   (**1B1HQUQ[QbQi*jj**4::n+E+EFF**4::m+D+DEE%--aA66(/	&266}7JKK--mt|VZVc-dd"+DK"8"8 	- 	-C)M- $  M
 *!,MM66+
 
 
 	
r>   input_lengthsc                 6    |dz
  dz  dz   }|dz
  dz  dz   }||fS )zs
        Computes the output length of the convolutional layers and the output length of the audio encoder
        r   r   r   )rW   r   output_lengthss      r<    _get_feat_extract_output_lengthsz/VoxtralEncoder._get_feat_extract_output_lengthsu  s7     '*q014'!+1A5n,,r>   r   )rQ   rm   rn   ro   r   r   main_input_namer   r@   rx   _can_record_outputsrK   r   r   Moduler   r   r   r   r   rl   r0   
LongTensorr   ru   rv   s   @r<   r   r     s#          !   &O./&, 
3      4$ $ $
bi    ")      -
 -
 +,	-
 -
 -
 -
`-e>N - - - - - - - -r>   r   c                   *     e Zd Zdef fdZd Z xZS )VoxtralMultiModalProjectorrG   c                 6   t                                                       t          j        |j        j        |j        j        d          | _        t          |j
                 | _        t          j        |j        j        |j        j        d          | _        d S )NFrI   )rJ   rK   r   rR   r   intermediate_sizetext_confighidden_sizelinear_1r   projector_hidden_actactlinear_2r   s     r<   rK   z#VoxtralMultiModalProjector.__init__  sv    	&"5"GI[Ignsttt&56	&"4"@&BTB`glmmmr>   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   )rW   audio_featuresr]   s      r<   rl   z"VoxtralMultiModalProjector.forward  s;    n55//m44r>   )rQ   rm   rn   r   rK   rl   ru   rv   s   @r<   r   r   ~  sZ        n} n n n n n n      r>   r   zs
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    c                       e Zd ZdgZddiZddgdgfiZdgZ fdZd Zd	 Z	d
 Z
d Zd Zd Zdej        fdZdej        fdZee	 	 	 	 	 	 	 	 	 	 d deej                 deej                 deej                 deej                 dee         deej                 deej                 dee         deej                 deeej        f         dee         defd                        Z fdZ xZ S )!VoxtralForConditionalGenerationzlm_head.weightlm_headcolwise_repr]   logitsr   c                 4   t                                          |           |j        j        | _        t	          j        |j                  | _        t          j        |j                  | _	        t          |          | _        |                                  d S r   )rJ   rK   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   r   s     r<   rK   z(VoxtralForConditionalGeneration.__init__  s|        ,7$01DEE2>v?QRR%?%G%G" 	r>   c                 4    | j                                         S r   )r  r   r   s    r<   r   z4VoxtralForConditionalGeneration.get_input_embeddings  s    "77999r>   c                 :    | j                             |           d S r   )r  r   r   s     r<   r   z4VoxtralForConditionalGeneration.set_input_embeddings  s    0077777r>   c                 4    | j                                         S r   )r  get_output_embeddingsr   s    r<   r  z5VoxtralForConditionalGeneration.get_output_embeddings  s    "88:::r>   c                 :    | j                             |           d S r   )r  set_output_embeddings)rW   new_embeddingss     r<   r  z5VoxtralForConditionalGeneration.set_output_embeddings  s    11.AAAAAr>   c                 :    | j                             |           d S r   )r  set_decoder)rW   decoders     r<   r  z+VoxtralForConditionalGeneration.set_decoder  s    ''00000r>   c                 4    | j                                         S r   )r  get_decoderr   s    r<   r  z+VoxtralForConditionalGeneration.get_decoder  s    "..000r>   r   c                     |                      |          }|j        }|                    d| j        j        j                  }|                     |          }|S )a  
        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
        Args:
            input_features (`torch.FloatTensor`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]

        Returns:
            `torch.FloatTensor`:
                The audio embeddings.
        r'   )r  r   re   rG   r   r   r  )rW   r   audio_outputsaudio_hidden_statesaudio_embedss        r<   get_audio_featuresz2VoxtralForConditionalGeneration.get_audio_features  sX     ((88+=199"dk>V>hii112EFFr>   c                 `    t          j        dt                     |                     |          S )NzUThe method `get_audio_embeds` is deprecated. Please use `get_audio_features` instead.)warningswarnFutureWarningr  )rW   r   s     r<   get_audio_embedsz0VoxtralForConditionalGeneration.get_audio_embeds  s2    cer	
 	
 	
 &&~666r>   Nr   	input_idsr"   position_idsr   r   labels	use_cachecache_positionlogits_to_keepr9   r`   c                 t   | |                                  |          }||~|                     |          }|| j        j        k                        d          }|                    |                    |j                  |                    |j                            } | j        d|||||||	|
d|}|S )aj  
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```Nr'   )r"   r%  r   r   r&  r'  r(  r)  r   )	r   r  rG   audio_token_id	unsqueezemasked_scatterr   r   r  )rW   r$  r   r"   r%  r   r   r&  r'  r(  r)  r9   r  audio_token_maskoutputss                  r<   rl   z'VoxtralForConditionalGeneration.forward  s    b  7D5577	BBM%)*?22>BBL !*T[-G GRRSUVV)88 ##M$899<??=K_;`;` M ,?4+> 
,
)%+'))
,
 
,
 
,
 
,
 r>   c                     |                     dd           }|                    d          } t                      j        |i |}||d         dk    r||d<   |S )Nr   r(  r   )popgetrJ   prepare_inputs_for_generation)rW   argsr9   r   r(  model_inputsrP   s         r<   r3  z=VoxtralForConditionalGeneration.prepare_inputs_for_generation  sl      $4d;;$455<uww<dMfMM%.*;q*@*@-;L)*r>   )
NNNNNNNNNr   )!rQ   rm   rn   _tied_weights_keys_tp_plan_pp_plan_keep_in_fp32_modules_strictrK   r   r   r  r  r  r  r0   FloatTensorr  r#  r   r   r   r   rs   r	   rr   r   rp   r   r   r   rl   r3  ru   rv   s   @r<   r  r    sE        ++=)H_-z:;H$5#6     : : :8 8 8; ; ;B B B1 1 11 1 11B    *7u/@ 7 7 7 7  156:1537+/59-1$(5934F FE,-F !!23F !.	F
 u/0F "%F   12F )*F D>F !!12F c5</0F +,F 
 F F F ^ FP        r>   r  )r   r   r  )Nr   N)4r   r   typingr   r   r   r0   r   activationsr   cache_utilsr	   
generationr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   autor   r   configuration_voxtralr   r   
get_loggerrQ   rN   r   rs   rq   r=   r@   rx   r   r   r   r  __all__r   r>   r<   <module>rI     sp  ,   , , , , , , , , , ,        ! ! ! ! ! !             ) ) ) ) ) ) 9 9 9 9 9 9 ` ` ` ` ` ` ` ` ` ` F F F F F F F F & & & & & & R R R R R R R R R R R R / / / / / / 2 2 2 2 2 2 2 2 F F F F F F F F 
	H	%	%  $(,% %I%<% 
% <	%
 U\*% e_% % %% % % %<W) W) W) W) W)ry W) W) W)t<+ <+ <+ <+ <+4 <+ <+ <+~  ?  ?  ?  ?  ?_  ?  ?  ?F   
n- n- n- n- n-+ n- n- 
n-b          
S S S S S&<o S S 
Sl Z
Y
Yr>   