
    .`i              
          d dl Z d dlZd dlmZmZmZ d dlmZmZ d dlm	Z	 d dl
mZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dl m!Z!m"Z"m#Z# d dl$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZDmEZEmFZFmGZGmHZH d dlImJZJmKZKmLZL d dlMmNZNmOZO d dlPmQZQmRZRmSZSmTZTmUZU d dlVmWZW d dlXmYZY d dlZm[Z[ d d!l\m]Z]m^Z^m_Z_ d d"l`maZambZb  e3ec          Zdd#d$d%d&d'd(d)d*d+d,	Ze G d- d.          Zf G d/ d0eR          Zg G d1 d2eNeg                   Zh G d3 d4eQeg                   Zi eBjj        eiegeh5           G d6 d7ejk        e^e9e]e_                      Zl G d8 d9ejk                  Zm G d: d;ejk                  ZndS )<    N)IterableMappingSequence)cached_propertypartial)ceil)Literalcast)mel_filter_bank)
AudioChunkRawAudio	TextChunk)UserMessage)ChatCompletionRequest)TranscriptionRequest)AudioAudioEncoderTranscriptionFormat)BatchFeature
TensorTypeWhisperConfig)	TextInput)ModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)
PromptType)init_logger)QuantizationConfig)default_weight_loader)
SupportsPP)MultiModelKeys)WhisperEncoder_create_fake_bias_for_k_proj)WhisperCausalEncoder)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsMultiModalUUIDDictNestedTensors)AudioProcessorItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderProcessorInputs)BaseMultiModalProcessorBaseProcessingInfoMultiModalProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)cached_tokenizer_from_config)MistralTokenizer   )SupportsLoRASupportsMultiModalSupportsTranscription)init_vllm_registered_modelmaybe_prefixArabicDutchEnglishFrenchGermanHindiItalian
PortugueseSpanish)	arnlenfrdehiitptesc                   H    e Zd ZdZdeddf fdZedefd            Zede	fd            Z
ede	fd            Zede	fd	            Zedefd
            Zde	de	fdZ	 	 	 ddeee         z  dz  dej        eej                 z  dz  deez  dz  deeef         fdZ xZS )VoxtralProcessorAdapterzv
    Provide a HF-compatible interface for
    :class:`mistral_common.tokens.tokenizers.multimodal.AudioEncoder`.
    	tokenizerreturnNc                 V    t                                                       || _        d S N)super__init__rS   )selfrS   	__class__s     v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/voxtral.pyrX   z VoxtralProcessorAdapter.__init__Z   s$    "    c                 V    | j         j        j        }t          |t                    sJ |S rV   )rS   instructaudio_encoder
isinstancer   )rY   r_   s     r[   _audio_processorz(VoxtralProcessorAdapter._audio_processor^   s+    /=-66666r\   c                 $    | j         j        j        S rV   )ra   special_idsaudiorY   s    r[   audio_token_idz&VoxtralProcessorAdapter.audio_token_idd   s    $066r\   c                 $    | j         j        j        S rV   )ra   rc   begin_audiore   s    r[   begin_audio_token_idz,VoxtralProcessorAdapter.begin_audio_token_idh   s    $0<<r\   c                 $    | j         j        j        S rV   )ra   audio_configsampling_ratere   s    r[   rl   z%VoxtralProcessorAdapter.sampling_ratet   s    $1??r\   c                 $    | j         j        j        S rV   )ra   rk   
frame_ratere   s    r[   rn   z"VoxtralProcessorAdapter.frame_ratex   s    $1<<r\   audio_lengthc                 @    t          || j        | j        z  z            S rV   )r   rl   rn   )rY   ro   s     r[   get_num_audio_tokensz,VoxtralProcessorAdapter.get_num_audio_tokens|   s!     LD$6$/$IJKKKr\   textaudiosreturn_tensorsc                    |g }t          |t                    s|g}|g }t          |t                    s|g}|s0|                     |          j        }dt	          j        |          iS t          d |D                       rt          d          t          t          j                             }t          t          j                             }|D ]7}t          |t          j
                  sJ |j        dk    sJ | j        j        j        t          j        k    rjt#          j        | j        j                  }	d|	j        v r#| j                            || j        d          }n | j                            || j                  }| j        g| j        g|                     t3          |                    z  z   }
|                    t	          j        |
                     |                    t	          j        |                     9t7          t	          j        |          d                              t3          |          d          |d	          S )
N	input_idsc              3   <   K   | ]}t          |          d k    V  dS )r   N)len).0ts     r[   	<genexpr>z3VoxtralProcessorAdapter.__call__.<locals>.<genexpr>   s,      ((as1vvz((((((r\   zYou've passed text inputs instead of token inputs. Make sure to process your input via `mistral_common`'s tokenizer or pass a chat completion request. For more info, see: https://github.com/vllm-project/vllm/issues/8411.r9   is_online_streamingF)r|   )rv   audio_arrays)r`   listrS   rv   torchtensorany
ValueErrorTensornpndarrayndimra   rk   transcription_formatr   	STREAMINGinspect	signaturepad
parametersrl   ri   rf   rq   rx   appendr   catexpand)rY   rr   rs   rt   kwargsrv   audios_tokensaudios_processedrd   sigaudio_tokenss              r[   __call__z VoxtralProcessorAdapter.__call__   sY    <D$%% 	6D>F&$'' 	XF 	:t,,6Ii!8!899 ((4((((( 	D   U\*,,-// 	9 	9EeRZ00000:????
 %2G&01 1 '(=(ABB(CN:: 155t1u 6  EE !155eT=OPPE 56#:))#e**55:6 6L   l!;!;<<<##EL$7$78888"Y}55d;BB3t99bQQ 0 
 
 	
r\   )NNN)__name__
__module____qualname____doc__r8   rX   r   r   ra   intrf   ri   rl   floatrn   rq   r   r   r   r   strr   r   r+   r   __classcell__rZ   s   @r[   rR   rR   T   s        
#"2 #t # # # # # # ,    _
 7 7 7 7 _7 =c = = = _= @s @ @ @ _@ =E = = = _=LL 
L L L L 487;26	>
 >
$y/)D0>
 
T"*--4>
 j(4/	>
 
m#	$>
 >
 >
 >
 >
 >
 >
 >
r\   rR   c                       e Zd ZdefdZdefdZdeee	dz  f         fdZ
de	deee	f         deee	f         fdZde	fd	Zde	fd
ZdS )VoxtralProcessingInforT   c                     t          | j        j                  }t          |t                    st          d          |S )Nz.This model requires `--tokenizer-mode mistral`)r7   ctxmodel_configr`   r8   r   )rY   rS   s     r[   get_tokenizerz#VoxtralProcessingInfo.get_tokenizer   s=    01FGG	)%566 	OMNNNr\   c                 D    t          |                                           S rV   )rR   r   re   s    r[   get_hf_processorz&VoxtralProcessingInfo.get_hf_processor   s    &t'9'9';';<<<r\   Nc                 
    ddiS )Nrd       re   s    r[   get_supported_mm_limitsz-VoxtralProcessingInfo.get_supported_mm_limits   s    |r\   seq_len	mm_countsc                 .    d|                                  iS Nrd   )get_max_audio_tokens)rY   r   r   s      r[   get_mm_max_tokens_per_itemz0VoxtralProcessingInfo.get_mm_max_tokens_per_item   s    
 224455r\   c                 $    | j         j        j        S rV   )r   r   max_model_lenre   s    r[   r   z*VoxtralProcessingInfo.get_max_audio_tokens   s    x$22r\   c                     |                                  }|                                 t          |j        |j        z            z  S rV   )r   r   r   rl   rn   )rY   	processors     r[   get_max_audio_array_lenz-VoxtralProcessingInfo.get_max_audio_array_len   sF    ))++	((**S#y';;.
 .
 
 	
r\   )r   r   r   r8   r   rR   r   r   r   r   r   r   r   r   r   r\   r[   r   r      s        /    ="9 = = = =cDj)A    66 38$6 
c		6 6 6 63c 3 3 3 3
 
 
 
 
 
 
r\   r   c            	           e Zd Zdeeef         defdZ	 d	dedeeef         deeef         dz  defdZ		 d	dedeeef         deeef         dz  de
fdZdS )
VoxtralDummyInputsBuilderr   rT   c                     dS )N r   )rY   r   s     r[   get_dummy_textz(VoxtralDummyInputsBuilder.get_dummy_text   s    rr\   Nr   
mm_optionsc                     |                     dd          }| j                                        }|r|                     d          nd }d|                     |||          iS )Nrd   r   )length
num_audios	overrides)getinfor   _get_dummy_audios)rY   r   r   r   r   target_lengthaudio_overridess          r[   get_dummy_mm_dataz+VoxtralDummyInputsBuilder.get_dummy_mm_data   sr     ]]7A..
	99;;5?I*..111T T++$ ,  
 	
r\   c                    | j                                         }|                     |          }|                     |||          }|                    dg           }g }d}	|D ]g}
t          |
| j                                         j        |	          }t          t          j
        |                    }|                    |           ht          t          t          |          g|          g          }|j                            |          }|j        }d |j        D             |d<   t'          ||	          S )
Nrd   wav)audio_arrayrl   format)input_audio)rr   )content)messagesc                     g | ]	}|j         
S r   )r   ry   as     r[   
<listcomp>zHVoxtralDummyInputsBuilder.get_dummy_processor_inputs.<locals>.<listcomp>  s    !D!D!DA!-!D!D!Dr\   )promptmm_data)r   r   r   r   r   r   r   rl   r   r   
from_audior   r   r   r   mistralencode_chat_completiontokensrs   r0   )rY   r   r   r   rS   
dummy_textdummy_mm_datadummy_audiosaudio_chunksr   rd   
audio_itemchunkrequestresdummy_tokenss                   r[   get_dummy_processor_inputsz4VoxtralDummyInputsBuilder.get_dummy_processor_inputs   sZ    I++--	((33
..w	:NN$(("55)+! 	' 	'E!"i88::H  J
 8+>z+J+JKKKE&&&&'YJ%?%?%?$O,$OPPP
 
 

 66w??z "E!D!D!D!DglMJJJJr\   rV   )r   r   r   r   r   r   r   r   r'   r   r0   r   r   r\   r[   r   r      s       S(9 c     =A	
 

 38$
 C!112T9	

 

 
 
 
, =A	"K "K"K 38$"K C!112T9	"K
 
"K "K "K "K "K "Kr\   r   c                       e Zd Zdeeef         deeef         deeef         fdZde	deeef         de
dee         fdZ	 dd	eee         z  d
e	deeef         deeef         dedz  deee         eef         f fdZdefdZ xZS )VoxtralMultiModalProcessor	hf_inputshf_processor_mm_kwargsrT   c                 F    t          t          j        d                    S )Nrd   )r~   )dictr(   batched)rY   r   r   s      r[   _get_mm_fields_configz0VoxtralMultiModalProcessor._get_mm_fields_config  s!    
 !6!>w!G!GHHHHr\   mm_itemsout_mm_kwargsc                 ~     | j         j        di |j        dt          ffd}t	          dd|          gS )Nitem_idxc                                          dt                    }|                    |           }                    |          }g|z  S r   )	get_itemsr,   get_audio_lengthrq   )r   rs   	audio_lennb_audio_tokensaudio_idr   r   s       r[   get_replacementzGVoxtralMultiModalProcessor._get_prompt_updates.<locals>.get_replacement/  sL    ''1DEEF//99I'<<YGGO://r\   rd   r   )modalitytargetreplacementr   )r   r   rf   r   r4   )rY   r   r   r   r   r   r   s    `   @@r[   _get_prompt_updatesz.VoxtralMultiModalProcessor._get_prompt_updates%  s     /DI.HH1GHH	+	0c 	0 	0 	0 	0 	0 	0 	0 	0  +  
 	
r\   Nr   mm_data_itemstokenization_kwargsmm_uuidsc                 b    t                                          |||||          \  }}}||dfS )N)r   r   r   r   r   T)rW   _cached_apply_hf_processor)
rY   r   r   r   r   r   
prompt_idsmm_info_rZ   s
            r[   r   z5VoxtralMultiModalProcessor._cached_apply_hf_processor?  sI     "'!C!C'#9 3 "D "
 "

GQ 7D((r\   c                 ^    | j                                         j        }t          |          S )N)	target_sr)r   r   rl   r.   )rY   rl   s     r[   _get_data_parserz+VoxtralMultiModalProcessor._get_data_parserR  s)    	2244B#m<<<<r\   rV   )r   r   r   r   r   r+   objectr(   r   r-   r)   r   r5   r   r   r   r*   tupler3   boolr   r.   r  r   r   s   @r[   r   r     sk       I3-.I !(V 4I 
++	,	I I I I
%
 !(V 4
 -	

 
,	
 
 
 
@ /3) )d3i) +) !(V 4	)
 %S&[1) %t+) 
tCy2D8	9) ) ) ) ) )&="6 = = = = = = = =r\   r   )r   dummy_inputsc                   \    e Zd ZeZg dddgdZdddedef fd	Zd
e	fdZ
	 	 d(dej        dej        dedz  dej        dz  ded
ej        ez  fdZd
eej                 ej        z  eej        df         z  dz  fdZded
eej                 dz  fdZdej        d
ej        dz  fdZededed
efd            Zedej        dedededz  ded         ded edz  d
efd!            Zed"ededed
edz  fd#            Z d$e!eeej        f                  d
e"e         fd%Z#d&e$d
e$fd'Z% xZ&S ))VoxtralForConditionalGenerationq_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr   prefixvllm_configr  c          	         t                                                       t          |j                  | _        t          |d          r|                     |j                  |_        |j        j        }|| _	        | j	        j
        j        | _        |                     |          5  t          ||j        t          |d                    | _        d d d            n# 1 swxY w Y   |                     |d          5  t%          |                    |j
                  t          |d                    | _        t+          |j
        j        | j        z  |j        j                  | _        d d d            d S # 1 swxY w Y   d S )Nquant_configlanguage_model)r  	hf_configr  rd   whisper_encoderr  )hidden_sizedim)rW   rX   r7   r   rS   hasattrmaybe_update_quant_configr  r  configrk   downsample_factor_mark_language_modelr=   text_configr>   r  _mark_tower_modelVoxtralEncoderModelwith_hf_configr  AudioLanguageAdapterd_modelr  audio_language_adapter)rY   r  r  r  rZ   s       r[   rX   z(VoxtralForConditionalGeneration.__init__f  s    5k6NOO ;// 	'+'E'E(( (K$ )3!%!9!K&&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ##K99 	 	#6**6+>??#F,=>>$ $ $D  +?"/7$:PP&2+ + +D'	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s%   (+CC#&C# A/E<<F F rT   c                 2    t          j        dddg          S )z?Get module prefix for multimodal models to filter LoRA modules.r  r'  r  )r  	connectortower_model)r"   from_string_fieldre   s    r[   get_mm_mappingz.VoxtralForConditionalGeneration.get_mm_mapping  s(    /+.*+
 
 
 	
r\   Nrv   	positionsintermediate_tensorsinputs_embedsr   c                 J    |d }| j                             ||||          }|S )N)r/  )r  model)rY   rv   r-  r.  r/  r   hidden_statess          r[   forwardz'VoxtralForConditionalGeneration.forward  s>      + M+11y"6m 2 
 
 r\   .c           	          | j         di |}|d S |                     |          }t          |          D ]\  }}|j        \  }}| j        t          j        || j        z            z  }t          j        j	        
                    |ddd||z
  f          }|                    || j        z  || j        z            ||<   t          j        |d          }	|                     |	          }	t          j        |	d |D             d          }|S )Nr   r  c                 (    g | ]}|j         d          S )r   )shaper   s     r[   r   zDVoxtralForConditionalGeneration.embed_multimodal.<locals>.<listcomp>  s    %K%K%KQagaj%K%K%Kr\   r   ) _parse_and_validate_audio_arraysr  	enumerater7  r  mathr   r   nn
functionalr   reshaper   r'  split)
rY   r   audio_inputsaudio_embeddingsiaudio_embeddingr   r  target_seq_lenaudio_embeddings_packeds
             r[   embed_multimodalz0VoxtralForConditionalGeneration.embed_multimodal  sS    =t<FFvFF4//=="+,<"="= 	 	A*0LGS!3di$007 7 N $h155Aq.723 O #2"9"9$"88#@V:V# #Q
 #(),<!"D"D"D"&"="=>U"V"V ;#%K%K:J%K%K%KQR
 
 
  r\   c                 2   |                     dd           }|d S t          |t          j        t          f          st          dt          |                     t          |t          j                  r"t	          |                    d                    }|S )Nr~   z*Incorrect type of audio_arrays. Got type: r   )popr`   r   r   r   r   typeunbind)rY   r   r~   s      r[   r8  z@VoxtralForConditionalGeneration._parse_and_validate_audio_arrays  s     zz.$774,t(<== 	QT,=O=OQQ   lEL11 	8 3 3A 6 677Lr\   r2  c                 6    | j                             |          S rV   )r  compute_logits)rY   r2  s     r[   rK  z.VoxtralForConditionalGeneration.compute_logits  s     "11-@@@r\   r   	task_typec                     t          |          }|j        j        j        }|j        }|j        }t          ||d           S )N)max_audio_clip_ssample_ratemin_energy_split_window_size)r7   r^   r_   rk   chunk_length_srl   r   )clsr   rL  rS   rk   rN  rO  s          r[   get_speech_to_text_configz9VoxtralForConditionalGeneration.get_speech_to_text_config  sP     1>>	 )7D'6"0!-#)-	
 
 
 	
r\   rd   
stt_configlanguage)
transcribe	translaterequest_promptto_languagec                 j   t          |          }t          |t          |j                  d          }t	          |j        t          j        |          |          }	|j        	                    |	          }
|
j
        d         j        |j        f}dd|ii}|
j        |d<   t          t          |          S )Nr   )r   )r1  rd   rU  r   multi_modal_datard   prompt_token_ids)r7   r   r   rO  r   r1  r   r   r^   encode_transcriptionrs   r   r   r
   r   )rR  rd   r   rT  rU  rL  rX  rY  rS   req	tokenizedprompts_dicts               r[   get_generation_promptz5VoxtralForConditionalGeneration.get_generation_prompt  s     1>>	eS!788GGG"$%e,,
 
 
 &;;C@@	!!$0*2HI*We,<=+4+;'(J---r\   audio_duration_sc                     t          |          }t          |          }|                    t          ||j        z                      S )z
        Map from audio duration to number of audio tokens produced by the ASR
        model, without running a forward pass.
        This is used for estimating the amount of processing for this audio.
        )r7   rR   rq   r   rO  )rR  rb  rT  r   rS   adapters         r[   rq   z4VoxtralForConditionalGeneration.get_num_audio_tokens  sJ     1>>	))44++ :#99::
 
 	
r\   weightsc                     g dt          t          j        d j        i                                                    t          d          t                       fd} j                             |                      D ]}	                    d|            d}|vr	                    |           S )N)z,mm_streams_embeddings.embedding_module\.(.*)\1)zmm_whisper_embeddings\.(.*)rh  )zaudio_language_projection\.(.*)zaudio_language_adapter.\1)z!audio_language_adapter\.0\.weightz"audio_language_adapter.w_in.weight)z!audio_language_adapter\.2\.weightz#audio_language_adapter.w_out.weightr'  z
.wk.weightc               3   ^  K   D ]%\  } }d}dD ]L}||                      |          o1|                      | d           o|                      | d           z  }M	D ]0\  }}t          j        ||           rt          j        |||           } 1|r5
j                            | |f          }                     d|             | v rZ|          }t          j                    5  t          ||           d d d            n# 1 swxY w Y                       |            | |fV  'd S )NF)mm_whisper_embeddingsz&mm_streams_embeddings.embedding_modulez.tok_embeddingsz.audio_language_projectionzwhisper_encoder.)

startswithre	fullmatchsubr  load_weightaddr   no_gradr    )namew
is_encoderkpatternreplparamaudio_paramsloaded_weightsremapping_rulesrY   re  s          r[   llm_weights_generatorzKVoxtralForConditionalGeneration.load_weights.<locals>.llm_weights_generator*  s     " $ $a"
  A ** R $10E0E0E F FFR $10P0P0P Q QQJJ &5 ; ;MGT|GT22 ;!vgtT:: /;;T1IFFD"&&'@$'@'@AAA<''(.E 8 8-eQ7778 8 8 8 8 8 8 8 8 8 8 8 8 8 8"&&t,,,,)OOOO7$ $s   &DD	
D	zlanguage_model.z6whisper_encoder.whisper_encoder.embed_positions.weight)
r   r;  
ModuleDictr'  named_parametersr$   setr  load_weightsrp  )rY   re  r|  rr  sin_keyry  rz  r{  s   ``   @@@r[   r  z,VoxtralForConditionalGeneration.load_weights  s   
 
 
 M,d.I    
 
 /wEE	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$> '445J5J5L5LMM 	9 	9D7778888 K.((w'''r\   r  c                    g d}t          |d          r]g }|j        D ]L}|}|D ]0\  }}t          j        ||          rt          j        |||          }1|                    |           M||_        t          |d          r|j        }|D ]t}	d||	         v r]g }
||	         d         D ]L}|}|D ]0\  }}t          j        ||          rt          j        |||          }1|
                    |           M|
||	         d<   u||_        |S )z
        Update quant config to so that ignored module and target module names
        match the vLLM model names.
        Right now this is specific for compressed-tensors format and
        load_format mistral.
        ))outputzlanguage_model.lm_head)zlayers\.(\d+)\.attention\.woz1language_model.model.layers.\1.self_attn.out_proj)zlayers\.(\d+)\.attention\.w(.*)z0language_model.model.layers.\1.self_attn.\2_proj)zlayers\.(\d+)\.feed_forward\.w1z,language_model.model.layers.\1.mlp.gate_proj)zlayers\.(\d+)\.feed_forward\.w2z,language_model.model.layers.\1.mlp.down_proj)zlayers\.(\d+)\.feed_forward\.w3z*language_model.model.layers.\1.mlp.up_proj)zSmm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.w(.*)zBwhisper_encoder.whisper_encoder.layers.\1.layers.self_attn.\2_proj)zPmm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wozCwhisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj)zWmm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward.w(\d+)z9whisper_encoder.whisper_encoder.layers.\1.layers.mlp.fc\2)z6mm_whisper_embeddings\.whisper_encoder\.conv_layers\.0z%whisper_encoder.whisper_encoder.conv1)z6mm_whisper_embeddings\.whisper_encoder\.conv_layers\.1z%whisper_encoder.whisper_encoder.conv2)z3mm_whisper_embeddings\.audio_language_projection\.0zaudio_language_adapter.w_in)z3mm_whisper_embeddings\.audio_language_projection\.2zaudio_language_adapter.w_outignoreconfig_groupstargets)r  r  rl  rm  rn  r   r  )rY   r  r{  mistral_ignorerr  mistral_namerv  rw  r  
group_namer  s              r[   r  z9VoxtralForConditionalGeneration.maybe_update_quant_configT  sr   2
 2
 2
j <** 	1N$+ 4 4#%4 C CMGT|GT22 C')vgtT'B'B%%l3333"0L <11 	7(6M+ 	? 	?
j 999 G -j 9) D 5 5'+-< K KMGT!|GT:: K/1vgtT/J/J|44447>j))44)6L&r\   )NN)'r   r   r   ISO639_1_SUPPORTED_LANGSsupported_languagespacked_modules_mappingr   r   rX   r"   r,  r   r   r6   r  r3  r   r  rE  r8  rK  classmethodr   r   rS  r   r   r	   r   ra  r   r   rq   r   r  r  r   r  r   r   s   @r[   r	  r	  W  s&        3 322$i0 
 BD   z 3      @
 
 
 
 
 <@-1 < < 2D8	
 |d*  
+	+   " 	el	el	*U5<3D-E	E	L       @	el	d	"    A|A 
	A A A A 
&
36
	
 
 
 [
 .z. ". '	.
 *. 45. . 4Z. 
. . . [.0 

 '
 "	

 
t
 
 
 [
"BHU33D-E$F B3s8 B B B BHW.W	W W W W W W W Wr\   r	  c                   P     e Zd Zdededdf fdZdej        dej        fdZ xZS )r%  r  r  rT   Nc                     t                                                       t          j        ||d          | _        t          j                    | _        t          j        ||d          | _        d S )NF)bias)rW   rX   r;  Linearw_inGELUgeluw_out)rY   r  r  rZ   s      r[   rX   zAudioLanguageAdapter.__init__  sZ    Ik3U;;;	GII	YsCe444


r\   xc                 x    |                      |                     |                     |                              S rV   )r  r  r  )rY   r  s     r[   r3  zAudioLanguageAdapter.forward  s*    zz$))DIIaLL11222r\   )	r   r   r   r   rX   r   r   r3  r   r   s   @r[   r%  r%    sx        5C 5c 5d 5 5 5 5 5 53 3%, 3 3 3 3 3 3 3 3r\   r%  c                   d    e Zd Zdg diZg dZdddededd	f fd
Zdej	        dej	        fdZ
edefd            Zedefd            Zdeej	                 deej	        ee         f         fdZdej	        eej	                 z  deej	                 fdZdeeej	        f         defdZ xZS )r#  r  r
  )rg  )z.whisper_encoder\.conv_layers\.0\.(weight|bias)whisper_encoder.conv1.\1)z.whisper_encoder\.conv_layers\.1\.(weight|bias)whisper_encoder.conv2.\1)z4whisper_encoder\.conv_layers\.0\.conv\.(weight|bias)r  )z4whisper_encoder\.conv_layers\.1\.conv\.(weight|bias)r  )zOwhisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)z.whisper_encoder.layers.\1.self_attn.\2_proj.\3)zIwhisper_encoder\.transformer\.layers\.(\d+)\.attention\.wo\.(weight|bias)z/whisper_encoder.layers.\1.self_attn.out_proj.\2)zJwhisper_encoder\.transformer\.layers\.(\d+)\.attention_norm\.(weight|bias)z1whisper_encoder.layers.\1.self_attn_layer_norm.\2)zLwhisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w1\.(weight|bias)z$whisper_encoder.layers.\1.mlp.fc1.\2)zLwhisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w2\.(weight|bias)z$whisper_encoder.layers.\1.mlp.fc2.\2)zLwhisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w3\.(weight|bias)z$whisper_encoder.layers.\1.mlp.fc3.\2)zDwhisper_encoder\.transformer\.layers\.(\d+)\.ffn_norm\.(weight|bias)z-whisper_encoder.layers.\1.final_layer_norm.\2)z1whisper_encoder\.transformer\.norm\.(weight|bias)zwhisper_encoder.layer_norm.\1r   r  r  r  rT   Nc                ,   t                                                       t          t          |j        j                  | _        |j        j        | _        t          | j        dd          | _	        | j	        rt          }nt          t          d          } ||t          |d                    | _        t          d| j        j        dz  z   | j        j        d	d
| j        j                  }t'          j        |t&          j                  | _        d S )N	is_causalFT)init_in_fp32r  )r  r  r9      g        g     @@)num_frequency_binsnum_mel_binsmin_frequencymax_frequencyrl   )dtype)rW   rX   r
   r   r   r  r  r  getattrr  r%   r   r#   r>   r  r   window_sizer  rl   r   r   float32mel_filters)rY   r  r  WhisperEncoderClsr  rZ   s        r[   rX   zVoxtralEncoderModel.__init__  s    	=+*B*LMM"-":"@
 k5AA> 	K 4 'T J J J00#(9:: 
  
  
 & 4;#:a#??1 +3
 
 
 !<5=IIIr\   audio_waveformsc                    |j         }t          j        | j        j                                      |j                  }t          j        || j        j        | j        j        |d          }|dd df         	                                dz  }| j
        j        |z  }t          j        |d                                          }t          j        ||                                dz
            }|d	z   d	z  }|                    |          S )
NT)windowreturn_complex.r}   r  g|=)ming       @g      @)r  r   hann_windowr  r  todevicestft
hop_lengthabsr  Tclamplog10maximummax)rY   r  input_dtyper  r  
magnitudesmel_speclog_specs           r[   compute_whisper_melspecz+VoxtralEncoderModel.compute_whisper_melspec  s     &+"4;#:;;>>?UVVzK#K"
 
 
 #ss(^''))Q.
#%
2;xU33399;;=8<<>>C+?@@sNc){{;'''r\   c                 `    | j         j        j        d         | j         j        j        d         z  S )Nr   )r  conv1strideconv2re   s    r[   r  z%VoxtralEncoderModel.downsample_factor   s.      &-a043G3M3TUV3WW	
r\   c                 *    | j         j        | j        z  S rV   )r  max_source_positionsr  re   s    r[   
chunk_sizezVoxtralEncoderModel.chunk_size&  s    {/$2HHHr\   c                     t          |t                    sJ  fd|D             }g }g }|D ]E}|                     j        d          }||z  }|                    t          |                     Ft          j        |          |fS )Nc                 j    g | ]/}                     |                              j                  0S r   )r  r  r  )ry   rd   rY   s     r[   r   z?VoxtralEncoderModel.prepare_inputs_for_conv.<locals>.<listcomp>0  sG     
 
 
 ((//224:>>
 
 
r\   r}   r5  )r`   r   r>  r  r   rx   r   stack)rY   r  input_featureschunked_featureschunks_per_examplefeaturechunkss   `      r[   prepare_inputs_for_convz+VoxtralEncoderModel.prepare_inputs_for_conv*  s     /400000
 
 
 
(
 
 

 02(*% 	3 	3G]]4?];;F&%%c&kk2222 {+,,.@@@r\   r  c                    t          |t                    s|g}|                     |          \  }}|                     |g          }d}g }|D ]=}||||z                                dd          }|                    |           ||z  }>|S )Nr   r9   )r`   r   r  r  flattenr   )	rY   r  input_embedsr  out	chunk_idxresultsn_chunksresults	            r[   r3  zVoxtralEncoderModel.forward?  s     .$// 	.,-N ,0+G+G+W+W(( ""L>22 	* 	" 	"HY%99:BB1aHHFNN6"""!IIr\   weightc                 R   g d}g }| j         r-|                    ddg           |                    dg           t          |                                           }|\  }}| j        D ]0\  }}t          j        ||          rt          j        |||          }1|D ]>\  }	}
}|
|vr|                    |
|	          }||         }|j	        } ||||            nM|D ] \  }	}
|
|vr
|                    |
|	          }!||         }t          |dt                    } |||           |S )N))r  r  q)r  r  ru  )r  r  v).mlp.gate_up_projz.mlp.fc1r   )r  z.mlp.fc3r9   )z.mlp.down_projz.mlp.fc2weight_loader)r  extendr   r~  mistral_remappingrl  rm  rn  replacer  r  r    )rY   r  stacked_params_mappingparams_mappingparams_dictrr  loaded_weightrv  rw  
param_nameweight_nameshard_idrx  r  s                 r[   ro  zVoxtralEncoderModel.load_weightU  s   "
 "
 "
 > 	 #))88   !!2  
 4002233$m!3 	3 	3MGT|GT** 3vgtT221G 	0 	0-JX$&&<<Z88D%E!/MM%999E+9 = ='
Kd**||K<<%E#E?<QRRMM%///r\   )r   r   r   r  r  r   r   rX   r   r   r  propertyr   r  r  r   r  r  r3  ro  r   r   s   @r[   r#  r#    s       (*H*H*HI2 2 2p 	J J JJ 	J
 
J J J J J J8(( 
( ( ( (( 
3 
 
 
 X

 IC I I I XIAel+A 
u|T#Y&	'A A A A*#lT%,-??	el	   ,1%U\(9": 1s 1 1 1 1 1 1 1 1r\   r#  )or   r:  collections.abcr   r   r   	functoolsr   r   r   typingr	   r
   numpyr   regexrl  r   torch.nnr;  mistral_common.audior   &mistral_common.protocol.instruct.chunkr   r   r   )mistral_common.protocol.instruct.messagesr   (mistral_common.protocol.instruct.requestr   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   r   r   transformersr   r   r   $transformers.tokenization_utils_baser   vllm.configr   r   r   vllm.config.multimodalr   vllm.inputs.datar   vllm.loggerr   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr    vllm.model_executor.modelsr!   )vllm.model_executor.models.module_mappingr"   "vllm.model_executor.models.whisperr#   r$   )vllm.model_executor.models.whisper_causalr%   vllm.multimodalr&   vllm.multimodal.inputsr'   r(   r)   r*   r+   vllm.multimodal.parser,   r-   r.   vllm.multimodal.processingr/   r0   $vllm.multimodal.processing.processorr1   r2   r3   r4   r5   vllm.sequencer6   vllm.tokenizersr7   vllm.tokenizers.mistralr8   
interfacesr:   r;   r<   utilsr=   r>   r   loggerr  rR   r   r   r   register_processorModuler	  r%  r#  r   r\   r[   <module>r     s     7 7 7 7 7 7 7 7 7 7 . . . . . . . .                                      0 0 0 0 0 0 R R R R R R R R R R A A A A A A J J J J J J N N N N N N         
 A @ @ @ @ @ @ @ @ @ : : : : : : C C C C C C C C C C 3 3 3 3 3 3 ' ' ' ' ' ' # # # # # # F F F F F F O O O O O O 1 1 1 1 1 1 D D D D D D        K J J J J J / / / / / /                      
 O N N N N N N N              . - - - - - 8 8 8 8 8 8 4 4 4 4 4 4 O O O O O O O O O O ; ; ; ; ; ; ; ;	X		 








 
 l
 l
 l
 l
 l
 l
 l
 l
^
 
 
 
 
. 
 
 
>8K 8K 8K 8K 8K 67L M 8K 8K 8Kv7= 7= 7= 7= 7=!89N!O 7= 7= 7=t ('	*  
O O O O OI!:|=RO O 
Od
3 3 3 3 329 3 3 3M M M M M") M M M M Mr\   