
    .`i+-                        d dl Z d dlmZ d dlmZmZ d dlZd dlZd dl	m
Z
 d dlmZmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 ddl1m2Z2  ee3          Z4 G d de          Z5 G d dej6        j7                  Z8dej9        de:dej9        fdZ; e j<        e5ee           G d d e                      Z=dS )!    N)Mapping)Literalcast)RawAudio)StreamingModeTranscriptionRequest)Audio)ModelConfigSpeechToTextConfig
VllmConfig)
PromptType)init_logger)MultiModalEmbeddings)VoxtralDummyInputsBuilderVoxtralForConditionalGenerationVoxtralMultiModalProcessorVoxtralProcessingInfo)MULTIMODAL_REGISTRY)_IBaseMultiModalProcessorCache)MultiModalKwargsOptionalItems)MultiModalDataItems)BaseDummyInputsBuilder)MultiModalPromptUpdatesPlaceholderFeaturesInfo)IntermediateTensors)cached_tokenizer_from_config   )_flatten_embeddingsc                        e Zd Zdddedee         dedz  ddf fdZded	ee	         d
e
dededeee	         eeee         f         f         fdZ xZS )#VoxtralStreamingMultiModalProcessorNcacheinfodummy_inputsr#   returnc                P    t                                          ||d            d S )Nr"   )super__init__)selfr$   r%   r#   	__class__s       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/voxtral_streaming.pyr)   z,VoxtralStreamingMultiModalProcessor.__init__1   s*     	|488888    mm_items
prompt_ids	mm_kwargsmm_prompt_updatesis_update_appliedc                 f   |                     dg           }t          |          dk    sJ d|            | j                                        }|j        j        j        }|d         d         j        j        d         }	|	                    |	          }
t          ddd|
dgz  d           }|d|gifS )Naudior   z;Expected only one audio input for streaming, got mm_kwargs=r   audio_arrays)modalityitem_idx	start_idxtokensis_embed)getlenr$   get_tokenizerinstructaudio_encoderaudio_configdatashapenum_audio_tokensr   )r*   r.   r/   r0   r1   r2   audios	tokenizerr@   num_audio_sampleslengthfeatures_infos               r,   _maybe_apply_prompt_updatesz?VoxtralStreamingMultiModalProcessor._maybe_apply_prompt_updates;   s     w++6{{aL	LL   I++--	 )7D"1In5:@C../@AA/c
 
 
 Gm_555r-   )__name__
__module____qualname__r   r   r   r)   r   listintr   r   booltupler   strr   rI   __classcell__r+   s   @r,   r!   r!   0   s         6:9 9 99 -R09
 ,d29 
9 9 9 9 9 96%6 I6 1	6
 36  6 
tCy'#t,C'D"DEE	F6 6 6 6 6 6 6 6r-   r!   c                   V     e Zd ZdZd
dededdf fdZdej        dej        fd	Z	 xZ
S )TimeEmbeddingz&Sinusoidal Embedding for encoding time     @dimthetar&   Nc                 `   t                                                       || _        || _        t	          j        t          j        | j                   t	          j        | j        dz            	                                z  | j        dz  z            }| 
                    d|d           d S )N   inv_freqF)
persistent)r(   r)   rW   rX   torchexpmathlogarangefloatregister_buffer)r*   rW   rX   r[   r+   s       r,   r)   zTimeEmbedding.__init___   s    
9Xdj!!!l48q=))//112x1}
 

 	ZeDDDDDr-   tc                     |d         }| j                             |j        |j                  }||z  }t	          j        |                                |                                fd          S )N).NdevicedtyperW   )r[   torg   rh   r]   catcossin)r*   rd   r[   embs       r,   forwardzTimeEmbedding.forwardj   sa    iL=##1817#CCL 	 y#''))SWWYY/R8888r-   )rV   )rJ   rK   rL   __doc__rN   rb   r)   r]   Tensorrp   rR   rS   s   @r,   rU   rU   \   s        00	E 	EC 	E 	ED 	E 	E 	E 	E 	E 	E9 9%, 9 9 9 9 9 9 9 9r-   rU   input_tensorscalingr&   c                     | |z  }t          j        || j                  }|                    d          |z                       d          S )N)rg   r   ri   )r]   ra   rg   	unsqueezeview)rs   rt   baseoffsetss       r,   _expand_tensorrz   s   sL    '!D l7<+>???G NN1'--b111r-   )r$   r%   c                       e Zd ZdZdddedef fdZed             Z	 d!ddd	d
e	j
        dedz  de	j
        dz  dede	j
        f
dZ	 	 d"d
e	j
        de	j
        dedz  de	j
        dz  dede	j
        ez  fdZdee	j
                 e	j
        z  ee	j
        df         z  dz  fdZedededefd            Zedej        dedededz  ded         dededz  defd             Z xZS )#VoxtralStreamingGenerationT )prefixvllm_configr~   c                P   t                                          ||           t          | j        j        j                  | _        | j        j        j	        j
        }|j        |j        z  dz  }|                                sJ d|             t          |          | _        d S )N)r   r~   rj   i  z$n_delay_tokens must be integer, got )r(   r)   rU   configtext_confighidden_sizetime_embeddingrE   r>   r?   r@   
frame_ratetranscription_delay_ms
is_integerrN   n_delay_tokens)r*   r   r~   r@   _n_delay_tokensr+   s        r,   r)   z#VoxtralStreamingGeneration.__init__   s    [@@@-:'3.
 .
 .
 ~.<I#l&IIDP 	 ))++ 	
 	
D?DD	
 	
+ "/22r-   c                 .    | j         j        j        j        S N)rE   r>   r?   r@   )r*   s    r,   r@   z'VoxtralStreamingGeneration.audio_config   s    ~&4AAr-   N)is_multimodalhandle_oov_mm_token	input_idsmultimodal_embeddingsr   r   r&   c                f    |J t          |          dk    s
J d            t          |          }|S )z+Pass post-conv embeddings directly as inputNr   zDFor streaming you must provide a multimodal_embedding at every step.)r<   r   )r*   r   r   r   r   mm_embeds_flats         r,   embed_input_idsz*VoxtralStreamingGeneration.embed_input_ids   sK     %000())A---R .-- --BCCr-   	positionsintermediate_tensorsinputs_embedskwargsc                    |J |J | j         j        j        }|                    |j        d         |z  |j        d         |z            }t          ||          }| j                            ||          }|j        \  }	}
|	| j        z  dk    sJ |                    |	| j        z  |
| j        z            }| 	                    |          }| j
                            |          }||z   }t          j        | j        g|j        |j                  }|                     |          }| j
                            |||||          }|S )Nr   r   rf   )r   t_cond)r   r@   block_pool_sizerw   rB   rz   whisper_encoderdownsample_factorreshapeaudio_language_adapterlanguage_modelr   r]   tensorr   rg   rh   r   model)r*   r   r   r   r   r   	pool_sizewhisper_positionsaudio_hidden_states
num_tokensaudio_hidden_sizeaudio_text_embedstext_embedstime_tensorr   hidden_statess                   r,   rp   z"VoxtralStreamingGeneration.forward   s    ((($$$K,<	%**"Y.0CA0F)0S
 
 +9i@@"2BB,
 
 )<(A%
%D22a7777199$00 66
 
 !778KLL)99)DD *K7l ! '%
 
 

 $$[11+11 ' 2 
 
 r-   .c                    	   j         di |}|
J d            dt          j        dt          dt          dt          j        fd fd|D             }fd	|D             }d
 |D             } j        j                            |          } j        j        j        |                    fd|D             d          } j        j	        j
        		fd|D             }	fd|D             }|S )zATransform audio waveforms -> initial whisper post-conv embeddingsNz<For streaming you must provide an audio input at every step.samplemult_ofposr&   c                     |dv s
J |            | j         |         |z  x}dk    r<|dk    r
| |d          n| d d |d f         } | j         |         dk    sJ d|             | S )Nr   r   r   z*Sample is empty after truncation with ctx rB   )r   r   r   ctxs       r,   _truncate_leftzCVoxtralStreamingGeneration.embed_multimodal.<locals>._truncate_left   s     &===#===|C(722q88),vaaag|C(1,,,FFF -,, Mr-   c                 ~    g | ]9}j                             |                              j         j                  :S  )r   compute_whisper_melspecrk   rh   ).0r4   r*   s     r,   
<listcomp>z?VoxtralStreamingGeneration.embed_multimodal.<locals>.<listcomp>   sU     
 
 
   88??BB$* 
 
 
r-   c                 *    g | ]} |d d          S )rZ   r   r   )r   melr   s     r,   r   z?VoxtralStreamingGeneration.embed_multimodal.<locals>.<listcomp>   s'    JJJcsAq11JJJr-   c                 (    g | ]}|j         d          S )r   r   )r   r   s     r,   r   z?VoxtralStreamingGeneration.embed_multimodal.<locals>.<listcomp>  s    999SCIaL999r-   c                     g | ]}|z  S r   r   )r   sconv_strides     r,   r   z?VoxtralStreamingGeneration.embed_multimodal.<locals>.<listcomp>	  s    000!Q+000r-   r   rj   c                 *    g | ]} |d           S )r   r   )r   r   r   r   s     r,   r   z?VoxtralStreamingGeneration.embed_multimodal.<locals>.<listcomp>  s7     '
 '
 '
 N69a00'
 '
 '
r-   c                 t    g | ]4}|                     |j        d          z  |j        d         z            5S r   )rw   rB   )r   er   s     r,   r   z?VoxtralStreamingGeneration.embed_multimodal.<locals>.<listcomp>  sM     '
 '
 '
 FF171:*AGAJ,BCC'
 '
 '
r-   r   ) _parse_and_validate_audio_arraysr]   rr   rN   r   forward_convtotal_stridesplitr   r@   r   )
r*   r   audio_inputsmel_featuresseq_lensaudio_embeddingsaudio_embeddings_per_sampler   r   r   s
   `      @@@r,   embed_multimodalz+VoxtralStreamingGeneration.embed_multimodal   s    =t<FFvFF''J (''
	L
	+.
	58
	\
	 
	 
	 
	
 
 
 
 &	
 
 
 KJJJ\JJJ99L999/?LL
 
 *:G&6&<&<0000x000a '= '
 '
#
 K,<	'
 '
 '
 '
 '
5'
 '
 '
#
'
 '
 '
 '
0'
 '
 '
# +*r-   model_config	task_typec                 t    t          |          }|j        j        j        }|j        }t          d |d           S )N)max_audio_clip_ssample_ratemin_energy_split_window_size)r   r>   r?   r@   sampling_rater   )clsr   r   rE   r@   r   s         r,   get_speech_to_text_configz4VoxtralStreamingGeneration.get_speech_to_text_config  sG     1>>	 )7D"0!!#)-
 
 
 	
r-   r4   
stt_configlanguage)
transcribe	translaterequest_promptto_languagec                    t          |          }t          |t          |j                  d          }t	          |j        t          j        |          |t          j	                  }	|j
                            |	          }
|
j        d         j        |j        f}dd|ii}|
j        |d<   t          t           |          S )Nwav)format)r   r4   r   	streamingr   multi_modal_datar4   prompt_token_ids)r   r	   rN   r   r   r   r   
from_audior   OFFLINEr>   encode_transcriptionrD   audio_arrayr9   r   r   )r   r4   r   r   r   r   r   r   rE   req	tokenizedprompts_dicts               r,   get_generation_promptz0VoxtralStreamingGeneration.get_generation_prompt'  s     1>>	eS!788GGG"$%e,,#+	
 
 
 &;;C@@	!!$0*2HI*We,<=+4+;'(J---r-   r   )NN)rJ   rK   rL   requires_raw_input_tokensr   rQ   r)   propertyr@   r]   rr   r   rO   r   r   objectrp   rM   rP   r   classmethodr
   r   r   npndarrayr   r   r   rR   rS   s   @r,   r|   r|      s[        !%AC 3 3 3z 33 3 3 3 3 3 3  B B XB >B
 .2$(  <  4d:
 |d* " 
   . <@-11 1<1 <1 2D8	1
 |d*1 1 
+	+1 1 1 1f7+	el	el	*U5<3D-E	E	L7+ 7+ 7+ 7+r 

&

36

	

 

 

 [

 .z. ". '	.
 *. 45. . 4Z. 
. . . [. . . . .r-   r|   )>r_   collections.abcr   typingr   r   numpyr   r]   &mistral_common.protocol.instruct.chunkr   -mistral_common.protocol.transcription.requestr   r   &mistral_common.tokens.tokenizers.audior	   vllm.configr
   r   r   vllm.inputs.datar   vllm.loggerr   %vllm.model_executor.models.interfacesr   "vllm.model_executor.models.voxtralr   r   r   r   vllm.multimodalr   vllm.multimodal.cacher   r   vllm.multimodal.inputsr   vllm.multimodal.parser   vllm.multimodal.processingr   $vllm.multimodal.processing.processorr   r   vllm.sequencer   vllm.tokenizersr   utilsr   rJ   loggerr!   nnModulerU   rr   rN   rz   register_processorr|   r   r-   r,   <module>r     s    # # # # # #                      ; ; ; ; ; ;        9 8 8 8 8 8 C C C C C C C C C C ' ' ' ' ' ' # # # # # # F F F F F F            0 / / / / / B B B B B B B B      6 5 5 5 5 5 = = = = = =        . - - - - - 8 8 8 8 8 8      
X		)6 )6 )6 )6 )6*D )6 )6 )6X9 9 9 9 9EHO 9 9 9.	2 	2 	2 	2 	2 	2 	2 (''	*  
}. }. }. }. }.!@ }. }. 
}. }. }.r-   