
    .`iv                        U d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlm	Z	 d dl
mZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZ d d
l m!Z!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA 	 d dlBZBn# eC$ r  eAd          ZBY nw xY we&e+z  ZDeeEd<   e(e-z  ZFeeEd<   e)e.z  ZGeeEd<    edeD          ZH edeF          ZI edeG          ZJe&e+z  e(z  e-z  ZKeeEd<    e5eL          ZM G d de!          ZNdS )     N)AsyncGeneratorCallable)cached_property)Literal	TypeAliasTypeVarcast)Request)PreTrainedTokenizerBase)EngineClient)RequestLogger)DeltaMessageErrorResponseRequestResponseMetadata	UsageInfo)OpenAIServingSpeechToTextRequest)OpenAIServingModels)
TranscriptionResponse!TranscriptionResponseStreamChoiceTranscriptionResponseVerboseTranscriptionSegmentTranscriptionStreamResponseTranslationResponseTranslationResponseStreamChoiceTranslationResponseVerboseTranslationSegmentTranslationStreamResponse)VLLMValidationError)
PromptType)init_logger)FlatLogprobsLogprob)SupportsTranscriptionsupports_transcription)RequestOutput)get_tokenizer)PlaceholderModulelibrosaSpeechToTextResponseSpeechToTextResponseVerboseSpeechToTextSegmentT)boundVSResponseTypec                       e Zd ZdZddddddedededz  d	ed
ed         dedef fdZ	d0dZ
d0dZedee         fd            Zdededeee         ef         fdZ	 d1dedeeeeef                  z  dedee         dedee         fdZdedededeeez           de de!e"df         f         deez  e!e"df         z  e#z  fd Z$ded!ee!e%df                  d"e"d#e&d$ed%ed&         d'ee'         ee(         z  d(ee)         ee*         z  de!e"df         fd)Z+de,j-        d*edee,j-                 fd+Z.d,e,j-        d-ed.edefd/Z/ xZ0S )2OpenAISpeechToTextzPBase class for speech-to-text operations like transcription and
    translation.F
transcribe)return_tokens_as_token_ids	task_typelog_error_stackenable_force_include_usageengine_clientmodelsrequest_loggerNr5   r6   r4   	translater7   r8   c                H   t                                          |||||           | j                                        | _        || _        | j                            | j        |          | _        || _	        t          j        | _        | j        j        r=t          t          t!          | j        j        | j        j                            | _        | j        r t&                              d| j                   |                                  |                                  d S )N)r9   r:   r;   r5   r7   )tokenizer_nametokenizer_modez6Overwriting default completion sampling param with: %s)super__init__model_configget_diff_sampling_paramdefault_sampling_paramsr6   	model_clsget_speech_to_text_config
asr_configr8   envsVLLM_MAX_AUDIO_CLIP_FILESIZE_MBmax_audio_filesize_mbsupports_segment_timestampr	   r   r'   	tokenizerr@   loggerinfo_warmup_audio_preprocessing_warmup_input_processor)	selfr9   r:   r;   r5   r6   r7   r8   	__class__s	           /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/entrypoints/openai/translations/speech_to_text.pyrB   zOpenAISpeechToText.__init__L   s)    	')'A+ 	 	
 	
 	
 (,'8'P'P'R'R$".BBy
 
 +E'%)%I">4 	!'#'#4#>#'#4#C   DN ' 	KKH,   	((***$$&&&&&    returnc                    t          t          t                    rdS t          | j                  sdS 	 t          j                    }t                              d           t          j
        t          | j        j                  t          j                  }t          j        || j        j                  }ddlm}  || j                  }d}t'          |d          r|j        }n.t'          |d          r|j        }t'          |d          r|j        }|[t          j                            || j        j        t1          |d	d
          t1          |dd          t1          |dd                    }t          j                    |z
  }t                              d|           dS # t2          $ r t                              d           Y dS w xY w)a:  Warm up audio processing libraries to avoid first-request latency.

        The first call to librosa functions (load, get_duration, mel-spectrogram)
        triggers JIT compilation and library initialization which can take ~7s.
        This method warms up these operations during server initialization.
        Nz+Warming up audio preprocessing libraries...dtypeysrr   )cached_processor_from_configfeature_extractoraudio_processorn_mels   n_ffti  
hop_length   )r[   r\   r`   rb   rc   z-Audio preprocessing warmup completed in %.2fsz_Audio preprocessing warmup failed (non-fatal): %s. First request may experience higher latency.)
isinstancer)   r(   r%   rF   timeperf_counterrN   rO   npzerosintrH   sample_ratefloat32get_duration!vllm.transformers_utils.processorr]   rC   hasattrr^   r_   featuremelspectrogramgetattr	Exception	exception)	rR   warmup_startdummy_audio_r]   	processorr^   
audio_procwarmup_elapseds	            rT   rP   z.OpenAISpeechToText._warmup_audio_preprocessing}   s    g011 	F &dn55 	F,	,..LKKEFFF (3t'B#C#C2:VVVK ${t7RSSSA      54T5FGGI $y"566 E$-$?!!$566 E&6
:':;; E(2(D% !,O22!2"#4hDD!"3WcBB&'8,LL 3   ".00<?NKKGXXXXX 	 	 	?     	s   E0F& &$GGc           	         t          | j                  sdS t          | j        d          sdS 	 ddlm} t          j                    }t                              d           t          j
        t          | j        j                  t          j                  }| j                            || j        | j        d| j        dd	          } |d
dd          }| j                            d||          }t          j                    |z
  }t                              d|           dS # t(          $ r t                              d           Y dS w xY w)a<  Warm up input processor with dummy audio to avoid first-request latency.

        The first call to input_processor.process_inputs() with multimodal audio
        triggers multimodal processing initialization which can take ~2.5s.
        This method processes a dummy audio request to warm up the pipeline.
        Nget_generation_promptr   )SamplingParamsz(Warming up multimodal input processor...rX   en audio
stt_configrC   languager6   request_promptto_language           T)
max_tokenstemperature
skip_clonewarmup)
request_idpromptparamsz)Input processor warmup completed in %.2fsz[Input processor warmup failed (non-fatal): %s. First request may experience higher latency.)r%   rF   ro   vllm.sampling_paramsr}   rf   rg   rN   rO   rh   ri   rj   rH   rk   rl   r|   rC   r6   input_processorprocess_inputsrs   rt   )rR   r}   ru   rv   dummy_promptdummy_paramsrw   rz   s           rT   rQ   z*OpenAISpeechToText._warmup_input_processor   s    &dn55 	F t~'>?? 	F+	;;;;;;,..LKKBCCC (3t'B#C#C2:VVVK  >??!?!..!  @  L *>  L $33### 4  A ".00<?NKKC^TTTTT 	 	 	?     	s   C6D' '$EEc                 n    ddl m}  || j                  }t          t          t
                   |          S )Nr   )get_model_cls) vllm.model_executor.model_loaderr   rC   r	   typer$   )rR   r   rF   s      rT   rF   zOpenAISpeechToText.model_cls   s<    BBBBBB!M$"344	D./;;;rU   request
audio_datac           
        K   | j                             |j                  }|j        r| j                             |j                  nd }t	          |          dz  | j        k    r"t          ddt	          |          dz            t          j        |          5 }t          j
        || j        j                  \  }}d d d            n# 1 swxY w Y   t          j        ||          }| j        j        o|| j        j        k    }	|	s|gn"|                     |t#          |                    }
g }|
D ]}| j                             || j        | j        || j        |j        |          }|j        dk    rt/          |t0                    s$t          d	d
t3          |          j                  t7          t0          |          }|                    d          }t/          |t:                    s$t          ddt3          |          j                  |                    dd          |d<   |                    |           ||fS )Ni   zMaximum file size exceededaudio_filesize_mb)	parametervalue)r\   rZ   r   verbose_jsonzExpected prompt to be a dictr   decoder_promptz!Expected decoder_prompt to be strz<|notimestamps|><|0.00|>) rF   validate_languager   r   lenrK   r   ioBytesIOr)   loadrH   rk   rm   allow_audio_chunkingmax_audio_clip_s_split_audiorj   r|   rC   r6   r   response_formatre   dictr   __name__r	   getstrreplaceappend)rR   r   r   r   r   bytes_r[   r\   durationdo_split_audiochunkspromptschunkr   prompt_dictr   s                   rT   _preprocess_speech_to_textz-OpenAISpeechToText._preprocess_speech_to_text   s      >33G4DEE "DN,,W-@AAA 	 z??W$t'AAA%,-*oo/    Z
## 	Iv LDO,GHHHEAr	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I
 '!333O0 <4?;; 	 +M!0A0A!SWW0M0M 	# 	#E ^99?!.!.&~' :  F &.88!&$// -6"*"6ll3   
 #400!',<!=!=!.#66 -;"2">22;   
 1?0F0F&
1 1,- NN6""""  s   $CCCr   tokens	log_probssegment_class
start_timec                    d}| j                             dd          d         }|d         | j         j        k    r
|dd         }|f|z   }g }	d}
|d         |k     r|d         |k    r||d         fz   }d	}t          d
t	          |                    D ].}||         }||k    r||d
z
           |k    r||
|         }|d         |z
  }|d         |z
  }| j                             |d
d                   }|                    d          }t          t           |t	          |	          ||||z  z   |||z  z   |j        |t	          |          t	          t          j
        |                    z  |d
d         |||
z
  z  	  	                  }|	                    |           |}
d}|||d
z
           |         j        z  }0|	S )a  
        Convert tokens to verbose segments.

        This method expects the model to produce
        timestamps as tokens (similar to Whisper).
        If the tokens do not include timestamp information,
        the segments may not be generated correctly.

        Note: No_speech_prob field is not supported
        in this implementation and will be None. See docs for details.
        g{Gz?r   F)add_special_tokensr   Nr   r   zutf-8)	idseekstartendr   textcompression_ratior   avg_logprob)rM   encodeeos_token_idranger   decoder	   r,   r   zlibcompressr   logprob)rR   r   r   r   r   r   BASE_OFFSET
init_tokentokens_with_startsegmentslast_timestamp_startr   idxtokensliced_timestamp_tokensstart_timestampend_timestampr   
text_bytescasting_segments                       rT   _get_verbose_segmentsz(OpenAISpeechToText._get_verbose_segments?  s!   & ^**:%*PPQRS
":444CRC[F'MF2.0 R :--2CB2G:2U2U 15Fr5J4L LC 12233 "	A "	AC &c*E
""'8q'AZ'O'O*;<PQT<T*U'"9!"<z"I 7 ;j H~,,-DQrT-JKK![[11
"&'!Mx=='(;+HH&})DD$+$7!
 +.j//dmJ7788+96qt<$/39M3M$N  # #& 000'*$yq1%8@@rU   raw_requestresponse_classstream_generator_method.c           
      
   K                         |           d{V }||S  j        j        r j        j        |j        dvr                     d          S |j        dk    r) j        j        s                     d|j                   S |j        dk    r|j	        r                     d          S  j
         d                     |           t                    }|r||j        _        	                      |                               ||	           d{V \  }}	nF# t"          $ r9}
t$                              d
                                |
          cY d}
~
S d}
~
ww xY wd}	 |j         j        j        }nt/           j        j        |j                  }|                    | j                  |j        dk    rd_                             |j                    fdt;          |          D             }n,# t"          $ r}
                     |
          cY d}
~
S d}
~
ww xY w|j	        r |||||	          S g }g }	 |J t<          t>          d}| j
                 }d} j         j!        }|tE          |          dk    s
J d            t;          |          D ]\  }}|tG          ||z            nd}|2 3 d{V }|j        dk    r|j$        d         j        sJ  %                    tM          |j$        d         j'                  ||||j$        d         j                  }|(                    |           |(                    d |D                        |)                    |j$        d         j*                   6 d+                    |          } j
        dk    rdtY          t[          j.        |	                    d}|j        dk    r%t_          t`          tc          ||                    }nt_          td          tg          ||j4        tk          |	          |                    }ng|j        dk    r$t_          t`          tm          |                    }n8t_          td          to          ||j4        tk          |	          |                    }|S # tp          j9        $ r                      d          cY S t"          $ r}
                     |
          cY d}
~
S d}
~
ww xY w)zUBase method for speech-to-text operations like transcription and
        translation.N)r   jsonr   zFCurrently only support response_format`text`, `json` or `verbose_json`r   z*Currently do not support verbose_json for z2verbose_json format doesn't support streaming case-)r   )r   r   z$Error in preprocessing prompt inputsr   )r   lora_requestc           	      \    g | ](\  }}j                             | d |           )S )rw   )r   )r9   generate).0ir   r   r   sampling_paramsrR   s      rT   
<listcomp>z=OpenAISpeechToText._create_speech_to_text.<locals>.<listcomp>  sa     % % % Av "++#!''A''!-	 ,  % % %rU   r<   r   z:`max_audio_clip_s` is set to None, audio cannot be chunkedr   r   )r   r   r   r   r   c                     g | ]	}|j         
S  r   )r   segs     rT   r   z=OpenAISpeechToText._create_speech_to_text.<locals>.<listcomp>  s    *H*H*H38*H*H*HrU   r4   r   )r   seconds)r   usage)r   r   r   r   r   zClient disconnected):_check_modelr9   errored
dead_errorr   create_error_responserF   rL   modelstreamr6   _base_request_idr   staterequest_metadata_maybe_get_adaptersr   
ValueErrorrN   rt   max_completion_tokensrC   max_model_lenminto_sampling_paramsrE   logprobs_log_inputsr   	enumerater   r   rH   r   r   floatoutputsr   tuple	token_idsextendr   r   joinrj   mathceilr	   r-   r   r/   r   r   r   r   r   asyncioCancelledError)rR   r   r   r   r   r   error_check_retr   r   
duration_selist_result_generatordefault_max_tokenstotal_segments
text_partssegments_typesr   r   chunk_size_in_sr   result_generatorr   opr   r   final_responser   r   r   s   `                         @@@rT   _create_speech_to_textz)OpenAISpeechToText._create_speech_to_text  s      !% 1 1' : :::::::&""
 % 	0$//"*JJJ--7   #~55N= 6 --LW]LL   "n444--D   MM)>)>{)K)KMM
2jIII 	B1AK.
	133G<<L(,(G(G% )H ) ) # # # # # #GZZ
  	1 	1 	1CDDD--a00000000	1 SW#	1
 ,4%)%6%D""%(%3W5R& &" &88"D$@ O &.88+,(&)    % % % % % % % "+7!3!3% % %!!  	1 	1 	1--a00000000	1 > 	**.
<Lj   
K	1(4442/D DN 8Fdn7UMD"o>O&011Q666P 766 *33H)I)I > >%%4C4OE#/000UX  !1 > > > > > > >"..@@!z!}5555 66',RZ]-D'E'E.;(/+5*,*Q-*@ 7   ! '--h777"))*H*Hx*H*H*HIIII"))"*Q-*<====! !1 0" 77:&&D~-- '"49Z#8#899 
 *n<<%)0d%HHH& &NN &*4!%%,%5%(__%3	  & &NN *n<<%)!-@d-K-K-K%L%LNN%)2!%%,%5%(__%3	  & &N "!% 	E 	E 	E--.CDDDDD 	1 	1 	1--a00000000	1so   55D+ +
E.5.E)#E.)E.4BH 
H:H5/H:5H:A9R1 NGR1 1$S?	S? S:4S?:S?r  r   r   audio_duration_schunk_object_type)ztranslation.chunkztranscription.chunkresponse_stream_choice_classstream_response_classc	           	     h  K   t          t          j                              }	|j        }
d}d}| j        p|j        }|r|j        r|j        nd}	 |D ]"}|2 3 d {V }|j        At          |j                  }| j        	                    || j
        | j                  x}r||z  }t          |j                  dk    sJ |j        d         }t          |j                  }|t          |j                  z  }|j         ||          }n |||j        |j                  } ||||	|g|
          }|rt%          ||||z             |_        |                    d	
          }d| dW V  6 $|rFt%          ||||z             } ||||	g |
|          }|                    d	d	          }d| dW V  t%          ||||z             |_        nU# t,          $ rH}t.                              d| j                   |                     |          }d| dW V  Y d }~nd }~ww xY wdW V  d S )Nr   Fr   )content)delta)r  finish_reasonstop_reason)r   objectcreatedchoicesr   )prompt_tokenscompletion_tokenstotal_tokensT)exclude_unsetzdata: z

)r   r   r!  r"  r   r   )r&  exclude_nonezError in %s stream generator.zdata: [DONE]

)rj   rf   r   r8   stream_include_usagestream_continuous_usage_statsprompt_token_idsr   rF   get_num_audio_tokensrH   rC   r  r   r   r  r  r  r   r   model_dump_jsonfinal_usage_infors   rN   rt   r6   create_streaming_error_response)rR   r   r  r   r   r  r  r  r  created_time
model_namer$  num_prompt_tokensinclude_usageinclude_continuous_usager  resaudio_tokensoutputdelta_messagechoice_datar   datafinal_usagefinal_usage_chunkfinal_usage_datar  s                              rT    _speech_to_text_stream_generatorz3OpenAISpeechToText._speech_to_text_stream_generator5  sv      49;;'']
7W7;W !(!FG11 	!T	&$9 1. 1. !1 0. 0. 0. 0. 0. 0. 0.#+7,/0D,E,E)+/>+N+N,dot?P, , < > .=- s{++q0000 [^F$0$E$E$EM%V-=)>)>>%+3&B&B&W&W&W 'C&B"/*0*>(.(:' ' ' 21%0 ,!,(  E 0 &/*;.?):=N)N' ' ' !00t0DDD-4--------a "2!1h  6'"3&7!25F!F   %:$9!,($%% % %! $5#D#D"&T $E $ $  6/5555555 1:/"3.1BB1 1 1--  	& 	& 	&<dnMMM77::D%4%%%%%%%%%%%%	&
 !      s%   G E2E;G 
H*">H%%H*rk   c                    | j         j        J d| j         j        d            || j         j        z  }|| j         j        z  }g }d}||j        d         k     r||z   |j        d         k    r |                    |d|d f                    no||z   |z
  }t          ||z   |j        d                   }|                     |||          }	|                    |d||	f                    |	}||j        d         k     |S )Nz!self.asr_config.max_audio_clip_s=z+ cannot be None to split audio into chunks.r   r   .)rH   r   overlap_chunk_secondshaper   r   _find_split_point)
rR   r   rk   
chunk_sizeoverlap_sizer   r   search_start
search_endsplit_points
             rT   r   zOpenAISpeechToText._split_audio  s3    /;;(t/ ( ( ( <;; !4?#CC
"T_%II*"2&&&:~!1"!555jabb1222 z>L8LQ^Z-=b-ABBJ00\:VVK MM*S!K-%78999A *"2&&& rU   wav	start_idxend_idxc                    |||         }t           j        }d}| j        j        }|J t	          dt          |          |z
  |          D ]6}||||z            }	|	dz                                  dz  }
|
|k     r||z   }|
}7|S )a.  Find the best point to split audio by
        looking for silence or low amplitude.
        Args:
            wav: Audio tensor [1, T]
            start_idx: Start index of search region
            end_idx: End index of search region
        Returns:
            Index of best splitting point
        r   N   g      ?)r  infrH   min_energy_split_window_sizer   r   mean)rR   rG  rH  rI  segment
min_energyquietest_idxmin_energy_windowr   windowenergys              rT   rA  z$OpenAISpeechToText._find_split_point  s     i'( X
 OH ,,,q#g,,)::<MNN 	$ 	$AQ%6!667Fai%%''3.F
"" 9}#
rU   )rV   N)r   )1r   
__module____qualname____doc__r   r   r   boolr   rB   rP   rQ   r   r   r$   rF   r   bytesr  listr    r   r   r"   r   rj   r#   r,   r   r
   r-   r/   r   r   r   r   r  r&   r   r   r   r   r   r=  rh   ndarrayr   rA  __classcell__)rS   s   @rT   r3   r3   H   s         ,18D %+0/' /' /'#/' $/'
 &,/' %)/' 45/' /' %)/' /' /' /' /' /'b; ; ; ;z: : : :x <4 56 < < < _<@!$@! @! 
tJ&	'	@! @! @! @!P B BB  $tCL'9"::B %	B
 /0B B 
!	"B B B BHp1p1 %p1 	p1
 QUp1 "*#~c4i/H*H!Ip1 
QT	*	*]	:p1 p1 p1 p1dp!$p!  $N=$3F$GHp! 	p!
 2p!  p! ##MNp! '++L&M
.
/'0p!  $$?@
(
) *p! 
T		"p! p! p! p!d*36	bj	   6RZ C # RU        rU   r3   )Or  r   r  rf   r   collections.abcr   r   	functoolsr   typingr   r   r   r	   numpyrh   fastapir
   transformersr   	vllm.envsrI   vllm.engine.protocolr   vllm.entrypoints.loggerr   'vllm.entrypoints.openai.engine.protocolr   r   r   r   &vllm.entrypoints.openai.engine.servingr   r   &vllm.entrypoints.openai.models.servingr   -vllm.entrypoints.openai.translations.protocolr   r   r   r   r   r   r   r   r   r   vllm.exceptionsr   vllm.inputs.datar    vllm.loggerr!   vllm.logprobsr"   r#   vllm.model_executor.modelsr$   r%   vllm.outputsr&   vllm.tokenizersr'   vllm.utils.import_utilsr(   r)   ImportErrorr*   __annotations__r+   r,   r-   r/   r0   r1   r   rN   r3   r   rU   rT   <module>rt     s    				    4 4 4 4 4 4 4 4 % % % % % % 4 4 4 4 4 4 4 4 4 4 4 4           0 0 0 0 0 0       - - - - - - 1 1 1 1 1 1            V U U U U U U U F F F F F F                        0 / / / / / ' ' ' ' ' ' # # # # # # / / / / / / / / T T T T T T T T & & & & & & ) ) ) ) ) ) 5 5 5 5 5 5+NNNN + + +	**GGG+ #8:M"M i M M M #== Y    "68J!J Y J J JGC+,,,GC2333GC*+++ "# !! i    
X		Q
 Q
 Q
 Q
 Q
 Q
 Q
 Q
 Q
 Q
s   9B> >CC