o
     ei6                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	 d dl
Zd dlmZ eG dd dZ		ddejd	ee d
edee fddZdedfdejdee d
edede	eej eeeef  f f
ddZG dd dZejdd ZG dd dZdS )    N)	dataclass)DictListOptionalTuple)get_assets_pathc                   @   sv   e Zd ZU dZdZeed< dZeed< dZe	ed< edZ
eed	< d
Ze	ed< dZe	ed< dZe	ed< dZeed< dS )
VadOptionsa  VAD options.

    Attributes:
      threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
      neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
        than neg_threshold, it is always considered silence. Values higher than neg_threshold
        are only considered speech if the previous sample was classified as speech; otherwise,
        they are treated as silence. This parameter helps refine the detection of speech
         transitions, ensuring smoother segment boundaries.
      min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out.
      max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
        than max_speech_duration_s will be split at the timestamp of the last silence that
        lasts more than min_silence_at_max_speech (if any), to prevent aggressive cutting.
        Otherwise, they will be split aggressively just before max_speech_duration_s.
      min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
        before separating it
      speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
      min_silence_at_max_speech: Minimum silence duration in ms which is used to avoid abrupt cuts
          when max_speech_duration_s is reached.
      use_max_poss_sil_at_max_speech: Whether to use the maximum possible silence at
          max_speech_duration_s or not. If not, the last silence is used.
    g      ?	thresholdNneg_thresholdr   min_speech_duration_msinfmax_speech_duration_si  min_silence_duration_msi  speech_pad_msb   min_silence_at_max_speechTuse_max_poss_sil_at_max_speech)__name__
__module____qualname____doc__r	   float__annotations__r
   r   intr   r   r   r   r   bool r   r   2/home/jaya/work/projects/WHISPER/faster_asr/vad.pyr      s   
 r   >  audiovad_optionssampling_ratereturnc           %      K   s   |du rt di |}|j}|j}|j}|j}|j}d}	|j}
|j}|j}|| d }||
 d }|| |	 d|  }|| d }|| d }t	| }t
 }t| d|	| jd |	  f}||}d}g }i }g }|du rtt|d d}d}d }}t|D ]\}}|	| }||kr|r|| } | |kr||| f d}||k r|}||kr|sd	}||d
< q~|r(||d
  |kr(|r|rt|dd d\}}!||d< || i }||! }||| k r||d
< nd}d } }}g }n9|r||d< || i }||k rd}n||d
< d } }}g }n||d< || i }d } }}d}g }q~||k rj|rj|s5|}|| }"|sC|"|krC|}|"|k rIq~||d< |d |d
  |kr]|| i }d } }}d}g }q~q~|r||d
  |kr||d< || t|D ]\}}#|dkrttd|#d
 | |#d
< |t	|d kr||d  d
 |#d  }$|$d| k r|#d  t|$d 7  < ttd||d  d
 |$d  ||d  d
< qtt||#d | |#d< ttd||d  d
 | ||d  d
< qtt||#d | |#d< q|S )a  This method is used for splitting long audios into speech chunks using silero VAD.

    Args:
      audio: One dimensional float array.
      vad_options: Options for VAD processing.
      sampling rate: Sampling rate of the audio.
      kwargs: VAD options passed as keyword arguments for backward compatibility.

    Returns:
      List of dicts containing begin and end samples of each speech chunk.
    N   i     r   Fg333333?g{Gz?Tstartc                 S   s   | d S N   r   )xr   r   r   <lambda>   s    z'get_speech_timestamps.<locals>.<lambda>)keyendr&   r   )r   r	   r
   r   r   r   r   r   r   lenget_vad_modelnppadshapemax	enumerateappendr   min)%r   r   r    kwargsr	   r
   r   r   r   Zwindow_size_samplesr   r   r   Zmin_speech_samplesZspeech_pad_samplesZmax_speech_samplesZmin_silence_samplesZ!min_silence_samples_at_max_speechZaudio_length_samplesmodelZpadded_audioZspeech_probsZ	triggeredZspeechesZcurrent_speechZpossible_endsZtemp_endZprev_endZ
next_startiZspeech_probZ
cur_sampleZsil_durZdurZsil_dur_nowZspeechZsilence_durationr   r   r   get_speech_timestamps3   s   









r7   r   chunksmax_durationc                 C   s8  |sddg d}t jg t jdg|gfS g }g }g }d}d}	t jg t jd}
|D ][}||d  |d  || kre||
 |	| || |d}|	|7 }	|| g }| |d |d  }
|d |d  }q(|| t |
| |d |d  f}
||d |d  7 }q(||
 |	| || |d}|| ||fS )zPThis function merges the chunks of audio into chunks of max_duration (s) length.r   )offsetdurationsegmentsdtyper*   r$   )r-   arrayfloat32r2   concatenate)r   r8   r    r9   chunk_metadataaudio_chunkschunks_metadatacurrent_segmentsZcurrent_durationZtotal_durationZcurrent_audiochunkr   r   r   collect_chunks   sL   




rG   c                	   @   sh   e Zd ZdZddee dedefddZ			dd
ede	e de
defddZdd
ede
defddZdS )SpeechTimestampsMapz3Helper class to restore original speech timestamps.r#   r8   r    time_precisionc                 C   sj   || _ || _g | _g | _d}d}|D ] }||d | 7 }|d }| j|d |  | j||  qd S )Nr   r$   r*   )r    rI   chunk_end_sampletotal_silence_beforer2   )selfr8   r    rI   Zprevious_endZsilent_samplesrF   r   r   r   __init__  s   zSpeechTimestampsMap.__init__NFtimechunk_indexis_endr!   c                 C   s.   |d u r
|  ||}| j| }t|| | jS )N)get_chunk_indexrK   roundrI   )rL   rN   rO   rP   rK   r   r   r   get_original_time+  s   
z%SpeechTimestampsMap.get_original_timec                 C   sF   t || j }|| jv r|r| j|S tt| j|t| jd S r%   )r   r    rJ   indexr3   bisectr+   )rL   rN   rP   sampler   r   r   rQ   7  s   z#SpeechTimestampsMap.get_chunk_index)r#   )NF)F)r   r   r   r   r   dictr   rM   r   r   r   rS   rQ   r   r   r   r   rH     s    
rH   c                  C   s   t jt d} t| S )zReturns the VAD model instance.zsilero_vad_v6.onnx)ospathjoinr   SileroVADModel)rY   r   r   r   r,   B  s   r,   c                   @   s0   e Zd Zdd Z	ddejdedefdd	Zd
S )r[   c              
   C   sh   zdd l }W n ty } ztd|d }~ww | }d|_d|_d|_d|_|j|dg|d| _	d S )Nr   z8Applying the VAD filter requires the onnxruntime packager&   F   ZCPUExecutionProvider)Z	providersZsess_options)
onnxruntimeImportErrorRuntimeErrorZSessionOptionsZinter_op_num_threadsZintra_op_num_threadsZenable_cpu_mem_arenaZlog_severity_levelZInferenceSessionsession)rL   rY   r]   eoptsr   r   r   rM   J  s(   zSileroVADModel.__init__r"   @   r   num_samplescontext_size_samplesc                 C   s  |j dks	J d|jd | dksJ dtjddd}tjddd}tjd|fdd}|d|}|d	| d f }d|d< t|dd}t||gd}|d|| }d
}|jd }	g }
td|	|D ]}| j	d ||||  ||d\}}}|

| qftj|
dd}|S )Nr&   zInput should be a 1D arrayr   z.Input size should be a multiple of num_samples)r&   r&      r@   r=   .i'  )inputhc)axis)ndimr/   r-   zerosreshaperollrA   ranger`   runr2   )rL   r   rd   re   ri   rj   contextZbatched_audioZencoder_batch_sizeZnum_segmentsoutputsr6   outputoutr   r   r   __call__^  s6   

zSileroVADModel.__call__N)r"   rc   )r   r   r   rM   r-   ndarrayr   rv   r   r   r   r   r[   I  s    r[   )Nr   )rU   	functoolsrX   dataclassesr   typingr   r   r   r   numpyr-   faster_asr.utilsr   r   rw   r   rW   r7   r   strrG   rH   	lru_cacher,   r[   r   r   r   r   <module>   sL    &
 -
<*
