
     `i\A                         d Z ddlmZmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZmZ  e            rddlZ ej        e          Z G d
 de          ZdgZdS )z%
Feature extractor class for Whisper
    )OptionalUnionN   )is_torch_available)mel_filter_bankspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)
TensorTypeloggingc                       e Zd ZdZdgZ	 	 	 	 	 	 	 	 d# fd
	Zdej        dedej        fdZ	d$dej        dedej        fdZ
e	 d%deej                 deej                 dedeej                 fd            Z	 	 	 	 	 	 	 	 	 	 d&deej        ee         eej                 eee                  f         dedee         deeeef                  dee         dee         dee         dee         d ee         dee         d!ee         defd"Z xZS )'WhisperFeatureExtractora  
    Constructs a Whisper feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
    Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        hop_length (`int`, *optional*, defaults to 160):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        chunk_length (`int`, *optional*, defaults to 30):
            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, *optional*, defaults to 400):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        dither (`float`, *optional*, defaults to 0.0):
            Adds dithering. In other words, adds a small Gaussian noise to each frame.
            E.g. use 0.0001 to add dithering with a normal distribution centered
            around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
            The value 0.0 means no dithering.
            Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
            the high log_mel_fbank values for signals with hard-zero sections,
            when VAD cutoff is present in the signal.
    input_featuresP   >                  Fc	           	           t                      j        d||||d|	 || _        || _        || _        ||z  | _        | j        |z  | _        || _        || _        t          d|dz  z   |dd|dd          | _
        d S )	N)feature_sizesampling_ratepadding_valuereturn_attention_mask      r   g     @@slaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale )super__init__n_fft
hop_lengthchunk_length	n_samplesnb_max_framesr   ditherr   mel_filters)selfr   r   r)   r*   r(   r   r-   r   kwargs	__class__s             /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/whisper/feature_extraction_whisper.pyr'   z WhisperFeatureExtractor.__init__H   s     	 	
%''"7		
 	

 	
 	
 	
 
$(%5!^z9** 5A:~( '
 
 
    waveform_batchdevicereturnc                    |dk    rt          d| d          g }|D ]}t          |t          | j        d          | j        | j        d| j        | j        d          }|dddd	f         }t          j        ||	                                d
z
            }|dz   dz  }|
                    |           t          j        |          }|S )z
        Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
        implementation with 1e-5 tolerance.
        cpuzGot device `z` for feature extraction, but feature extraction on CUDA accelerator devices requires torch, which is not installed. Either set `device='cpu'`, or install torch according to the official instructions: https://pytorch.org/get-started/locally/hanng       @log10)frame_lengthr)   powerr-   r.   log_melN       @      @)
ValueErrorr   r	   r(   r)   r-   r.   npmaximummaxappendarray)r/   r4   r5   log_spec_batchwaveformlog_specs         r2   _np_extract_fbank_featuresz2WhisperFeatureExtractor._np_extract_fbank_featuresl   s    
 U??qv q q q  
 & 	, 	,H"
F33!Z?{ ,	 	 	H  3B3'Hz(HLLNNS,@AAH 3#-H!!(++++.11r3   r8   rH   c                 $   t          j        |                              |t           j                  }t          j        | j        |          }| j        dk    r1|| j        t          j        |j        |j	        |j
                  z  z  }t          j        || j        | j        |d          }|dddf                                         d	z  }t          j        | j                                      |t           j                  }|j        |z  }t          j        |d
                                          }|                                d	k    rQ|                    d	d          d                             dd          d         }	t          j        ||	dz
            }n*t          j        ||                                dz
            }|dz   dz  }|dk    r&|                                                                }|                                S )z
        Compute the log-mel spectrogram of the audio using PyTorch's GPU-accelerated STFT implementation with batching,
        yielding results similar to cpu computing with 1e-5 tolerance.
        )r5   r   )dtyper5   T)windowreturn_complex.Nr>   r   g|=)min)dimkeepdimr   r   r?   r@   r8   )torch
from_numpytofloat32hann_windowr(   r-   randnshaperL   r5   stftr)   absr.   Tclampr:   rP   rD   rC   detachr8   numpy)
r/   rH   r5   rM   rY   
magnitudesr.   mel_specrI   max_vals
             r2   _torch_extract_fbank_featuresz5WhisperFeatureExtractor._torch_extract_fbank_features   s   
 #H--00GG"4:f===
 ;#ek(._g_n&o&o&oooHz(DJ_cddd#ss(^''))Q.
&t'788;;FEMRR=:-;xU33399;;<<>>Qllq$l77:>>1d>SSTUVG}Xw}==HH}Xx||~~/CDDHsNc)U??((,,..H~~r3   input_valuesattention_maskr   c                    |t          j        |t           j                  }g }t          | |                    d                    D ]\  }}||d|                                         z
  t          j        |d|                                         dz             z  }||j        d         k     r|||d<   |	                    |           nd | D             }|S )z[
        Every array in the list is normalized to have zero mean and unit variance
        Nr>   Hz>r   c                     g | ]C}||                                 z
  t          j        |                                d z             z  DS )rf   )meanrB   sqrtvar).0xs     r2   
<listcomp>zCWhisperFeatureExtractor.zero_mean_unit_var_norm.<locals>.<listcomp>   s@    "b"b"bPQALBGAEEGGdN4K4K#K"b"b"br3   )
rB   rF   int32zipsumrh   ri   rj   rX   rE   )rc   rd   r   normed_input_valuesvectorlengthnormed_slices          r2   zero_mean_unit_var_normz/WhisperFeatureExtractor.zero_mean_unit_var_norm   s     %Xnbh??N"$"%lN4F4Fr4J4J"K"K 9 9 &)=)=)?)? ?276RYSYRY?K^K^K`K`cgKgChChhL.q111,9L)#**<88889 #c"bUa"b"b"b""r3   TN
max_length
raw_speech
truncationpad_to_multiple_ofreturn_tensorsr   paddingr   do_normalizereturn_token_timestampsc                     |<| j         k    r0t          d j        j         d j          d j          d| d	          n(t                              d j        j         d           t          |t          j                  ot          |j
                  d	k    }|r*t          |j
                  d
k    rt          d            |pHt          |t          t          f          o,t          |d         t          j        t          t          f          }|rd |D             }n|s;t          |t          j                  s!t          j        |t          j                  }n^t          |t          j                  rD|j        t          j        t          j                  u r|                    t          j                  }|st          j        |g          j        g}t'          d|i          }                     |||r|n j        |||p|	          }|	rK                     |d         |d          j                  |d<   t          j        |d         d          |d<   |                    d                              d
dd	          }t7                      r j        n j        } ||d         |
          }t          |d         t                    rd |D             |d<   n||d<   |rL|d         dddd j        f         }|d         j
        d	          j        z  dk    r|ddddf         }||d<   |9t                              d j        j         d            fd|D             |d<   ||                     |          }|S )a  
        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy based one.

        Args:
            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
                bugs.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            do_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
            device (`str`, *optional*, defaults to `'cpu'`):
                Specifies the device for computation of the log-mel spectrogram of audio signals in the
                `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
            return_token_timestamps (`bool`, *optional*, defaults to `None`):
                Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.

                Whether or not to return the number of frames of the input raw_speech.
                These num_frames can be used by the model to compute word level timestamps.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r   z2Only mono-channel audio is supported for input to r   c                 Z    g | ](}t          j        |gt           j                   j        )S rL   )rB   asarrayrU   r[   )rk   speechs     r2   rm   z4WhisperFeatureExtractor.__call__.<locals>.<listcomp>  s.    \\\v"*fXRZ@@@B\\\r3   r   r   )r{   rv   rx   ry   r   rd   )rd   r   )axisc                 N    g | ]"}t          j        |t           j                   #S r   )rB   r   rU   )rk   features     r2   rm   z4WhisperFeatureExtractor.__call__.<locals>.<listcomp>B  s*    .s.s.sY`rz'/T/T/T.s.s.sr3   r>   z,`return_token_timestamps` is deprecated for z~ and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.c                 >    g | ]}t          |          j        z  S r%   )lenr)   )rk   raw_speech_ir/   s     r2   rm   z4WhisperFeatureExtractor.__call__.<locals>.<listcomp>V  s)    *o*o*oT`3|+<+<+O*o*o*or3   
num_frames)!r   rA   r1   __name__loggerwarning
isinstancerB   ndarrayr   rX   listtupler   rU   rL   float64astyper[   r   padr+   ru   r   stackget	transposer   rb   rJ   r)   warning_onceconvert_to_tensors)r/   rw   rx   ry   rz   r   r{   rv   r   r|   r5   r}   r0   is_batched_numpy
is_batchedbatched_speechpadded_inputsr   extract_fbank_featuresrescaled_attention_masks   `                   r2   __call__z WhisperFeatureExtractor.__call__   s[   H $ 222 W$.Ja W W)-);W W)-);W WFSW W W   3 NN\W[WeWn \ \ \  
 &j"*==[#jFVBWBWZ[B[ 	ZJ$4 5 5 9 9XRVXXYYY% 
zD%=11lz*Q-RTR\^ceiQj7k7k 	  	7\\Q[\\\JJ 	7Jz2:$F$F 	7JbjAAAJJ
BJ// 	7J4DQSQ[H\H\4\4\#**2:66J  	6*j\2245J%'7&DEE %/CzzT^!1"7"G< ! 
 
  	`.2.J.J./,-=>"0 /K / /M*+
 /1h}EU7V]^._._._M*+ '**+;<<FFq!QOO 3E2F2FkD..DLk 	 0/q0A6JJnQ'.. 	=.s.sdr.s.s.sM*++ /=M*+  		F&34D&EaaaI[I[DOI[F[&\#
 -.4Q7$/IQNN*A!!!SbS&*I'.EM*+". Wt~?V  W  W  W   +p*o*o*odn*o*o*oM,'%)<<^LLMr3   )r   r   r   r   r   r   r   F)r8   )r   )
TNNNrv   NNNr8   N)r   
__module____qualname____doc__model_input_namesr'   rB   r   strrJ   rb   staticmethodr   floatru   r   boolr   intr   r   r   __classcell__)r1   s   @r2   r   r   $   sH        B ** #"
 "
 "
 "
 "
 "
H S UWU_    <   bj  #  Z\Zd        >  be# #2:&#8<RZ8H#Y^#	bj	# # # \#0  ,0;?04!-$('+'+ %26[ ["*d5k4
3CT$u+EVVW[ [ %SM	[
 !sJ!78[  (~[ #[ SM[  }[ tn[ [ "*$[ 
[ [ [ [ [ [ [ [r3   r   )r   typingr   r   r^   rB    r   audio_utilsr   r   r	   !feature_extraction_sequence_utilsr
   feature_extraction_utilsr   utilsr   r   rR   
get_loggerr   r   r   __all__r%   r3   r2   <module>r      s    # " " " " " " "     " " " " " " H H H H H H H H H H I I I I I I 4 4 4 4 4 4 ( ( ( ( ( ( ( (  LLL		H	%	%w w w w w6 w w wt	 %
%r3   