
     `i*                         d Z ddlmZmZ ddlZddlmZmZm	Z	 ddl
mZ ddlmZ ddlmZmZ  ej        e          Z G d	 d
e          Zd
gZdS )z"
Feature extractor class for CLVP
    )OptionalUnionN   )mel_filter_bankspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)
TensorTypeloggingc                   N    e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 d fd	Zdej        dej        fdZ	 	 	 	 	 	 	 dde	ej        e
e         e
ej                 e
e
e                  f         dee         dedee         dee	eef                  dee         dee         dee         defdZ xZS )ClvpFeatureExtractora!  
    Constructs a CLVP feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts log-mel-spectrogram features from raw speech using a custom numpy implementation of the `Short
    Time Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 22050):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        default_audio_length (`int`, *optional*, defaults to 6):
            The default length of raw audio in seconds. If `max_length` is not set during `__call__` then it will
            automatically be set to default_audio_length * `self.sampling_rate`.
        hop_length (`int`, *optional*, defaults to 256):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        chunk_length (`int`, *optional*, defaults to 30):
            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, *optional*, defaults to 1024):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        mel_norms (`list` of length `feature_size`, *optional*):
            If `mel_norms` is provided then it will be used to normalize the log-mel spectrograms along each
            mel-filter.
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether to return the attention mask. If left to the default, it will return the attention mask.

            [What are attention masks?](../glossary#attention-mask)
    input_featuresattention_maskP   "V                      NFc
           	          t                      j        d	||||	d|
 || _        || _        || _        ||z  | _        | j        |z  | _        || _        || _        || _	        t          d|dz  z   |dd|dd          | _        d S )
N)feature_sizesampling_ratepadding_valuereturn_attention_mask      r   g     @@slaneyhtk)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale )super__init__n_fft
hop_lengthchunk_length	n_samplesnb_max_framesr   default_audio_length	mel_normsr   mel_filters)selfr   r   r/   r+   r,   r*   r   r0   r   kwargs	__class__s              /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/clvp/feature_extraction_clvp.pyr)   zClvpFeatureExtractor.__init__G   s     	 	
%''"7		
 	

 	
 	
 	
 
$(%5!^z9*$8!"* EQJ/( '
 
 
    waveformreturnc           	      "   t          |t          | j        d          | j        | j        d| j        d          }t          j        t          j        |dd                    }| j        &|t          j	        | j                  dddf         z  }|S )z
        This method first computes the log-mel spectrogram of the provided audio then applies normalization along the
        each mel-filterbank, if `mel_norms` is provided.
        hanng       @N)frame_lengthr+   powerr1   log_melgh㈵>)a_mina_max)
r   r   r*   r+   r1   nplogclipr0   array)r2   r7   log_specs      r5   _np_extract_fbank_featuresz/ClvpFeatureExtractor._np_extract_fbank_featuresm   s    
 DJ//(
 
 
 6"'($dCCCDD>%"(4>":":111d7"CCHr6   T
max_length
raw_speechr   
truncationpad_to_multiple_ofreturn_tensorsr   paddingc	                 4    |<| j         k    r0t          d j        j         d j          d j          d| d	          n(t                              d j        j         d           t          |t          j                  ot          |j
                  d	k    }
|
r*t          |j
                  d
k    rt          d            |
pHt          |t          t          f          o,t          |d         t          j        t          t          f          }|rd |D             }n|s;t          |t          j                  s!t          j        |t          j                  }n^t          |t          j                  rD|j        t          j        t          j                  u r|                    t          j                  }|st          j        |g          j        g}t'          d|i          }| j         j         z  n|}                     ||||||          }|                    d                              d
dd	          } fd|d         D             }t          |d         t                    rd |D             |d<   n||d<   |                    |          S )a	  
        `ClvpFeatureExtractor` is used to extract various voice specific properties such as the pitch and tone of the
        voice, speaking speed, and even speaking defects like a lisp or stuttering from a sample voice or `raw_speech`.

        First the voice is padded or truncated in a way such that it becomes a waveform of `self.default_audio_length`
        seconds long and then the log-mel spectrogram is extracted from it.

        Args:
            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*, defaults to `True`):
                Whether to return the attention mask. If left to the default, it will return the attention mask.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            max_length (`int`, *optional*):
                The maximum input length of the inputs.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r   z2Only mono-channel audio is supported for input to r   c                 Z    g | ](}t          j        |gt           j                   j        )S )dtype)r@   asarrayfloat32T).0speechs     r5   
<listcomp>z1ClvpFeatureExtractor.__call__.<locals>.<listcomp>   s.    \\\v"*fXRZ@@@B\\\r6   rO   r   )rK   rF   rH   rI   r   c                 t    g | ]4}                     |                              t          j                  5S r'   )rE   astyper@   rR   )rT   r7   r2   s     r5   rV   z1ClvpFeatureExtractor.__call__.<locals>.<listcomp>   sD     
 
 
MUD++H55<<RZHH
 
 
r6   c                 6    g | ]}t          j        |          S r'   )r@   rQ   )rT   features     r5   rV   z1ClvpFeatureExtractor.__call__.<locals>.<listcomp>   s"    .a.a.awrz'/B/B.a.a.ar6   )r   
ValueErrorr4   __name__loggerwarning
isinstancer@   ndarraylenshapelisttuplerQ   rR   rP   float64rX   rS   r
   r/   padget	transposeconvert_to_tensors)r2   rG   r   rH   rI   rJ   r   rK   rF   r3   is_batched_numpy
is_batchedbatched_speechpadded_inputsr   s   `              r5   __call__zClvpFeatureExtractor.__call__   s   f $ 222 W$.Ja W W)-);W W)-);W WFSW W W   3 NN\W[WeWn \ \ \  
 &j"*==[#jFVBWBWZ[B[ 	ZJ$4 5 5 9 9XRVXXYYY% 
zD%=11lz*Q-RTR\^ceiQj7k7k 	  	7\\Q[\\\JJ 	7Jz2:$F$F 	7JbjAAAJJ
BJ// 	7J4DQSQ[H\H\4\4\#**2:66J  	6*j\2245J%'7&DEEGQGYT.1CCC_i
!!1"7 ! 
 
 '**+;<<FFq!QOO
 
 
 
YghiYj
 
 
 nQ'.. 	=.a.aR`.a.a.aM*++.<M*+//???r6   )	r   r   r   r   r   r   r   NF)NTNNTrF   N)r\   
__module____qualname____doc__model_input_namesr)   r@   r`   rE   r   rc   floatr   intboolstrr   r
   rn   __classcell__)r4   s   @r5   r   r   !   s       ! !F *+;< #$
 $
 $
 $
 $
 $
L2: "*    2 (,,0;?04!-$(k@ k@"*d5k4
3CT$u+EVVWk@  }k@ 	k@
 %SMk@ !sJ!78k@  (~k@ #k@ SMk@ 
k@ k@ k@ k@ k@ k@ k@ k@r6   r   )rq   typingr   r   numpyr@   audio_utilsr   r   r   !feature_extraction_sequence_utilsr	   feature_extraction_utilsr
   utilsr   r   
get_loggerr\   r]   r   __all__r'   r6   r5   <module>r      s      # " " " " " " "     H H H H H H H H H H I I I I I I 4 4 4 4 4 4 ( ( ( ( ( ( ( ( 
	H	%	%M@ M@ M@ M@ M@3 M@ M@ M@` "
"r6   