
     `i[3                         d dl mZmZ d dlZd dlZddlmZ ddlm	Z	 ddl
mZmZmZ ddlmZ  e            rd dlZdZd	Z ej        e          Z ed
           G d de                      ZdgZdS )    )OptionalUnionN   )SequenceFeatureExtractor)BatchFeature)
TensorTypeis_librosa_availablelogging)requiresgh㈵>g      p>)torchlibrosa)backendsc                   d    e Zd ZdZddgZ	 	 	 	 	 	 	 d fd	ZddZ	 	 	 	 	 	 	 	 	 	 d deej	        e
e         e
ej	                 e
e
e                  f         dedee         deeeef                  dee         dee         dee         dee         dee         dee         dee         defdZ xZS )!ParakeetFeatureExtractora  
    Constructs a Parakeet feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
    Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        hop_length (`int`, *optional*, defaults to 160):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        n_fft (`int`, *optional*, defaults to 512):
            Size of the Fourier transform.
        win_length (`int`, *optional*, defaults to 400):
            The window length for the STFT computation.
        preemphasis (`float`, *optional*, defaults to 0.97):
            A preemphasis filter coefficient. 0.0 means no preemphasis filter.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
    input_featuresattention_maskP   >          
ףp=
?        c                 6    t                      j        d|||d| || _        || _        || _        || _        t          j                            |||d|dz  d          }	t          j
        |	                              t          j                  | _        d S )N)feature_sizesampling_ratepadding_valuer      slaney)srn_fftn_melsfminfmaxnorm )super__init__
hop_lengthr!   
win_lengthpreemphasisr   filtersmelr   
from_numpytofloat32mel_filters)selfr   r   r)   r!   r*   r+   r   kwargsr1   	__class__s             /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/parakeet/feature_extraction_parakeet.pyr(   z!ParakeetFeatureExtractor.__init__C   s     	wl-_lwwpvwww$
$& o))E,S}_`O`go * 
 
 !+K88;;EMJJ    cpuc           	         t          j        | j        d|          }t          j        || j        | j        | j        |dd          }t          j        |          }t          j        |                    d          	                    d                    }|                    d          }| j
                            |          }||z  }t          j        |t          z             }|                    ddd	          }|S )
NF)periodicdeviceTconstant)r)   r*   windowreturn_complexpad_moder   r      )r   hann_windowr*   stftr!   r)   view_as_realsqrtpowsumr1   r/   logLOG_ZERO_GUARD_VALUEpermute)r2   waveformr:   r<   rB   
magnitudesr1   mel_specs           r5   _torch_extract_fbank_featuresz6ParakeetFeatureExtractor._torch_extract_fbank_featurese   s    "4?U6RRRzJ
 
 
 '--
Z
q 1 1 5 5b 9 9::
^^A&&
 &))&11+9X(<<== ##Aq!,,r6   FNlongest
raw_speech
truncationpad_to_multiple_ofreturn_tensorsreturn_attention_maskpadding
max_lengthr   do_normalizer:   return_token_timestampsreturnc                 	   |<|| j         k    r0t          d| j        j         d| j          d| j          d| d	          n(t                              d| j        j         d           t          |t          j                  rt          j
        |          }nHt          |t          t          f          r,t          |d	         t          j                  rd
 |D             }t          |t          j                  ot          |j                  dk    }|rUt          |j                  dk    r=t                              d| j        j         d           |                    d          }t          |t          t          f          }|rZ|D ]W}t          |j                  dk    r=t                              d| j        j         d           |                    d          }X|s|rd |D             }n*|dddf                             t          j                  g}d |D             }t'          ||d          }|                     |||||d          }|j                            d          }| j        t          j        |j        d         |j                                      d	          |j                            d          k     }t          j        |ddddf         |ddddf         | j        |ddddf         z  z
  gd          }|                    | d          }|                     ||
          }t          j        |j        | j         dz  dz  z   | j         z
  | j!                  }t          j        |j        d         |
          dddf         |dddf         k     }|                    d          }||z  }|"                    d          |                    d          z  }|                    d          }||z
  dz  |z  "                    d          |dz
                      d          z  }t          j#        |                              d          }||z
  |tH          z   z  }||z  }t'          ||d|          S )a  
        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy based one.

        Args:
            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                For Parakeet models, `attention_mask` should always be passed for batched inference, to avoid subtle
                bugs.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            do_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
            device (`str`, *optional*, defaults to `'cpu'`):
                Specifies the device for computation of the log-mel spectrogram of audio signals in the
                `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
            return_token_timestamps (`bool`, *optional*, defaults to `None`):
                Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.

                Whether or not to return the number of frames of the input raw_speech.
                These num_frames can be used by the model to compute word level timestamps.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   c                 6    g | ]}t          j        |          S r&   )r   tensor.0speechs     r5   
<listcomp>z5ParakeetFeatureExtractor.__call__.<locals>.<listcomp>   s"    HHH6%,v..HHHr6   r@   r   z2Only mono-channel audio is supported for input to z;. We will take the mean of the channels to convert to mono.r?   c                 `    g | ]+}|d d d f                              t          j                  ,S )N)r/   r   r0   r]   s     r5   r`   z5ParakeetFeatureExtractor.__call__.<locals>.<listcomp>   s4    UUU&D/,,U];;UUUr6   c                 ,    g | ]}t          |          S r&   )lenr]   s     r5   r`   z5ParakeetFeatureExtractor.__call__.<locals>.<listcomp>   s    >>>V>>>r6   )r   audio_lengthspt)rT   rU   rP   rQ   rR   )r:   )dimr   )r   r   )datatensor_type)%r   
ValueErrorr4   __name__loggerwarning
isinstancenpndarrayr   r\   listtupleTensorrc   shapemeanr/   r0   r   padr   squeezer+   aranger:   	unsqueezerd   catmasked_fillrM   floor_divider!   r)   rF   rD   EPSILON)r2   rO   rP   rQ   rR   rS   rT   rU   r   rV   r:   rW   r3   is_batched_torchis_batched_sequencer_   rd   batched_speechpadded_inputsr   timemaskfeatures_lengthsr   maskinput_features_maskedrt   variancestds                               r5   __call__z!ParakeetFeatureExtractor.__call__   s\   H $ 222 W$.Ja W W)-);W W)-);W WFSW W W   3 NN\W[WeWn \ \ \   j"*-- 	Ij11JJ
T5M22 	Iz*Q-QSQ[7\7\ 	IHHZHHHJ%j%,??]C
HXDYDY\]D] 	-J$4 5 5 9 9NNLT^E\ L L L   $,,J(dE]CC 	-$ - -v|$$q((NNTT^Md T T T   $[[__F 	A2 	AUU*UUUJJ$QQQW-00??@J>>:>>>%Vc&d&dee!!1 ! 
 
 '5==bAA '|N$8$;NDYZZZdd +55a889H #Y2A2&qqq!""u(=@PSabcbcbcehfhehbhSi@i(ijpq  N ,77	3GGN;;NFSS -'$*/A*==
JDO
 
 n&:1&=fMMMdTUTUTUgVYijkjkjkmqjqYrr ''++ . 5$((Q(//2B2L2LR2P2PP~~a  *T1a7$>CCCJJN^abNbMmMmnpMqMqqj"",,Q//(4/C'MB$"0"0  '
 
 
 	
r6   )r   r   r   r   r   r   r   )r7   )
FNNNrN   NNNr7   N)rj   
__module____qualname____doc__model_input_namesr(   rM   r   rn   ro   rp   floatboolr   intstrr   r   r   __classcell__)r4   s   @r5   r   r   %   s        4 *+;<  K  K  K  K  K  KD   > !,0;?04!*$('+'+ %26[
 [
"*d5k4
3CT$u+EVVW[
 [
 %SM	[

 !sJ!78[
  (~[
 #[
 SM[
  }[
 tn[
 [
 "*$[
 
[
 [
 [
 [
 [
 [
 [
 [
r6   r   )typingr   r   numpyrn   r   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r	   r
   utils.import_utilsr   r   r|   rH   
get_loggerrj   rk   r   __all__r&   r6   r5   <module>r      s"   # " " " " " " "      I I I I I I 4 4 4 4 4 4 > > > > > > > > > > * * * * * *  NNN   
	H	%	% 
'(((v
 v
 v
 v
 v
7 v
 v
 )(v
r &
&r6   