
     `iE                         d Z ddlZddlmZmZmZ ddlZddlm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZmZ  ej        e          Z G d	 d
e          Zd
gZdS )z%Feature extractor class for SpeechT5.    N)AnyOptionalUnion   )mel_filter_bankoptimal_fft_lengthspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                       e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d1dedededededededededededededef fdZ	e
	 d2deej                 deej                 ded eej                 fd!            Zd"ej        d ej        fd#Z	 	 	 	 	 	 	 	 	 d3d%eeej        ee         eej                 eee                  f                  d&eeej        ee         eej                 eee                  f                  d'eeeef         d(ee         d)ed*ee         dee         d+eeeef                  dee         d efd,Z	 	 	 	 	 	 	 d4d-eej        ee         eej                 eee                  f         d.ed'eeeef         d(ee         d)ed*ee         dee         d+eeeef                  d efd/Zd eeef         f fd0Z xZS )5SpeechT5FeatureExtractora
  
    Constructs a SpeechT5 feature extractor.

    This class can pre-process a raw speech signal by (optionally) normalizing to zero-mean unit-variance, for use by
    the SpeechT5 speech encoder prenet.

    This class can also extract log-mel filter bank features from raw speech, for use by the SpeechT5 speech decoder
    prenet.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance for some models.
        num_mel_bins (`int`, *optional*, defaults to 80):
            The number of mel-frequency bins in the extracted spectrogram features.
        hop_length (`int`, *optional*, defaults to 16):
            Number of ms between windows. Otherwise referred to as "shift" in many papers.
        win_length (`int`, *optional*, defaults to 64):
            Number of ms per window.
        win_function (`str`, *optional*, defaults to `"hann_window"`):
            Name for the window function used for windowing, must be accessible via `torch.{win_function}`
        frame_signal_scale (`float`, *optional*, defaults to 1.0):
            Constant multiplied in creating the frames before applying DFT. This argument is deprecated.
        fmin (`float`, *optional*, defaults to 80):
            Minimum mel frequency in Hz.
        fmax (`float`, *optional*, defaults to 7600):
            Maximum mel frequency in Hz.
        mel_floor (`float`, *optional*, defaults to 1e-10):
            Minimum value of mel frequency banks.
        reduction_factor (`int`, *optional*, defaults to 2):
            Spectrogram length reduction factor. This argument is deprecated.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether or not [`~SpeechT5FeatureExtractor.__call__`] should return `attention_mask`.
    input_valuesattention_mask   >          FP      @   hann_window      ?  绽|=   Tfeature_sizesampling_ratepadding_valuedo_normalizenum_mel_bins
hop_length
win_lengthwin_functionframe_signal_scalefminfmax	mel_floorreduction_factorreturn_attention_maskc           	          t                      j        d|||d| || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        ||z  dz  | _        ||z  dz  | _        t          | j                  | _        | j        dz  dz   | _        t%          | j        | j        d          | _        t)          | j        | j        | j	        | j
        | j        dd          | _        |	d	k    rt/          j        d
t2                     |dk    rt/          j        dt2                     d S d S )N)r   r    r!   i  r   r   T)window_lengthnameperiodicslaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr    norm	mel_scaler   zeThe argument `frame_signal_scale` is deprecated and will be removed in version 4.30.0 of Transformersg       @zcThe argument `reduction_factor` is deprecated and will be removed in version 4.30.0 of Transformers )super__init__r"   r,   r#   r$   r%   r&   r'   r(   r)   r*   r+   sample_sizesample_strider   n_fftn_freqsr
   windowr   r    mel_filterswarningswarnFutureWarning)selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   kwargs	__class__s                   /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/speecht5/feature_extraction_speecht5.pyr:   z!SpeechT5FeatureExtractor.__init__N   s}   $ 	wl-_lwwpvwww(%:"($$("4		" 0%5='-74?'(899

a1,%D4D4K\gklll*#| -)),
 
 
 $$Mw   s""Mu     #"    returnc                    |t          j        |t           j                  }g }t          | |                    d                    D ]\  }}||d|                                         z
  t          j        |d|                                         dz             z  }||j        d         k     r|||d<   |	                    |           nd | D             }|S )z[
        Every array in the list is normalized to have zero mean and unit variance
        NHz>r   c                     g | ]C}||                                 z
  t          j        |                                d z             z  DS )rL   )meannpsqrtvar).0xs     rG   
<listcomp>zDSpeechT5FeatureExtractor.zero_mean_unit_var_norm.<locals>.<listcomp>   s@    "b"b"bPQALBGAEEGGdN4K4K#K"b"b"brH   )
rO   arrayint32zipsumrN   rP   rQ   shapeappend)r   r   r!   normed_input_valuesvectorlengthnormed_slices          rG   zero_mean_unit_var_normz0SpeechT5FeatureExtractor.zero_mean_unit_var_norm   s     %Xnbh??N"$"%lN4F4Fr4J4J"K"K 9 9 &)=)=)?)? ?276RYSYRY?K^K^K`K`cgKgChChhL.q111,9L)#**<88889 #c"bUa"b"b"b""rH   one_waveformc           
      z    t          || j        | j        | j        | j        | j        | j        d          }|j        S )zZ
        Extracts log-mel filterbank features for one waveform array (unbatched).
        log10)r?   frame_lengthr$   
fft_lengthr@   r*   log_mel)r	   r?   r;   r<   r=   r@   r*   T)rD   r`   log_mel_specs      rG   _extract_mel_featuresz.SpeechT5FeatureExtractor._extract_mel_features   sI     #;))z(n	
 	
 	
 ~rH   Naudioaudio_targetpadding
max_length
truncationpad_to_multiple_ofreturn_tensorsc
                    ||t          d          |	2|	| j        k    r&t          d|  d| j         d| j         d|	 d	          n(t                              d| j        j         d	           | | j        |d
||||||fi |
}nd}|@ | j        |d||||||fi |
}||S |d         |d<   |                    d          }|||d<   |S )aA  
        Main method to featurize and prepare for the model one or several sequence(s).

        Pass in a value for `audio` to extract waveform features. Pass in a value for `audio_target` to extract log-mel
        spectrogram features.

        Args:
            audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`, *optional*):
                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. This outputs waveform features. Must
                be mono channel audio, not stereo, i.e. single float per timestep.
            audio_target (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`, *optional*):
                The sequence or batch of sequences to be processed as targets. Each sequence can be a numpy array, a
                list of float values, a list of numpy arrays or a list of list of float values. This outputs log-mel
                spectrogram features.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `audio` or `audio_target` input was sampled. It is strongly recommended
                to pass `sampling_rate` at the forward call to prevent silent errors.
        Nz9You must provide either `audio` or `audio_target` values.z3The model corresponding to this feature extractor: z& was trained using a sampling rate of zB. Please make sure that the provided audio input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.FTr   labelsr   decoder_attention_mask)
ValueErrorr    loggerwarningrF   __name___process_audioget)rD   ri   rj   rk   rl   rm   rn   r,   ro   r    rE   inputsinputs_targetrs   s                 rG   __call__z!SpeechT5FeatureExtractor.__call__   s   ~ =\1XYYY$ 222 F$ F F*F F*F F5BF F F   3 NN\W[WeWn \ \ \  
 (T("%
 
 
 
FF F#/D/"%
 
 
 
M ~$$#0#@x )6):):;K)L)L&)57MF34rH   speech	is_targetc	           	          t          |t          j                  ot          |j                  dk    }
|
r*t          |j                  dk    rt          d            |
pHt          |t          t          f          o,t          |d         t          j        t          t          f          }|rd |D             }n|s;t          |t          j                  s!t          j        |t          j	                  }n^t          |t          j                  rD|j
        t          j
        t          j                  u r|                    t          j	                  }|s|g} j        }|r, fd|D             }t          d|i          } j         _        nt          d|i          }  j        |f|||||d	|	}| _        |d         }t          |d         t          j                  sd
 |D             |d<   nt          |t          j                  s[t          |d         t          j                  r;|d         j
        t          j
        t          j                  u rd |D             |d<   nat          |t          j                  rG|j
        t          j
        t          j                  u r"|                    t          j	                  |d<   |                    d          }|d |D             |d<   |sT j        rM                     ||          t(          j        ur|nd }                     |d         | j                  |d<   ||                    |          }|S )Nr   r   z2Only mono-channel audio is supported for input to r   c                 N    g | ]"}t          j        |t           j                   #S dtyperO   asarrayfloat32)rR   r}   s     rG   rT   z;SpeechT5FeatureExtractor._process_audio.<locals>.<listcomp>=  s)    PPPvbjrz:::PPPrH   r   c                 :    g | ]}                     |          S r8   )rh   )rR   waveformrD   s     rG   rT   z;SpeechT5FeatureExtractor._process_audio.<locals>.<listcomp>L  s'    TTT228<<TTTrH   r   )rk   rl   rm   rn   r,   c                 N    g | ]"}t          j        |t           j                   #S r   r   rR   rU   s     rG   rT   z;SpeechT5FeatureExtractor._process_audio.<locals>.<listcomp>a  s*    ,k,k,kUZRZRZ-P-P-P,k,k,krH   c                 L    g | ]!}|                     t          j                  "S r8   )astyperO   r   r   s     rG   rT   z;SpeechT5FeatureExtractor._process_audio.<locals>.<listcomp>g  s&    ,`,`,`%U\\"*-E-E,`,`,`rH   r   c                 N    g | ]"}t          j        |t           j                   #S r   )rO   r   rV   r   s     rG   rT   z;SpeechT5FeatureExtractor._process_audio.<locals>.<listcomp>n  s*    .m.m.mUZrz%rx/P/P/P.m.m.mrH   )rl   )r   r!   )
isinstancerO   ndarraylenrY   rt   listtupler   r   r   float64r   r   r   r#   padry   r"   _get_padding_strategiesr   
DO_NOT_PADr_   r!   convert_to_tensors)rD   r}   r~   rk   rl   rm   rn   r,   ro   rE   is_batched_numpy
is_batchedfeature_size_hackfeaturesencoded_inputspadded_inputsr   r   s   `                 rG   rx   z'SpeechT5FeatureExtractor._process_audio)  s    &fbj99Sc&,>O>ORS>S 	ZFL 1 1A 5 5XRVXXYYY% 
ve}--d:fQi"*V[]aIb3c3c 	  	/PPPPPFF 	/Jvrz$B$B 	/Zbj999FF
++ 	/@T@T0T0T]]2:..F  	XF !-  	DTTTTVTTTH)>8*DEEN $ 1D)>6*BCCN 
!!1"7
 
 
 
 . %^4,q/2:66 		L,k,k^j,k,k,kM.))<44	L<?BJ77	L Q%"*)=)===,`,`S_,`,`,`M.))bj11 	Ll6HBHUWU_L`L`6`6`,8,?,?
,K,KM.) '**+;<<%.m.m^l.m.m.mM*+  	T. 	 //J/OOWfWqqq  
 -1,H,Hn-n\`\n -I - -M.) %)<<^LLMrH   c                 h    t                                                      }g d}|D ]	}||v r||= 
|S )N)r?   r@   r;   r<   r=   r>   )r9   to_dict)rD   outputnamesr/   rF   s       rG   r   z SpeechT5FeatureExtractor.to_dict  sI    "" ^]] 	! 	!Dv~~4LrH   )r   r   r   Fr   r   r   r   r   r   r   r   r   T)r   )	NNFNFNNNN)FFNFNNN)rw   
__module____qualname____doc__model_input_namesintfloatboolstrr:   staticmethodr   rO   r   r_   rh   r   r   r   r   r   r|   rx   dictr   r   __classcell__)rF   s   @rG   r   r      s       * *X ()9: """)$'  !&*: :: : 	:
 : : : : : ": : : : :  $: : : : : :x  be# #2:&#8<RZ8H#Y^#	bj	# # # \#*j 
   * `dfj5:$( ,004;?'+s sbj$u+tBJ7GdSXkIZZ[\s uRZed2:>NPTUYZ_U`Pa%abcs tS/12	s
 SMs s %SMs  (~s !sJ!78s  }s 
s s s sp  5:$( ,004;?U Ubj$u+tBJ/?d5kARRSU U tS/12	U
 SMU U %SMU  (~U !sJ!78U 
U U U Un	c3h 	 	 	 	 	 	 	 	 	 	rH   r   )r   rA   typingr   r   r   numpyrO   audio_utilsr   r   r	   r
   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r   r   
get_loggerrw   ru   r   __all__r8   rH   rG   <module>r      s    , +  ' ' ' ' ' ' ' ' ' '     \ \ \ \ \ \ \ \ \ \ \ \ I I I I I I 4 4 4 4 4 4 9 9 9 9 9 9 9 9 9 9 
	H	%	%j j j j j7 j j jZ &
&rH   