
     `i`Y                         d Z ddlmZmZmZ ddlZddlmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZmZmZ  ej        e          Z G d	 d
e          Zd
gZdS )z)Feature extractor class for UnivNetModel.    )AnyOptionalUnionN   )mel_filter_bankoptimal_fft_lengthspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc            )           e Zd ZdZg dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d>dedededededededede	e         dedede	e         d ed!ed"ed#ed$ed%ed&ed'ef( fd(Z
d) Zd* Zd+ej        d,ej        fd-Z	 d?d.ed/e	ej        j                 d,ej        fd0Zd?d,eej                 fd1Z	 	 	 	 	 	 	 	 	 	 	 	 d@d2eej        ee         eej                 eee                  f         de	e         d3eeeef         d4e	e         d5ed6e	e         d7ed/e	ej        j                 d8ed9e	e         de	e         d:e	e         d;e	eeef                  d,efd<Zd,eeef         f fd=Z xZS )AUnivNetFeatureExtractora  
    Constructs a UnivNet feature extractor.

    This class extracts log-mel-filter bank features from raw speech using the short time Fourier Transform (STFT). The
    STFT implementation follows that of TacoTron 2 and Hifi-GAN.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value to pad with when applying the padding strategy defined by the `padding` argument to
            [`UnivNetFeatureExtractor.__call__`]. Should correspond to audio silence. The `pad_end` argument to
            `__call__` will also use this padding value.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve the
            performance for some models.
        num_mel_bins (`int`, *optional*, defaults to 100):
            The number of mel-frequency bins in the extracted spectrogram features. This should match
            `UnivNetModel.config.num_mel_bins`.
        hop_length (`int`, *optional*, defaults to 256):
            The direct number of samples between sliding windows. Otherwise referred to as "shift" in many papers. Note
            that this is different from other audio feature extractors such as [`SpeechT5FeatureExtractor`] which take
            the `hop_length` in ms.
        win_length (`int`, *optional*, defaults to 1024):
            The direct number of samples for each sliding window. Note that this is different from other audio feature
            extractors such as [`SpeechT5FeatureExtractor`] which take the `win_length` in ms.
        win_function (`str`, *optional*, defaults to `"hann_window"`):
            Name for the window function used for windowing, must be accessible via `torch.{win_function}`
        filter_length (`int`, *optional*, defaults to 1024):
            The number of FFT components to use. If `None`, this is determined using
            `transformers.audio_utils.optimal_fft_length`.
        max_length_s (`int`, *optional*, defaults to 10):
            The maximum input length of the model in seconds. This is used to pad the audio.
        fmin (`float`, *optional*, defaults to 0.0):
            Minimum mel frequency in Hz.
        fmax (`float`, *optional*):
            Maximum mel frequency in Hz. If not set, defaults to `sampling_rate / 2`.
        mel_floor (`float`, *optional*, defaults to 1e-09):
            Minimum value of mel frequency banks. Note that the way [`UnivNetFeatureExtractor`] uses `mel_floor` is
            different than in [`transformers.audio_utils.spectrogram`].
        center (`bool`, *optional*, defaults to `False`):
            Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
            `t` will start at time `t * hop_length`.
        compression_factor (`float`, *optional*, defaults to 1.0):
            The multiplicative compression factor for dynamic range compression during spectral normalization.
        compression_clip_val (`float`, *optional*, defaults to 1e-05):
            The clip value applied to the waveform before applying dynamic range compression during spectral
            normalization.
        normalize_min (`float`, *optional*, defaults to -11.512925148010254):
            The min value used for Tacotron 2-style linear normalization. The default is the original value from the
            Tacotron 2 implementation.
        normalize_max (`float`, *optional*, defaults to 2.3143386840820312):
            The max value used for Tacotron 2-style linear normalization. The default is the original value from the
            Tacotron 2 implementation.
        model_in_channels (`int`, *optional*, defaults to 64):
            The number of input channels to the [`UnivNetModel`] model. This should match
            `UnivNetModel.config.model_in_channels`.
        pad_end_length (`int`, *optional*, defaults to 10):
            If padding the end of each waveform, the number of spectrogram frames worth of samples to append. The
            number of appended samples will be `pad_end_length * hop_length`.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether or not [`~UnivNetFeatureExtractor.__call__`] should return `attention_mask`.
    )input_featuresnoise_sequencepadding_mask   ]          Fd         hann_window
   N&.>      ?h㈵>    '    ă@@   Tfeature_sizesampling_ratepadding_valuedo_normalizenum_mel_bins
hop_length
win_lengthwin_functionfilter_lengthmax_length_sfminfmax	mel_floorcentercompression_factorcompression_clip_valnormalize_minnormalize_maxmodel_in_channelspad_end_lengthc           	          t                      j        d||||d| || _        || _        || _        || _        || _        |	| _        || _        |t          |          dz  }|| _
        || _        |
| _        |
|z  | _        | j        t          | j                  | _        n| j        | _        | j        dz  dz   | _        t#          | j        | j        d          | _        t'          | j        | j        | j        | j
        | j        dd          | _        || _        || _        || _        || _        || _        || _        || _        d S )	N)r#   r$   r%   return_attention_mask   r   T)window_lengthnameperiodicslaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr$   norm	mel_scale )super__init__r&   r'   r(   r)   r*   r+   r-   floatr.   r/   r,   num_max_samplesr   n_fftn_freqsr
   windowr   r$   mel_filtersr0   r1   r2   r3   r4   r5   r6   )selfr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r8   kwargs	__class__s                          /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/univnet/feature_extraction_univnet.pyrF   z UnivNetFeatureExtractor.__init__e   s|   2 	 	
%''"7		
 	

 	
 	
 	
 )($$(*	<''!+D	"(+m;%+DO<<DJJ+DJ
a1,%DO$J[fjkkk*#| -)),
 
 
 "4$8!**!2,    c                 B    d|| j         z
  | j        | j         z
  z  z  dz
  S )Nr9   r   r3   r4   rM   r	   s     rP   	normalizez!UnivNetFeatureExtractor.normalize   s+    [4#55$:LtOa:abcfgggrQ   c                 B    | j         | j        | j         z
  |dz   dz  z  z   S )Nr   r9   rS   rT   s     rP   denormalizez#UnivNetFeatureExtractor.denormalize   s-    !T%7$:L%LR]`aRaefQf$gggrQ   waveformreturnc                 T   t          j        |t          | j        | j        z
  dz            t          | j        | j        z
  dz            fd          }t          || j        | j        | j        | j        d| j        dd	  	        }t          j        t          j	        |          dz  t          j
        |          dz  z   | j        z             }t          j        | j        j        |          }t          j        t          j        || j        d          | j        z            }|j        S )a  
        Calculates log MEL spectrograms from a batch of waveforms. Note that the input waveform(s) will be padded by
        `int(self.n_fft - self.hop_length) / 2` on both sides using the `reflect` padding mode.

        Args:
            waveform (`np.ndarray` of shape `(length,)`):
                The input waveform. This must be a single real-valued, mono waveform.

        Returns:
            `numpy.ndarray`: Array containing a log-mel spectrogram of shape `(num_frames, num_mel_bins)`.
        r9   reflect)modeN)rK   frame_lengthr(   
fft_lengthpowerr0   rL   r/   )a_mina_max)nppadintrI   r(   r	   rK   r0   sqrtrealimagr/   matmulrL   Tlogclipr2   r1   )rM   rX   complex_spectrogramamplitude_spectrogrammel_spectrogramlog_mel_spectrograms         rP   rn   z'UnivNetFeatureExtractor.mel_spectrogram   s0    6$*t.!344c4:;W[\:\6]6]^
 
 
 *;z;

 

 

 !#G'((A-8K0L0LPQ0QQTXTbb!
 !
 )D$4$68MNN !fGO4+DDQQQTXTkk
 

 #$$rQ   noise_length	generatorc                     |t           j                                        }|| j        f}|                    |t           j                  }|S )a  
        Generates a random noise sequence of standard Gaussian noise for use in the `noise_sequence` argument of
        [`UnivNetModel.forward`].

        Args:
            spectrogram_length (`int`):
                The length (dim 0) of the generated noise.
            model_in_channels (`int`, *optional*, defaults to `None`):
                The number of features (dim 1) of the generated noise. This should correspond to the
                `model_in_channels` of the [`UnivNetGan`] model. If not set, this will default to
                `self.config.model_in_channels`.
            generator (`numpy.random.Generator`, *optional*, defaults to `None`)
                An optional `numpy.random.Generator` random number generator to control noise generation. If not set, a
                new generator with fresh entropy will be created.

        Returns:
            `numpy.ndarray`: Array containing random standard Gaussian noise of shape `(noise_length,
            model_in_channels)`.
        Ndtype)rb   randomdefault_rngr5   standard_normalfloat32)rM   rp   rq   noise_shapenoises        rP   generate_noisez&UnivNetFeatureExtractor.generate_noise   sI    0 	--//I#T%;<))+RZ)HHrQ   c                 Z    d |D             }fdt          |          D             }|S )a  
        Removes padding from generated audio after running [`UnivNetModel.forward`]. This returns a ragged list of 1D
        audio waveform arrays and not a single tensor/array because in general the waveforms will have different
        lengths after removing padding.

        Args:
            waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                The batched output waveforms from the [`UnivNetModel`].
            waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
                The batched lengths of each waveform before padding.

        Returns:
            `list[np.ndarray]`: A ragged list of 1D waveform arrays with padding removed.
        c                     g | ]=}|                                                     d d                                          >S )cpuT)devicecopy)detachtonumpy).0rX   s     rP   
<listcomp>z8UnivNetFeatureExtractor.batch_decode.<locals>.<listcomp>  sA    eeexX__&&))T)BBHHJJeeerQ   Nc                 6    g | ]\  }}|d |                  S NrD   )r   irX   waveform_lengthss      rP   r   z8UnivNetFeatureExtractor.batch_decode.<locals>.<listcomp>  s.    aaa[Q"7$4Q$7"78aaarQ   )	enumerate)rM   	waveformsr   s     `rP   batch_decodez$UnivNetFeatureExtractor.batch_decode  sH      fe[deee	'aaaaIV_L`L`aaaIrQ   
raw_speechpadding
max_length
truncationpad_to_multiple_ofreturn_noisepad_end
pad_lengthr8   return_tensorsc                    
 ||n j         }|<| j        k    r0t          d j        j         d j         d j         d| d	          n(t
                              d j        j         d           t          |t          j	                  ot          |j                  d	k    }|r*t          |j                  d
k    rt          d            |pHt          |t          t          f          o,t          |d         t          j	        t          t          f          }|rd |D             }n|s;t          |t          j	                  s!t          j        |t          j                  }n^t          |t          j	                  rD|j        t          j        t          j                  u r|                    t          j                  }|s!t          j        |t          j                  g}|	r

n j        

 fd|D             }t)          d|i          }                     ||||n j        |||          }|                    d          } fd|D             }t          |d         t                    rd |D             |d<   nd |D             |d<   |                    d          }|d |D             |d<   |r fd|d         D             }||d<   |r fd|d         D             |d<   ||                    |          }|S )a  
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the input `raw_speech` waveforms (according to the model's padding side and
                padding index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).

                If `pad_end = True`, that padding will occur before the `padding` strategy is applied.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*, defaults to `True`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_noise (`bool`, *optional*, defaults to `True`):
                Whether to generate and return a noise waveform for use in [`UnivNetModel.forward`].
            generator (`numpy.random.Generator`, *optional*, defaults to `None`):
                An optional `numpy.random.Generator` random number generator to use when generating noise.
            pad_end (`bool`, *optional*, defaults to `False`):
                Whether to pad the end of each waveform with silence. This can help reduce artifacts at the end of the
                generated audio sample; see https://github.com/seungwonpark/melgan/issues/8 for more details. This
                padding will be done before the padding strategy specified in `padding` is performed.
            pad_length (`int`, *optional*, defaults to `None`):
                If padding the end of each waveform, the length of the padding in spectrogram frames. If not set, this
                will default to `self.config.pad_end_length`.
            do_normalize (`bool`, *optional*):
                Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve
                the performance for some models. If not set, this will default to `self.config.do_normalize`.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.np.array` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r9   z2Only mono-channel audio is supported for input to r   c                 N    g | ]"}t          j        |t           j                   #S rs   rb   asarrayrx   )r   speechs     rP   r   z4UnivNetFeatureExtractor.__call__.<locals>.<listcomp>  s)    XXX6"*V2:>>>XXXrQ   rs   c                 \    g | ](}t          j        |d j        z  fj                  )S )r   )constant_values)rb   rc   r(   r%   )r   rX   r   rM   s     rP   r   z4UnivNetFeatureExtractor.__call__.<locals>.<listcomp>  sJ        x!Z$/%A!BTXTfggg  rQ   r   )r   r   r   r   r8   c                 :    g | ]}                     |          S rD   )rn   )r   rX   rM   s     rP   r   z4UnivNetFeatureExtractor.__call__.<locals>.<listcomp>  s'    ZZZxD00::ZZZrQ   c                 N    g | ]"}t          j        |t           j                   #S r   r   r   mels     rP   r   z4UnivNetFeatureExtractor.__call__.<locals>.<listcomp>  s*    /n/n/nVY
3bj0Q0Q0Q/n/n/nrQ   c                 L    g | ]!}|                     t          j                  "S rD   )astyperb   rx   r   s     rP   r   z4UnivNetFeatureExtractor.__call__.<locals>.<listcomp>  s&    /c/c/c3

2:0F0F/c/c/crQ   attention_maskc                 N    g | ]"}t          j        |t           j                   #S r   )rb   r   int32)r   arrays     rP   r   z4UnivNetFeatureExtractor.__call__.<locals>.<listcomp>  s*    -l-l-lTYbjbh.O.O.O-l-l-lrQ   r   c                 R    g | ]#}                     |j        d                    $S )r   )r{   shape)r   r	   rq   rM   s     rP   r   z4UnivNetFeatureExtractor.__call__.<locals>.<listcomp>  sA        ##K$5a$8)DD  rQ   r   c                 :    g | ]}                     |          S rD   )rU   )r   r	   rM   s     rP   r   z4UnivNetFeatureExtractor.__call__.<locals>.<listcomp>  s2     0 0 00;{++0 0 0rQ   )r&   r$   
ValueErrorrO   __name__loggerwarning
isinstancerb   ndarraylenr   listtupler   rx   rt   float64r   r6   r   rc   rH   getconvert_to_tensors)rM   r   r$   r   r   r   r   r   rq   r   r   r&   r8   r   is_batched_numpy
is_batchedbatched_speechpadded_inputsr   mel_spectrogramsr   rz   s   `       ` `           rP   __call__z UnivNetFeatureExtractor.__call__  s   X (4'?||TEV$ 222 W$.Ja W W)-);W W)-);W WFSW W W   3 NN\W[WeWn \ \ \  
 &j"*==[#jFVBWBWZ[B[ 	ZJ$4 5 5 9 9XRVXXYYY% 
zD%=11lz*Q-RTR\^ceiQj7k7k 	  	7XXZXXXJJ 	7Jz2:$F$F 	7JbjAAAJJ
BJ// 	7J4DQSQ[H\H\4\4\#**2:66J  	D*ZrzBBBCJ  	'1'=4CVJ     *  J
 &'7&DEE%/%;zzAU!1"7 ! 
 
 '**+;<<ZZZZ>ZZZnQ'.. 	d/n/n]m/n/n/nN+,,/c/cRb/c/c/cN+, '**+;<<%-l-l]k-l-l-lN>* 	5    #12B#C  E 05N+, 	0 0 0 0?MN^?_0 0 0N+, %+>>~NNNrQ   c                 h    t                                                      }g d}|D ]	}||v r||= 
|S )N)rK   rL   rI   rJ   rH   )rE   to_dict)rM   outputnamesr;   rO   s       rP   r   zUnivNetFeatureExtractor.to_dict  sI    "" QPP 	! 	!Dv~~4LrQ   )r   r   r   Fr   r   r   r   r   r   r   Nr   Fr   r   r    r!   r"   r   Tr   )NTNTNTNFNNNN)r   
__module____qualname____doc__model_input_namesrd   rG   boolstrr   rF   rU   rW   rb   r   rn   ru   	Generatorr{   r   r   r   r   r   r   r   dictr   r   __classcell__)rO   s   @rP   r   r      s       C CJ MLL """)'+ $$'&*21!# "-J- J-J- J- 	J-
 J- J- J- J- J-  }J- J- J- uoJ- J- J-  "!J-" $#J-$ %J-& 'J-( )J-* +J- J- J- J- J- J-Xh h hh h h.%
 .%rz .% .% .% .%f 48  BI/0 
	   @ RZ@P    4 (,59$(,0!37$(&*04;?_ _"*d5k4
3CT$u+EVVW_  }_ tS/12	_
 SM_ _ %SM_ _ BI/0_ _ SM_ sm_  (~_ !sJ!78_ 
_ _ _ _B	c3h 	 	 	 	 	 	 	 	 	 	rQ   r   )r   typingr   r   r   r   rb   audio_utilsr   r   r	   r
   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r   r   
get_loggerr   r   r   __all__rD   rQ   rP   <module>r      s    0 / ' ' ' ' ' ' ' ' ' '     \ \ \ \ \ \ \ \ \ \ \ \ I I I I I I 4 4 4 4 4 4 9 9 9 9 9 9 9 9 9 9 
	H	%	%k k k k k6 k k k\ %
%rQ   