
     `i;                        d dl Z d dlmZ d dlmZmZ d dlZddlm	Z	 ddl
mZ ddlmZmZmZ  ej        e          Z	 dded	ed
ededededee         dej        fdZdej        dedededej        f
dZ G d de	          ZdgZdS )    N)Sequence)OptionalUnion   )SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingn_freqsf_minf_maxn_melssample_rate
fft_lengthnormreturnc                 2   ||dk    rt          d          t          j        | t          j                  ||z  z  }dt	          j        d|dz  z             z  }dt	          j        d|dz  z             z  }	t          j        ||	|dz             }
dd	|
dz  z  dz
  z  }|d
d         |dd         z
  }t          j        |d          t          j        |d
          z
  }t          j        d
t          j                  }d|ddddf         z  |dd         z  }|ddddf         |d
d         z  }t          j	        |t          j
        ||                    }|9|dk    r3d|d|dz            |d|         z
  z  }|t          j        |d          z  }|S )a  Create a frequency bin conversion matrix (NumPy version).

    Args:
        n_freqs (int): Number of frequencies to highlight/apply
        f_min (float): Minimum frequency (Hz)
        f_max (float): Maximum frequency (Hz)
        n_mels (int): Number of mel filterbanks
        sample_rate (int): Sample rate of the audio waveform
        fft_length (int): FFT length
        norm (Optional[str]): If 'slaney', divide the triangular mel weights by
          the width of the mel band (area normalization). (Default: ``None``)

    Returns:
        np.ndarray: Triangular filter banks (fb matrix) of size (``n_freqs``,
        ``n_mels``)
        meaning number of frequencies to highlight/apply to x the number of
        filterbanks.
        Each column is a filterbank so that assuming there is a matrix A of
        size (..., ``n_freqs``), the applied result would be
        ``A @ create_fb_matrix_numpy(A.shape[-1], ...)``.
    Nslaneyz$norm must be one of None or 'slaney'dtypeg     F@      ?g     @   
      r   g      g       @)
ValueErrornparangefloat32mathlog10linspaceexpand_dimszerosmaximumminimum)r   r   r   r   r   r   r   	all_freqsm_minm_maxm_ptsf_ptsf_diffslopeszerodown_slopes	up_slopesfbenorms                      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/gemma3n/feature_extraction_gemma3n.pycreate_fb_matrixr6      s   > DH,,?@@@ 	'444j8PQI TZuu} 5666ETZuu} 5666EKufqj11EREFN+c12E122Yss#F^E1%%y!(D(DDF8ARZ(((D&CRC.(F3B3K7Kqqq!""uqrr
*I	D"*[)<<	=	=BDH,,uQ!^,uWfW~=>
bnUA&&&I    array	dimensionsizestepc                    | j         dk    rt          d          |dk    r|| j         dz
  k    rt          d          | j        \  }}||z
  |z  dz   }|dk    rt          j        |d|f| j                  S |||f}| j        d         | j        d         |z  | j        d         f}t          j        j        	                    | ||          S )	zNA basic NumPy equivalent of PyTorch's unfold for 2D arrays along the last dim.r   zFThis unfold implementation currently supports 2D arrays (batch, time).r   r   zFThis unfold implementation only supports unfolding the last dimension.r   r   )shapestrides)
ndimr   r=   r   r&   r   r>   libstride_tricks
as_strided)	r8   r9   r:   r;   
batch_sizeoriginal_length
num_framesoutput_shapeoutput_stridess	            r5   _unfoldrH   [   s    zQabbbB9
Q66abbb"'+J!D(T1A5JQxQ-U[AAAA
D1LmA&a(84(?qAQRN6**5n*]]]r7   c            #           e Zd ZdZddgZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d.dededededededededededededededee	e                  dee	e                  f  fd Z
d!ej        d"ej        d#eej        ej        f         fd$Z	 	 	 	 	 	 d/d'eej        ee         eej                 eee                  f         d(eeeef         d)ee         d*ed+ee         d,eeeef                  dee         d#efd-Z xZS )0Gemma3nAudioFeatureExtractoraT
  An audio feature extractor Universal Speech Models https://huggingface.co/papers/2303.01037.

    Args:
        feature_size (`int`, *optional*, defaults to 128):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether to return the attention mask for the generated MEL spectrograms.
        frame_length_ms (`float`, *optional*, defaults to 32.0):
            The length of a frame in milliseconds.
        hop_length_ms (`float`, *optional*, defaults to 10.0):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        min_frequency (`float`, *optional*, defaults to 125.0):
            The minimum frequency (in Hz) for the Mel filterbank.
        max_frequency (`float`, *optional*, defaults to 7600.0):
            The maximum frequency (in Hz) for the Mel filterbank.
        preemphasis (`float`, *optional*, defaults to 0.97):
            The preemphasis coefficient.
        preemphasis_htk_flavor (`bool`, *optional*, defaults to `True`):
            Whether to use HTK-style preemphasis.
        fft_overdrive (`bool`, *optional*, defaults to `True`):
            Whether to use FFT overdrive.
        dither (`float`, *optional*, defaults to 0.0):
            Adds dithering. In other words, adds a small Gaussian noise to each frame.
            E.g. use 0.0001 to add dithering with a normal distribution centered
            around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
            The value 0.0 means no dithering.
            Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
            the high log_mel_fbank values for signals with hard-zero sections,
            when VAD cutoff is present in the signal.
        input_scale_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor applied to the input waveform.
        mel_floor (`float`, *optional*, defaults to 1e-05):
            Minimum value for Mel spectrograms to avoid log(0).
        per_bin_mean (`Optional[Sequence[float]]`, *optional*):
            Mean values for per-bin normalization.
        per_bin_stddev (`Optional[Sequence[float]]`, *optional*):
            Standard deviation values for per-bin normalization.
    input_featuresinput_features_mask   >          T      @@      $@     @_@     @
ףp=
?r   h㈵>Nfeature_sizesampling_ratepadding_valuereturn_attention_maskframe_length_mshop_length_msmin_frequencymax_frequencypreemphasispreemphasis_htk_flavorfft_overdriveditherinput_scale_factor	mel_floorper_bin_meanper_bin_stddevc           	      <    t                      j        d||||d| || _        || _        |	| _        |
| _        || _        || _        || _        t          t          ||z  dz                      | _        t          t          ||z  dz                      | _        t          j        |t          j                  | _        dt#          j        t#          j        | j                            z  }| j        r|dz  }|| _        t          j        | j        t          j                  }ddt          j        dt          j        z  |z  | j        z            z
  z  }|                    t          j                  | _        t7          | j        dz  dz   |||| j        d |          | _        |/t          j        |                              dd|          | _        nd | _        |0t          j        |                              dd|          | _         d S d | _         d S )	N)rV   rW   rX   rY   g     @@r   r   g      ?r   )r   r   r   r   r   r   r    )!super__init__r\   r]   r^   r_   r`   ra   rb   introundframe_length
hop_lengthr   r8   float64rc   r"   ceillog2r   r    r!   cospiastypewindowr6   rW   mel_filtersreshaperd   re   )selfrV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   kwargsr   hann_arangert   	__class__s                        r5   ri   z%Gemma3nAudioFeatureExtractor.__init__   s   ( 	 	
%''"7		
 	

 	
 	
 	
 +*&&<#*"4mo&E&N O OPPeMM$AF$JKKLL)2:>>>$)DId.?$@$@AAA
 	!OJ$i 1DDDBF1ru9{#:T=N#NOOOPmmBJ//+Oq(1,*!
 
 
 # " 6 6 > >q!\ R RD $D%"$(>":":"B"B1a"V"VD"&Dr7   waveformattention_maskr   c                 b   |j         dk    rt          j        |d          }| j        dk    r;|| j        t          j        j        |j                             |j                  z  z   }| j	        dk    r
|| j	        z  }| j
        dz   }t          |d|| j                  }| j        dk    rz| j        rQ|dd	df         d| j        z
  z  }|dddf         | j        |dd	d
f         z  z
  }t          j        ||gd          }n.|ddd	f         | j        |dd	df         z  z
  }n|dd	df         }|| j        z  }t          j                            || j        d          }t          j        |          }	t          j        |	| j                  }
t          j        t          j        |
| j                            }| j        
|| j        z
  }| j        
|| j        z  }|                    d          }|d	d	| j                                     t:                    }||d	|j        d                  fS ) r   r   )axisrO   r   r   )r9   r:   r;   .Nr   )nr   )r?   r   r%   ra   randomrandnr=   rs   r   rb   rl   rH   rm   r^   r_   concatenatert   fftrfftr   absmatmulru   logr'   rc   rd   re   squeezebool)rw   r{   r|   frame_size_for_unfoldframes_to_processfirst_in_framerest_in_frameframesstftmagnitude_specmel_speclog_mel_specmel_spectrogrammasks                 r5   _extract_spectrogramz1Gemma3nAudioFeatureExtractor._extract_spectrogram   sV   =A~hQ777H;$+	0P0W0WX`Xf0g0g"ggH"c))$"99H $ 1A 5 $HAV]a]lmmmc!!* e!237!;sTEU?U!V 1#qt) <t?ORcdgiljlildlRm?m m(GbQQQ*373d6FIZ[^`cac`c[cId6dd&sCRCx0F$+%v{{6T_2{>>9^T-=>>vbj4>BBCC('$*;;L*'$*==L&..q11000188>>%?'<Q'?%? @@@r7   longest S 
raw_speechpadding
max_length
truncationpad_to_multiple_ofreturn_tensorsc                 "   t          |t          j                  ot          |j                  dk    }	t          |t
                    o&t          |d         t          j        t
          f          }
|	p|
}|rd |D             }n0|s.t          |t          j                  st          j        |          }|st          j        |g          g}|                     t          d|i          |||||          }g }g }t          |j
        |j                  D ]j\  }}|                     |j        |          \  }}|                    |                    t          j                             |                    |           kt          ||d|          S )a  Creates a batch of MEL spectrograms from the provided raw speech.

        This implementation uses a different algorithm for windowing and preemphasis compared to the built-in
        `transformers.audio_utils.spectrogram()` function that _will_ result in different outputs. Consider this
        carefully when selecting an audio feature extractor, especially with pre-trained models.

        Args:
            raw_speech:
                The audio for which MEL spectrograms are created.
            padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `"longest"`):
                The padding strategy to use for batches of audio with different lengths.
            max_length (`int`, *optional*, defaults to 480000):
                If provided, defines the maximum length of the audio to allow. Audio longer than this will be
                truncated if `truncation=True`.
            truncation (`bool`, *optional*, defaults to `True`):
                Whether or not to truncate audio above `max_length`.
            pad_to_multiple_of (`int`, *optional*, defaults to 128):
                When padding, pad to a multiple of this value. The default value is defined for optimal TPU support.
            return_tensors (`Union[str, TensorType]`, *optional*, defaults to `None`):
                The type of tensors to return (e.g., NumPy, Torch, JAX, TensorFlow).
            return_attention_mask (`bool`, *optional*, defaults to `True`):
                Whether to return the attention mask for the generated MEL spectrograms.
        r   r   c                 B    g | ]}t          j        |g          j        S rg   )r   asarrayT).0rss     r5   
<listcomp>z9Gemma3nAudioFeatureExtractor.__call__.<locals>.<listcomp>5  s'    BBB"*bT**,BBBr7   rK   )r   r   r   r   rY   )rK   rL   )tensor_type)
isinstancer   ndarraylenr=   r   r   padr   ziprK   r|   r   r   appendrs   r!   )rw   r   r   r   r   r   r   rY   rx   is_batched_numpyis_batched_sequence
is_batchedbatched_speechprepared_speechprepared_speech_maskspeechr   s                    r5   __call__z%Gemma3nAudioFeatureExtractor.__call__  s   F &j"*==[#jFVBWBWZ[B[(X>>t:jYZm^`^hjr]sCtCt%<)<
 	0BBzBBBJJ 	0Jz2:$F$F 	0J//J 	4*j\223J*J788!!1"7 " 
 
 ! =~?\]] 	. 	.LFD44VXtDDLFD""6==#<#<=== ''----.G[\\&
 
 
 	
r7   )rM   rN   rO   TrP   rQ   rR   rS   rT   TTrO   r   rU   NN)r   r   TrM   NT)__name__
__module____qualname____doc__model_input_namesrj   floatr   r   r   ri   r   r   tupler   r   liststrr	   r
   r   r   __classcell__)rz   s   @r5   rJ   rJ   n   ss       ) )V *+@A  #"&*!%#$%!'+"$'2648#B' B'B' B' 	B'
  $B' B' B' B' B' B' !%B' B' B' "B' B'  x/!B'" !%1#B' B' B' B' B' B'H+ARZ +A +AX]^`^hjljt^tXu +A +A +A +A` 6?$+,/;?04B
 B
"*d5k4
3CT$u+EVVWB
 tS/12B
 SM	B

 B
 %SMB
 !sJ!78B
  (~B
 
B
 B
 B
 B
 B
 B
 B
 B
r7   rJ   )N)r"   collections.abcr   typingr   r   numpyr   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr	   r
   r   
get_loggerr   loggerrj   r   r   r   r6   rH   rJ   __all__rg   r7   r5   <module>r      s     $ $ $ $ $ $ " " " " " " " "     I I I I I I 4 4 4 4 4 4 9 9 9 9 9 9 9 9 9 9 
	H	%	% : ::: : 	:
 : : 3-: Z: : : :z^2: ^# ^S ^ ^
 ^ ^ ^ ^&a
 a
 a
 a
 a
#; a
 a
 a
H *
*r7   