
     `iY-                         d Z ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZmZmZ  ej        e          Z G d d	e          Zd	gZdS )
z&
Feature extractor class for Wav2Vec2
    )OptionalUnionN   )SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                       e Zd ZdZddgZ	 	 	 	 	 d fd		Ze	 ddeej	                 deej	                 d
e
deej	                 fd            Z	 	 	 	 	 	 	 ddeej	        ee
         eej	                 eee
                  f         deeeef         dee         dedee         dee         deeeef                  dee         defdZ xZS )Wav2Vec2FeatureExtractora  
    Constructs a Wav2Vec2 feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance for some models, *e.g.*,
            [wav2vec2-lv60](https://huggingface.co/models?search=lv60).
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether or not [`~Wav2Vec2FeatureExtractor.__call__`] should return `attention_mask`.

            <Tip>

            Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
            [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
            `attention_mask`. For such models, `input_values` should simply be padded with 0 and no `attention_mask`
            should be passed.

            For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
            [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
            passed for batched inference.

            </Tip>input_valuesattention_mask   >          FTc                 ^     t                      j        d|||d| || _        || _        d S )N)feature_sizesampling_ratepadding_value )super__init__return_attention_maskdo_normalize)selfr   r   r   r   r   kwargs	__class__s          /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.pyr   z!Wav2Vec2FeatureExtractor.__init__C   sC     	wl-_lwwpvwww%:"(    r   returnc                    |t          j        |t           j                  }g }t          | |                    d                    D ]\  }}||d|                                         z
  t          j        |d|                                         dz             z  }||j        d         k     r|||d<   |	                    |           nd | D             }|S )z[
        Every array in the list is normalized to have zero mean and unit variance
        NHz>r   c                     g | ]C}||                                 z
  t          j        |                                d z             z  DS )r#   )meannpsqrtvar).0xs     r   
<listcomp>zDWav2Vec2FeatureExtractor.zero_mean_unit_var_norm.<locals>.<listcomp>b   s@    "b"b"bPQALBGAEEGGdN4K4K#K"b"b"br   )
r&   arrayint32zipsumr%   r'   r(   shapeappend)r   r   r   normed_input_valuesvectorlengthnormed_slices          r   zero_mean_unit_var_normz0Wav2Vec2FeatureExtractor.zero_mean_unit_var_normP   s     %Xnbh??N"$"%lN4F4Fr4J4J"K"K 9 9 &)=)=)?)? ?276RYSYRY?K^K^K`K`cgKgChChhL.q111,9L)#**<88889 #c"bUa"b"b"b""r   N
raw_speechpadding
max_length
truncationpad_to_multiple_ofr   return_tensorsr   c	                    |2|| j         k    r&t          d|  d| j          d| j          d| d	          n(t                              d| j        j         d           t          |t          j                  ot          |j
                  d	k    }
|
r*t          |j
                  d
k    rt          d|            |
pHt          |t          t          f          o,t          |d         t          j        t          t          f          }|s|g}t          d|i          }|                     ||||||          }|d         }t          |d         t          j                  sd |D             |d<   nt          |t          j                  s[t          |d         t          j                  r;|d         j        t          j        t          j                  u rd |D             |d<   nat          |t          j                  rG|j        t          j        t          j                  u r"|                    t          j                  |d<   |                    d          }|d |D             |d<   | j        rM|                     ||          t,          j        ur|nd}|                     |d         || j                  |d<   ||                    |          }|S )a  
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
                [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
                `attention_mask`. For such models, `input_values` should simply be padded with 0 and no
                `attention_mask` should be passed.

                For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
                [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should
                be passed for batched inference.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
            padding_value (`float`, *optional*, defaults to 0.0):
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r      z2Only mono-channel audio is supported for input to r   r   )r8   r9   r:   r;   r   c                 N    g | ]"}t          j        |t           j                   #S )dtype)r&   asarrayfloat32r)   r,   s     r   r+   z5Wav2Vec2FeatureExtractor.__call__.<locals>.<listcomp>   s*    ,k,k,kUZRZRZ-P-P-P,k,k,kr   c                 L    g | ]!}|                     t          j                  "S r   )astyper&   rD   rE   s     r   r+   z5Wav2Vec2FeatureExtractor.__call__.<locals>.<listcomp>   s&    ,`,`,`%U\\"*-E-E,`,`,`r   r   c                 N    g | ]"}t          j        |t           j                   #S rA   )r&   rC   r-   rE   s     r   r+   z5Wav2Vec2FeatureExtractor.__call__.<locals>.<listcomp>   s*    .m.m.mUZrz%rx/P/P/P.m.m.mr   )r9   )r   r   )r   
ValueErrorloggerwarningr   __name__
isinstancer&   ndarraylenr0   listtupler   padrB   float64rG   rD   getr   _get_padding_strategiesr   
DO_NOT_PADr6   r   convert_to_tensors)r   r7   r8   r9   r:   r;   r   r<   r   r   is_batched_numpy
is_batchedencoded_inputspadded_inputsr   r   s                   r   __call__z!Wav2Vec2FeatureExtractor.__call__f   sa   L $ 222 F$ F F*F F*F F5BF F F   3 NN\W[WeWn \ \ \  
 &j"*==[#jFVBWBWZ[B[ 	ZJ$4 5 5 9 9XRVXXYYY% 
zD%=11lz*Q-RTR\^ceiQj7k7k 	
  	&$J &~z&BCC!!1"7 ! 
 
 %^4,q/2:66 		L,k,k^j,k,k,kM.))<44	L<?BJ77	L Q%"*)=)===,`,`S_,`,`,`M.))bj11 	Ll6HBHUWU_L`L`6`6`,8,?,?
,K,KM.) '**+;<<%.m.m^l.m.m.mM*+  	 //J/OOWfWqqq  
 -1,H,Hn-n\`\n -I - -M.) %)<<^LLMr   )r   r   r   FT)r   )FNFNNNN)rL   
__module____qualname____doc__model_input_namesr   staticmethodrP   r&   rN   floatr6   r   boolstrr   r   intr	   r   r\   __classcell__)r   s   @r   r   r      s        B ()9: #) ) ) ) ) ) ad# #2:&#8<RZ8H#Y^#	bj	# # # \#0 6;$( ,004;?'+J J"*d5k4
3CT$u+EVVWJ tS/12J SM	J
 J %SMJ  (~J !sJ!78J  }J 
J J J J J J J Jr   r   )r_   typingr   r   numpyr&   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r	   r
   
get_loggerrL   rJ   r   __all__r   r   r   <module>rn      s     # " " " " " " "     I I I I I I 4 4 4 4 4 4 9 9 9 9 9 9 9 9 9 9 
	H	%	%Q Q Q Q Q7 Q Q Qh &
&r   