
     `i                          d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	m
Z
 ddlmZmZmZ dd	lmZ dd
lmZ  G d ded          Z G d de	          ZdgZdS )z%
Speech processor class for Wav2Vec2
    N)contextmanager)OptionalUnion   )ProcessingKwargsProcessorMixinUnpack)
AudioInputPreTokenizedInput	TextInput   )Wav2Vec2FeatureExtractor)Wav2Vec2CTCTokenizerc                       e Zd Zi ZdS )Wav2Vec2ProcessorKwargsN)__name__
__module____qualname__	_defaults     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/wav2vec2/processing_wav2vec2.pyr   r      s        IIIr   r   F)totalc            
            e Zd ZdZdZdZ fdZe fd            Z	 	 	 	 dde	e
         de	eeee         eef                  d	ee         fd
Zd Zed             Zed             Z xZS )Wav2Vec2Processora  
    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
    processor.

    [`Wav2Vec2Processor`] offers all the functionalities of [`Wav2Vec2FeatureExtractor`] and [`PreTrainedTokenizer`].
    See the docstring of [`~Wav2Vec2Processor.__call__`] and [`~Wav2Vec2Processor.decode`] for more information.

    Args:
        feature_extractor (`Wav2Vec2FeatureExtractor`):
            An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
    r   AutoTokenizerc                 r    t                                          ||           | j        | _        d| _        d S )NF)super__init__feature_extractorcurrent_processor_in_target_context_manager)selfr    	tokenizer	__class__s      r   r   zWav2Vec2Processor.__init__3   s7    *I666!%!7*/'''r   c                    	  t                      j        |fi |S # t          t          f$ rW t	          j        d| j         dt                     t          j        |fi |}t          j        |fi |} | ||          cY S w xY w)NzLoading a tokenizer inside a   from a config that does not include a `tokenizer_class` attribute is deprecated and will be removed in v5. Please add `'tokenizer_class': 'Wav2Vec2CTCTokenizer'` attribute to either your `config.json` or `tokenizer_config.json` file to suppress this warning: )r    r$   )
r   from_pretrainedOSError
ValueErrorwarningswarnr   FutureWarningr   r   )clspretrained_model_name_or_pathkwargsr    r$   r%   s        r   r'   z!Wav2Vec2Processor.from_pretrained8   s    	Q*577*+HSSFSSS$ 	Q 	Q 	QM2cl 2 2 2
    !9 HIf q qjp q q,<=Zee^deeI3):iPPPPPP	Qs    A%BBNaudiotextr/   c                    d|v r)t          j        d           |                    d          }||t          d           | j        t
          fd| j        j        i|}| j        r$ | j	        |fi |d         |d         |d         S | | j
        |fi |d         }| | j        |fi |d         }||S ||S |d	         |d
<   |S )a*  
        This method forwards all arguments to [`Wav2Vec2FeatureExtractor.__call__`] and/or
        [`PreTrainedTokenizer.__call__`] depending on the input modality and returns their outputs. If both modalities are passed, [`Wav2Vec2FeatureExtractor.__call__`] and [`PreTrainedTokenizer.__call__`] are called.

        Args:
            audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
                An audio input is passed to [`Wav2Vec2FeatureExtractor.__call__`].
            text (`str`, `List[str]`, *optional*):
                A text input is passed to [`PreTrainedTokenizer.__call__`].


        Returns:
            This method returns the results of each `call` method. If both are used, the output is a dictionary containing the results of both.
        
raw_speechzLUsing `raw_speech` as a keyword argument is deprecated. Use `audio` instead.NzAYou need to specify either an `audio` or `text` input to process.tokenizer_init_kwargsaudio_kwargstext_kwargscommon_kwargs	input_idslabels)r*   r+   popr)   _merge_kwargsr   r$   init_kwargsr"   r!   r    )	r#   r0   r1   imagesvideosr/   output_kwargsinputs	encodingss	            r   __call__zWav2Vec2Processor.__call__K   sG   , 6!!MhiiiJJ|,,E=T\`aaa**#
 
"&."<
 
 
 * 	)4) /  .  0	   +T+ESS]>5RSSF&tLL}]/KLLI<M](5F8Mr   c                 Z   | j         r | j        j        |i |S |                    dd          }|                    dd          }t	          |          dk    r|d         }|dd         }| | j        j        |g|R i |}| | j        j        |fi |}||S ||S |d         |d<   |S )ag  
        This method operates on batches of extracted features and/or tokenized text. It forwards all arguments to
        [`Wav2Vec2FeatureExtractor.pad`] and/or [`PreTrainedTokenizer.pad`] depending on the input modality and returns their outputs. If both modalities are passed, [`Wav2Vec2FeatureExtractor.pad`] and [`PreTrainedTokenizer.pad`] are called.

        Args:
            input_features:
                When the first argument is a dictionary containing a batch of tensors, or the `input_features` argument is present, it is passed to [`Wav2Vec2FeatureExtractor.pad`].
            labels:
                When the `label` argument is present, it is passed to [`PreTrainedTokenizer.pad`].

        Returns:
            This method returns the results of each `pad` method. If both are used, the output is a dictionary containing the results of both.
        input_featuresNr9   r   r   r8   )r"   r!   padr:   lenr    r$   )r#   argsr/   rD   r9   s        r   rE   zWav2Vec2Processor.pad   s     * 	?-4)-t>v>>>$4d;;Hd++t99q==!!WN8D%7T37XXXXQWXXN'T^'99&99F>!!#M'-k':N8$!!r   c                 &    | j         j        }|dgz   S )Nr9   )r    model_input_names)r#   feature_extractor_input_namess     r   rI   z#Wav2Vec2Processor.model_input_names   s     )-(>(P%,z99r   c              #      K   t          j        d           d| _        | j        | _        dV  | j        | _        d| _        dS )z
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
        Wav2Vec2.
        z`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your labels by using the argument `text` of the regular `__call__` method (either in the same call as your audio inputs, or in a separate call.TNF)r*   r+   r"   r$   r!   r    )r#   s    r   as_target_processorz%Wav2Vec2Processor.as_target_processor   sW       	8	
 	
 	

 +/'!%!%!7*/'''r   )NNNN)r   r   r   __doc__feature_extractor_classtokenizer_classr   classmethodr'   r   r
   r   strlistr   r   r	   r   rB   rE   propertyrI   r   rL   __classcell__)r%   s   @r   r   r   !   s&         9%O0 0 0 0 0
 Q Q Q Q [Q( '+NR6 6
#6 uS$s)Y8IIJK6 016 6 6 6p#" #" #"J : : X:
 0 0 ^0 0 0 0 0r   r   )rM   r*   
contextlibr   typingr   r   processing_utilsr   r   r	   tokenization_utils_baser
   r   r   feature_extraction_wav2vec2r   tokenization_wav2vec2r   r   r   __all__r   r   r   <module>r\      s     % % % % % % " " " " " " " " H H H H H H H H H H O O O O O O O O O O A A A A A A 7 7 7 7 7 7    .e    \0 \0 \0 \0 \0 \0 \0 \0~ 
r   