
     `i+                     
   d dl Z d dlmZmZ d dlZd dlZddlmZm	Z	m
Z
mZmZ ddlmZmZ  e	            rddlmZ  ej        e          Zded	ed
ej        fdZ e ed                     G d de                      ZdS )    N)AnyUnion   )add_end_docstringsis_torch_availableis_torchaudio_availableis_torchcodec_availablelogging   )Pipelinebuild_pipeline_init_args),MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMESbpayloadsampling_ratereturnc                 ~   | }d}d}dddd|d|d|d	d
ddg}	 t          j        |t           j        t           j                  }n# t          $ r t	          d          w xY w|                    |           }|d         }t          j        |t          j                  }	|	j	        d         dk    rt	          d          |	S )z?
    Helper function to read an audio file through ffmpeg.
    1f32leffmpegz-izpipe:0z-acz-arz-fz-hide_bannerz	-loglevelquietzpipe:1)stdinstdoutzFffmpeg was not found but is required to load audio files from filenamer   zMalformed soundfile)

subprocessPopenPIPEFileNotFoundError
ValueErrorcommunicatenp
frombufferfloat32shape)
r   r   aracformat_for_conversionffmpeg_commandffmpeg_processoutput_stream	out_bytesaudios
             /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/pipelines/audio_classification.pyffmpeg_readr,      s     	B	B#

N c#).
XbXghhh c c cabbbc"..x88Ma IM)RZ00E{1~.///Ls   +A AT)has_feature_extractorc            	            e Zd ZdZdZdZdZdZ fdZde	e
j        eeef         dedeeeef                  f fdZdd
Zd Zd ZddZ xZS )AudioClassificationPipelinea  
    Audio classification pipeline using any `AutoModelForAudioClassification`. This pipeline predicts the class of a
    raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio
    formats.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> classifier = pipeline(model="superb/wav2vec2-base-superb-ks")
    >>> classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
    [{'score': 0.997, 'label': '_unknown_'}, {'score': 0.002, 'label': 'left'}, {'score': 0.0, 'label': 'yes'}, {'score': 0.0, 'label': 'down'}, {'score': 0.0, 'label': 'stop'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)


    This pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"audio-classification"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=audio-classification).
    FTc                     d|v r|d         d |d<   n	d|vrd|d<    t                      j        |i | | j        dk    rt          d| j         d          |                     t                     d S )Ntop_k   ptzThe z is only available in PyTorch.)super__init__	frameworkr   	__class__check_model_typer   )selfargskwargsr7   s      r+   r5   z$AudioClassificationPipeline.__init__b   s    f!8"F7OOF""F7O$)&)))>T!!RDNRRRSSSJKKKKK    inputsr;   r   c                 8     t                      j        |fi |S )a  
        Classify the sequence(s) given as inputs. See the [`AutomaticSpeechRecognitionPipeline`] documentation for more
        information.

        Args:
            inputs (`np.ndarray` or `bytes` or `str` or `dict`):
                The inputs is either :
                    - `str` that is the filename of the audio file, the file will be read at the correct sampling rate
                      to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
                    - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the
                      same way.
                    - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
                        Raw audio at the correct sampling rate (no further check will be done)
                    - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
                      pipeline do the resampling. The dict must be either be in the format `{"sampling_rate": int,
                      "raw": np.array}`, or `{"sampling_rate": int, "array": np.array}`, where the key `"raw"` or
                      `"array"` is used to denote the raw audio waveform.
            top_k (`int`, *optional*, defaults to None):
                The number of top labels that will be returned by the pipeline. If the provided number is `None` or
                higher than the number of labels available in the model configuration, it will default to the number of
                labels.
            function_to_apply(`str`, *optional*, defaults to "softmax"):
                The function to apply to the model output. By default, the pipeline will apply the softmax function to
                the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's
                built-in `None` will default to "softmax", so you need to pass the string "none" to disable any
                post-processing.

        Return:
            A list of `dict` with the following keys:

            - **label** (`str`) -- The label predicted.
            - **score** (`float`) -- The corresponding probability.
        )r4   __call__)r9   r=   r;   r7   s      r+   r?   z$AudioClassificationPipeline.__call__o   s%    D  uww11&111r<   Nc                     i }|| j         j        j        |d<   n+|| j         j        j        k    r| j         j        j        }||d<   ||dvrt          d| d          ||d<   nd|d<   i i |fS )Nr1   )softmaxsigmoidnonez'Invalid value for `function_to_apply`: z2. Valid options are ['softmax', 'sigmoid', 'none']function_to_applyrA   )modelconfig
num_labelsr   )r9   r1   rD   r;   postprocess_paramss        r+   _sanitize_parametersz0AudioClassificationPipeline._sanitize_parameters   s     =*.**;*Fw''tz(333
)4*/w'( (FFF G>O G G G   7H2336?232)))r<   c                 n   t          |t                    r|                    d          s|                    d          rt          j        |          j        }n<t          |d          5 }|                                }d d d            n# 1 swxY w Y   t          |t                    rt          || j
        j                  }t                      r?dd l}t          ||j                  r&|                                                                }t#                      rGdd l}dd l}t          ||j        j                  r%|                                }|j        }||j        d}t          |t0                    r$|                                }d|v rd|v sd|v st5          d	          |                    dd           }|,|                    d
d            |                    dd           }|                    d          }|}|| j
        j        k    rdd l}t9                      rddlm}	 nt?          d          |	                     t          |tB          j"                  r|#                    |          n||| j
        j                                                  }t          |tB          j"                  stI          d          tK          |j&                  dk    rt5          d          | 
                    || j
        j        d          }
| j'        |
(                    | j'                  }
|
S )Nzhttp://zhttps://rbr   )arrayr   r   rawrL   zWhen passing a dictionary to AudioClassificationPipeline, the dict needs to contain a "raw" key containing the numpy array or torch tensor representing the audio and a "sampling_rate" key, containing the sampling_rate associated with that arraypath)
functionalztorchaudio is required to resample audio samples in AudioClassificationPipeline. The torchaudio package can be installed through: `pip install torchaudio`.z2We expect a numpy ndarray or torch tensor as inputr   zFWe expect a single channel audio input for AudioClassificationPipeliner3   )r   return_tensors)dtype))
isinstancestr
startswithrequestsgetcontentopenreadbytesr,   feature_extractorr   r   torchTensorcpunumpyr	   
torchcodecdecodersAudioDecoderget_all_samplesdatasample_ratedictcopyr   popr   
torchaudiorO   ImportErrorresampler   ndarray
from_numpy	TypeErrorlenr"   rQ   to)r9   r=   fr\   r`   _audio_samples_array_inputsin_sampling_rateF	processeds              r+   
preprocessz&AudioClassificationPipeline.preprocess   s   fc"" 	&  ++ &v/@/@/L/L & "f--5&$'' &1VVXXF& & & & & & & & & & & & & & & fe$$ 	O )?)MNNF 	.LLL&%,// .++--"$$ 	XLLL&*"5"BCC X!'!7!7!9!9',#)N<VWWfd## !	[[]]F $v--5F??gQWFWFW N   jj--G

64((( **Wd33%zz/::F4#9#GGG*,, :::::::%e  
 0:62:0N0NZE$$V,,,TZ$*8  %''	  &"*-- 	RPQQQv|!!efff**$"8"FW[ + 
 
	 :!!4:66Is   *BBBc                       | j         di |}|S )N )rE   )r9   model_inputsmodel_outputss      r+   _forwardz$AudioClassificationPipeline._forward   s    "
22\22r<   r2   rA   c                 t    |dk    r!|j         d                             d          }n3|dk    r |j         d                                         }n|j         d         }|                    |          \  }}|                                }|                                } fdt          ||          D             }|S )NrA   r   rB   c                 J    g | ]\  }}|j         j        j        |         d  S ))scorelabel)rE   rF   id2label).0r   _idr9   s      r+   
<listcomp>z;AudioClassificationPipeline.postprocess.<locals>.<listcomp>  s5    pppQ[QVX[EDJ,=,Fs,KLLpppr<   )logitsrA   rB   topktolistzip)r9   r|   r1   rD   probsscoresidslabelss   `       r+   postprocessz'AudioClassificationPipeline.postprocess   s    	))!(+33B77EE)++!(+3355EE!(+Ejj''jjllpppp_bcikn_o_opppr<   )NN)r2   rA   )__name__
__module____qualname____doc___load_processor_load_image_processor_load_feature_extractor_load_tokenizerr5   r   r   rl   rZ   rS   rf   r   listr?   rI   rx   r}   r   __classcell__)r7   s   @r+   r/   r/   B   s         2 O!"OL L L L L"2uRZT%AB "2c "2VZ[_`ceh`h[iVj "2 "2 "2 "2 "2 "2H* * * *,I I IV         r<   r/   )r   typingr   r   r_   r   rU   utilsr   r   r   r	   r
   baser   r   models.auto.modeling_autor   
get_loggerr   loggerrZ   intrl   r,   r/   rz   r<   r+   <module>r      sL                    u u u u u u u u u u u u u u 4 4 4 4 4 4 4 4  YXXXXXX		H	%	%!% ! !
 ! ! ! !H ,,4HHHIIC C C C C( C C JIC C Cr<   