
     `it                         d dl mZ d dlmZmZ d dlZd dlZddlm	Z	m
Z
 ddlmZ ddlmZmZ  e
j        e          Z e	 ed	d	
                     G d de                      ZdS )    )UserDict)AnyUnionN   )add_end_docstringslogging   )ffmpeg_read)Pipelinebuild_pipeline_init_argsT)has_feature_extractorhas_tokenizerc            	            e Zd ZdZdZdZdZdZ fdZde	e
j        eeef         dedeeeef                  f fdZd	 ZddZd Zd Z xZS )#ZeroShotAudioClassificationPipelinea  
    Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
    provide an audio and a set of `candidate_labels`.

    <Tip warning={true}>

    The default `hypothesis_template` is : `"This is a sound of {}."`. Make sure you update it for your usage.

    </Tip>

    Example:
    ```python
    >>> from transformers import pipeline
    >>> from datasets import load_dataset

    >>> dataset = load_dataset("ashraq/esc50")
    >>> audio = next(iter(dataset["train"]["audio"]))["array"]
    >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
    >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"])
    [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vacuum cleaner'}]
    ```


    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio
    classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-audio-classification"`. See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification).
    FTc                      t                      j        di | | j        dk    rt          d| j         d          d S )NptzThe z is only available in PyTorch. )super__init__	framework
ValueError	__class__)selfkwargsr   s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/pipelines/zero_shot_audio_classification.pyr   z,ZeroShotAudioClassificationPipeline.__init__D   sQ    ""6""">T!!RDNRRRSSS "!    audiosr   returnc                 8     t                      j        |fi |S )a  
        Assign labels to the audio(s) passed as inputs.

        Args:
            audios (`str`, `list[str]`, `np.array` or `list[np.array]`):
                The pipeline handles three types of inputs:
                - A string containing a http link pointing to an audio
                - A string containing a local path to an audio
                - An audio loaded in numpy
            candidate_labels (`list[str]`):
                The candidate labels for this audio. They will be formatted using *hypothesis_template*.
            hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`):
                The format used in conjunction with *candidate_labels* to attempt the audio classification by
                replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
                already formatted.
        Return:
            A list of dictionaries containing one entry per proposed label. Each dictionary contains the
            following keys:
            - **label** (`str`) -- One of the suggested *candidate_labels*.
            - **score** (`float`) -- The score attributed by the model to that label. It is a value between
                0 and 1, computed as the `softmax` of `logits_per_audio`.
        )r   __call__)r   r   r   r   s      r   r    z,ZeroShotAudioClassificationPipeline.__call__K   s$    .  uww11&111r   c                 L    i }d|v r|d         |d<   d|v r|d         |d<   |i i fS )Ncandidate_labelshypothesis_templater   )r   r   preprocess_paramss      r   _sanitize_parametersz8ZeroShotAudioClassificationPipeline._sanitize_parametersd   sO    ''4:;M4N01 F**7=>S7T34 "b((r   NThis is a sound of {}.c                 ,   t          |t                    r|                    d          s|                    d          rt          j        |          j        }n<t          |d          5 }|                                }d d d            n# 1 swxY w Y   t          |t                    rt          || j
        j                  }t          |t          j                  st          d          t          |j                  dk    rt#          d          | 
                    |g| j
        j        d          }| j        dk    r|                    | j                  }||d	<   fd
|D             }|                     || j        d          }|g|d<   |S )Nzhttp://zhttps://rbz"We expect a numpy ndarray as inputr	   zNWe expect a single channel audio input for ZeroShotAudioClassificationPipeliner   )sampling_ratereturn_tensorsr"   c                 :    g | ]}                     |          S r   )format).0xr#   s     r   
<listcomp>zBZeroShotAudioClassificationPipeline.preprocess.<locals>.<listcomp>   s(    MMMq(//22MMMr   T)r*   paddingtext_inputs)
isinstancestr
startswithrequestsgetcontentopenreadbytesr
   feature_extractorr)   npndarray	TypeErrorlenshaper   r   todtype	tokenizer)r   audior"   r#   finputs	sequencesr1   s      `    r   
preprocessz.ZeroShotAudioClassificationPipeline.preprocessm   s   eS!! 	%	** %e.>.>z.J.J % !U++3%&& %!FFHHE% % % % % % % % % % % % % % % eU## 	Mt'='KLLE%,, 	B@AAAu{q  mnnn''G4#9#GX\ ( 
 
 >T!!YYtz**F%5!"MMMM<LMMM	nnYt~W[n\\!,}s   +BBBc                     |                     d          }|                     d          }t          |d         t                    r	|d         }n|d         d         } | j        di ||}||j        d}|S )Nr"   r1   r   )r"   logitsr   )popr2   r   modellogits_per_audio)r   model_inputsr"   r1   outputsmodel_outputss         r   _forwardz,ZeroShotAudioClassificationPipeline._forward   s    '++,>??"&&}55k!nh// 	,%a.KK &a.+K$*;;{;l;; !1.
 
 r   c                 *   |                     d          }|d         d         }| j        dk    r+|                    d          }|                                }nt	          d          d t          t          ||          d 	          D             }|S )
Nr"   rJ   r   r   )dimz`tf` framework not supported.c                     g | ]
\  }}||d S ))scorelabelr   )r-   rU   candidate_labels      r   r/   zCZeroShotAudioClassificationPipeline.postprocess.<locals>.<listcomp>   s4     
 
 
& o66
 
 
r   c                     | d          S )Nr   r   )r.   s    r   <lambda>zAZeroShotAudioClassificationPipeline.postprocess.<locals>.<lambda>   s    _`ab_c^c r   )key)rK   r   softmaxtolistr   sortedzip)r   rP   r"   rJ   probsscoresresults          r   postprocessz/ZeroShotAudioClassificationPipeline.postprocess   s    (,,-?@@x(+>T!!NNqN))E\\^^FF<===
 
*0V=M1N1NTcTc*d*d*d
 
 
 r   )Nr&   )__name__
__module____qualname____doc___load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   r   r<   r=   r:   r3   dictr   listr    r%   rH   rQ   rb   __classcell__)r   s   @r   r   r       s         : O!"OT T T T T2uRZT%AB 2c 2VZ[_`ceh`h[iVj 2 2 2 2 2 22) ) )   :  "      r   r   )collectionsr   typingr   r   numpyr<   r5   utilsr   r   audio_classificationr
   baser   r   
get_loggerrc   loggerr   r   r   r   <module>rv      s   !                               . - - - - - 4 4 4 4 4 4 4 4 
	H	%	% ,,4W[\\\]]H H H H H( H H ^]H H Hr   