
     `i+                         d dl mZmZmZ ddlmZ ddlmZ ddlm	Z	  e            rd dl
Z
ddlmZ dd	lmZ d
Z G d de	          ZdS )    )AnyUnionoverload   )GenerationConfig)is_torch_available   )PipelineN)%MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING)SpeechT5HifiGanzmicrosoft/speecht5_hifiganc            
       h    e Zd ZdZdZdZdZdZdZdZ e	d          Z
dddd fd
Zd	 Zd
 Zedededeeef         fd            Zedee         dedeeeef                  fd            Zdeeee         f         deeeef         eeeef                  f         f fdZ	 	 	 ddZd Z xZS )TextToAudioPipelinea  
    Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
    pipeline generates an audio file from an input text and optional other conditional inputs.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hey it's HuggingFace on the phone!")

    >>> audio = output["audio"]
    >>> sampling_rate = output["sampling_rate"]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    <Tip>

    You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
    [`TextToAudioPipeline.__call__.generate_kwargs`].

    Example:

    ```python
    >>> from transformers import pipeline

    >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")

    >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
    >>> generate_kwargs = {
    ...     "do_sample": True,
    ...     "temperature": 0.7,
    ...     "max_new_tokens": 35,
    ... }

    >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
    ```

    </Tip>

    This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
    `"text-to-audio"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
    TF   )max_new_tokensN)vocodersampling_rateno_processorc                d    t                      j        |i | || _        | j        dk    rt	          d          d | _        | j        j        t          j	                    v r?|6t          j        t                                        | j        j                  n|| _        || _        | j        | j        j        j        | _        | j        | j        j        }| j        j                            dd           }|'|                    |                                           dD ]M}t+          ||d           }||| _        t+          |dd           t+          |j        |d           }||| _        N| j        4| j        s/t/          | j        d          r| j        j        j        | _        d S d S d S d S )Ntfz5The TextToAudioPipeline is only available in PyTorch.generation_config)sample_rater   codec_configfeature_extractor)super__init__r   	framework
ValueErrorr   model	__class__r   valuesr   from_pretrainedDEFAULT_VOCODER_IDtodevicer   config__dict__getupdateto_dictgetattrr   hasattr	processorr   )
selfr   r   r   argskwargsr%   
gen_configsampling_rate_namer   s
            x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.pyr   zTextToAudioPipeline.__init__a   s   $)&))) )>T!!TUUU:#H#O#Q#QQQ ?  /0BCCFFtzGXYYY L +<#!%!4!BD% Z&F,001DdKKJ%j0022333&F ; ;" '0BD I I ,)6D&&V^T::F$+F,?ASUY$Z$ZM$0-:* %d.?%GDN\oDpDp%!%!A!OD &%%%%%    c                    t          |t                    r|g}| j        j        j        dk    r=| j        j                            dd          ddddd}|                    |           |}| j	        r| j
        n| j        } ||fi |dd	i}|S )
Nbarkmax_input_semantic_lengthr   FT
max_length)r7   add_special_tokensreturn_attention_maskreturn_token_type_idspaddingreturn_tensorspt)
isinstancestrr   r%   
model_typer   semantic_configr'   r(   r   	tokenizerr,   )r-   textr/   
new_kwargspreprocessoroutputs         r2   
preprocesszTextToAudioPipeline.preprocess   s    dC   	6D:'611 #4DHHIdfijj&+)-).' J f%%%F)-):Nt~~dBBfBBTBBBr3   c                     |                      || j                  }|d         }|d         }| j                                        rT|                      || j                  }d|vr
| j        |d<   |                    |            | j        j        di ||}nHt          |          r$t          d|	                                            | j        di ||d         }| j
        | 
                    |          }|S )N)r$   forward_paramsgenerate_kwargsr   zYou're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. For reference, the `generate_kwargs` used here are: r    )_ensure_tensor_on_devicer$   r   can_generater   r(   generatelenr   keysr   )r-   model_inputsr/   rI   rJ   rF   s         r2   _forwardzTextToAudioPipeline._forward   s;   ..vdk.JJ 01 !23:""$$ 	E";;OTXT_;``O #/997;7M 34 !!/222(TZ(JJ<J>JJFF?##  dKZK_K_KaKad d  
  TZAA,A.AA!DF<#\\&))Fr3   text_inputsrI   returnc                     d S NrK   r-   rS   rI   s      r2   __call__zTextToAudioPipeline.__call__   s    SVSVr3   c                     d S rV   rK   rW   s      r2   rX   zTextToAudioPipeline.__call__   s    _b_br3   c                 8     t                      j        |fi |S )a  
        Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

        Args:
            text_inputs (`str` or `list[str]`):
                The text(s) to generate.
            forward_params (`dict`, *optional*):
                Parameters passed to the model generation/forward method. `forward_params` are always passed to the
                underlying model.
            generate_kwargs (`dict`, *optional*):
                The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
                complete overview of generate, check the [following
                guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
                only passed to the underlying model if the latter is a generative model.

        Return:
            A `dict` or a list of `dict`: The dictionaries have two keys:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
            - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
        )r   rX   )r-   rS   rI   r   s      r2   rX   zTextToAudioPipeline.__call__   s$    0  uww>>~>>>r3   c                     t          | dd           
| j        |d<   t          | dd           | j        |d<   | j        |d<   |r|ni |r|ni d}|i }i }|||fS )Nassistant_modelassistant_tokenizerrB   )rI   rJ   )r*   r\   rB   r]   )r-   preprocess_paramsrI   rJ   paramspostprocess_paramss         r2   _sanitize_parametersz(TextToAudioPipeline._sanitize_parameters   s     4*D11=151EO-.4.55A+/>OK(595MO12 1?FnnB2AIr
 

 $ " &*<<<r3   c                    i }| j         j        j        dk    rd}nd}| j        r?t	          |t
                    r	||         }n;t	          |t                    r	|d         }n|}n| j                            |          }t	          |t                    rd |D             |d<   n6|
                    dt          j                                                  |d<   | j        |d<   |S )	Ncsmaudiowaveformr   c                 t    g | ]5}|                     d t          j                                                  6S )cpur$   dtype)r#   torchfloatnumpy).0els     r2   
<listcomp>z3TextToAudioPipeline.postprocess.<locals>.<listcomp>  s7    #f#f#fWYBEEekE$J$J$P$P$R$R#f#f#fr3   rg   rh   r   )r   r%   r@   r   r>   dicttupler,   decodelistr#   rj   rk   rl   r   )r-   rd   output_dictwaveform_keyre   s        r2   postprocesszTextToAudioPipeline.postprocess   s    :'500"LL%L  		4%&& ! .E5)) ! 8  ~,,U33HeT"" 	X#f#f]e#f#f#fK  #+;;e5;;#O#O#U#U#W#WK '+'9O$r3   )NNN)__name__
__module____qualname____doc___load_processor_pipeline_calls_generate_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr   rG   rR   r   r?   r   rp   rX   rs   r   ra   rv   __classcell__)r   s   @r2   r   r      s       2 2j O#O!#O "2!1" " " '+$T (P (P (P (P (P (P (PT  0  B VCV3V4S>VVV XVbDIbbdSVX[S[nI]bbb Xb? d3i0?	tCH~tDcN33	4? ? ? ? ? ?8 	= = = =.      r3   r   )typingr   r   r   
generationr   utilsr   baser
   rj   models.auto.modeling_autor   !models.speecht5.modeling_speecht5r   r"   r   rK   r3   r2   <module>r      s    ( ' ' ' ' ' ' ' ' ' ) ) ) ) ) ) & & & & & &        DLLLQQQQQQCCCCCC1 w w w w w( w w w w wr3   