
     `i5=                         d Z ddlZddlZddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ d
dlmZ  ej        e          Z G d de
          ZdgZdS )z
Processor class for Bark
    N)Optional   )BatchFeature)ProcessorMixin)BatchEncoding)logging)cached_file   )AutoTokenizerc                        e Zd ZdZdZdgZddddZd fd	Ze	 dd
            Z		 	 	 dde
f fdZddee         fdZddee         fdZedefd            Zdde
fdZ	 	 	 	 	 	 	 ddefdZ xZS )BarkProcessora	  
    Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor.

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`].
        speaker_embeddings (`dict[dict[str]]`, *optional*):
            Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g
            `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`
            embeddings. The values correspond to the path of the corresponding `np.ndarray`. See
            [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for
            a list of `voice_preset_names`.

    r   	tokenizer   r
   semantic_promptcoarse_promptfine_promptNc                 X    t                                          |           || _        d S N)super__init__speaker_embeddings)selfr   r   	__class__s      |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/bark/processing_bark.pyr   zBarkProcessor.__init__=   s)    ###"4    speaker_embeddings_path.jsonc                    |9t          |||                    dd          |                    dd          |                    dd          |                    dd          |                    dd          |                    dd          |                    d	d          |                    d
d          ddd          }|?t                              dt          j                            ||           d           d}n>t          |          5 }t          j	        |          }ddd           n# 1 swxY w Y   nd}|	d|v r||d<   t          j        |fi |} | ||          S )a  
        Instantiate a Bark processor associated with a pretrained model.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on
                  huggingface.co.
                - a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`]
                  method, e.g., `./my_model_directory/`.
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file containing the speaker_embeddings dictionary located in
                `pretrained_model_name_or_path`. If `None`, no speaker_embeddings is loaded.
            **kwargs
                Additional keyword arguments passed along to both
                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
        N	subfolder	cache_dirforce_downloadFproxiesresume_downloadlocal_files_onlyuse_auth_tokenrevisionr   r    r!   r"   r#   r$   tokenr&    _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors`z` does not exists
                    , no preloaded speaker embeddings will be used - Make sure to provide a correct path to the json
                    dictionary if wanted, otherwise set `speaker_embeddings_dict_path=None`.repo_or_path)r   r   )r	   poploggerwarningospathjoinopenjsonloadr   from_pretrained)cls!pretrained_processor_name_or_pathspeaker_embeddings_dict_pathkwargsspeaker_embeddings_pathr   speaker_embeddings_jsonr   s           r   r7   zBarkProcessor.from_pretrainedB   s   . (3&11, **[$77 **[$77%zz*:EBB

9d33 &

+<d C C!',>!F!Fjj!1488J55166;8=' ' '# '.`"',,'HJfgg ` ` `  
 &*""122 L6M)-3J)K)K&L L L L L L L L L L L L L L L "&)!3335V">2!12S^^W]^^	sY;MNNNNs   D11D58D5r   Fpush_to_hubc                    | j         =t          j        t          j                            ||d          d           i }||d<   | j        D ]}|                     |          }i }	| j         |         D ]r}
t          j        t          j                            |d         || d|
           ||
         d           t          j                            || d|
 d	          |	|
<   s|	||<   t          t          j                            ||          d
          5 }t          j        ||           ddd           n# 1 swxY w Y    t                      j        ||fi | dS )a|  
        Saves the attributes of this processor (tokenizer...) in the specified directory so that it can be reloaded
        using the [`~BarkProcessor.from_pretrained`] method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the tokenizer files and the speaker embeddings will be saved (directory will be created
                if it does not exist).
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file that will contains the speaker_embeddings nested path dictionary, if it
                exists, and that will be located in `pretrained_model_name_or_path/speaker_embeddings_directory`.
            speaker_embeddings_directory (`str`, *optional*, defaults to `"speaker_embeddings/"`):
                The name of the folder in which the speaker_embeddings arrays will be saved.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs:
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        Nv2T)exist_okr-   _F)allow_picklez.npyw)r   r1   makedirsr2   r3   available_voice_presets_load_voice_presetnpsaver4   r5   dumpr   save_pretrained)r   save_directoryr:   speaker_embeddings_directoryr>   r;   embeddings_dict
prompt_keyvoice_presettmp_dictkeyfpr   s               r   rK   zBarkProcessor.save_pretrained}   s   8 ".K^5QSWXXcghhhh O.<ON+": 7 7
#66zBB2:> j jCG+N;=Y^h[p[pkn[p[p  %S)%*    %'GLL1MR\OhOh_bOhOhOh$i$iHSMM.6
++bgll>3OPPRUVV /Z\	/2.../ / / / / / / / / / / / / / / 	 FFvFFFFFs   D<<E E rP   c                    | j         |         }i }dD ]d}||vrt          d| d| d          t          | j                             dd          ||         |                    dd           |                    dd           |                    d	d
          |                    dd           |                    dd           |                    dd
          |                    dd           |                    dd           d
d
d
          }|St          dt
          j                            | j                             dd          ||                    d| d          t          j	        |          ||<   f|S )Nr   #Voice preset unrecognized, missing z% as a key in self.speaker_embeddings[z].r-   /r   r    r!   Fr"   r#   r$   r%   r&   r'   r,   z{` does not exists
                    , no preloaded voice preset will be used - Make sure to provide correct paths to the z 
                    embeddings.)
r   
ValueErrorr	   getr.   r1   r2   r3   rH   r6   )r   rP   r;   voice_preset_pathsvoice_preset_dictrR   r2   s          r   rG   z BarkProcessor._load_voice_preset   s   !4\BF 	3 	3C,,, t#ttdpttt   '++NC@@"3' **[$77 **[$77%zz*:EBB

9d33 &

+<d C C!',>!F!Fjj!1488J55166;8=  D | #"',,t'>'B'B>SV'W'WYkloYpqq # #jv# # #   &(WT]]c""  r   c           	         dD ]}||vrt          d| d          t          ||         t          j                  s-t	          | dt          | j        |                    d          t          ||         j                  | j        |         k    r-t          | dt          | j        |                    d          d S )Nr   rU   z
 as a key.z voice preset must be a z
D ndarray.)	rW   
isinstancerH   ndarray	TypeErrorstrpreset_shapelenshape)r   rP   rR   s      r   _validate_voice_preset_dictz)BarkProcessor._validate_voice_preset_dict   s    F 	j 	jC,&& !Vs!V!V!VWWWl3/<< i3 g gDDUVYDZ@[@[ g g ghhh<$*++t/@/EEE C!h!hTEVWZE[A\A\!h!h!hiii F	j 	jr   returnc                     | j         g S t          | j                                                   }d|v r|                    d           |S )z
        Returns a list of available voice presets.

        Returns:
            `list[str]`: A list of voice preset names.
        Nr-   )r   listkeysremove)r   voice_presetss     r   rF   z%BarkProcessor.available_voice_presets   sS     "*IT499;;<<]**  000r   Tremove_unavailablec                 T   g }| j         | j        D ]S}	 |                     |          }n%# t          $ r |                    |           Y :w xY w|                     |           T|r.t                              dt          |           d| d           |r|D ]}| j         |= d S d S d S )NzThe following z' speaker embeddings are not available: zU If you would like to use them, please check the paths or try downloading them again.)	r   rF   rG   rW   appendrc   r/   r0   ra   )r   rj   unavailable_keysrP   rZ   s        r   _verify_speaker_embeddingsz(BarkProcessor._verify_speaker_embeddings   s"   ". $ < D D(,(?(?(M(M%%!   $++L999H 001BCCCC kS)9%:%: k kcs k k k  
 " >$4 > >L/==% /. > >> >s   *AApt   c           
         |t          |t                    s~t          |t                    r&| j        || j        v r|                     |          }nCt          |t                    r|                    d          s|dz   }t          j        |          }| | j        |fi | t          ||          } | j
        |f|d||||d|}	|||	d<   |	S )a  
        Main method to prepare for the model one or several sequences(s). This method forwards the `text` and `kwargs`
        arguments to the AutoTokenizer's [`~AutoTokenizer.__call__`] to encode the text. The method also proposes a
        voice preset which is a dictionary of arrays that conditions `Bark`'s output. `kwargs` arguments are forwarded
        to the tokenizer and to `cached_file` method if `voice_preset` is a valid filename.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            voice_preset (`str`, `dict[np.ndarray]`):
                The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g
                `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or
                it can be a valid file name of a local `.npz` single voice preset containing the keys
                `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] object containing the output of the `tokenizer`.
            If a voice preset is provided, the returned object will include a `"history_prompt"` key
            containing a [`BatchFeature`], i.e the voice preset with the right tensors type.
        Nz.npz)datatensor_type
max_length)return_tensorspaddingrt   return_attention_maskreturn_token_type_idsadd_special_tokenshistory_prompt)r\   dictr_   r   rG   endswithrH   r6   rc   r   r   )
r   textrP   ru   rt   ry   rw   rx   r;   encoded_texts
             r   __call__zBarkProcessor.__call__  s"   L #J|T,J,J#<--5+7 D$;;;#66|DD lC00 99N9Nv9V9V 9#/&#8L!w|44#,D,\DDVDDD'\~VVVL%t~	
) !"7"71	
 	
 	
 	
 #-9L)*r   r   )r   )r   r   F)T)NNro   rp   FTF)__name__
__module____qualname____doc__tokenizer_class
attributesr`   r   classmethodr7   boolrK   r   r_   rG   r{   rc   propertyrf   rF   rn   r   r   __classcell__)r   s   @r   r   r   $   s         &OJ  L5 5 5 5 5 5
 Mk8O 8O 8O [8Oz &D%9!6G 6G
 6G 6G 6G 6G 6G 6Gp"! "!x} "! "! "! "!H	j 	j 	j 	j 	j 	j     X> >T > > > >2  "#F F 
F F F F F F F Fr   r   )r   r5   r1   typingr   numpyrH   feature_extraction_utilsr   processing_utilsr   tokenization_utils_baser   utilsr   	utils.hubr	   autor   
get_loggerr   r/   r   __all__ r   r   <module>r      s     				           4 4 4 4 4 4 . . . . . . 4 4 4 4 4 4       $ $ $ $ $ $             
	H	%	%m m m m mN m m m`	 
r   