
     `ik:                         d Z ddlZddlmZ ddlmZ ddlmZ  ej        e	          Z
 G d de          Z G d	 d
e          Z G d de          Z G d de          ZdS )z#BARK model generation configuration    N)Optional   )GenerationConfig)loggingc                   D     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )BarkSemanticGenerationConfigsemantic'  T   F      ?@'  ; ?    33333H@Nc                      t                      j        d||	|||||||d	| |
| _        || _        || _        || _        || _        || _        || _        || _	        dS )a  Class that holds a generation configuration for [`BarkSemanticModel`].

        This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the
        documentation from [`GenerationConfig`] for more information.

        Args:
            eos_token_id (`int`, *optional*, defaults to 10_000):
                The id of the *end-of-sequence* token.
            renormalize_logits (`bool`, *optional*, defaults to `True`):
                Whether to renormalize the logits after applying all the logits processors (including the
                custom ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the
                score logits are normalized but some logit processors break the normalization.
            max_new_tokens (`int`, *optional*, defaults to 768):
                The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            temperature (`float`, *optional*, defaults to 1.0):
                The value used to modulate the next token probabilities.
            do_sample (`bool`, *optional*, defaults to `False`):
                Whether or not to use sampling ; use greedy decoding otherwise.
            text_encoding_offset (`int`, *optional*, defaults to 10_048):
                Text encoding offset.
            text_pad_token (`int`, *optional*, defaults to 129_595):
                Text pad token.
            semantic_infer_token (`int`, *optional*, defaults to 129_599):
                Semantic infer token.
            semantic_vocab_size (`int`, *optional*, defaults to 10_000):
                Semantic vocab size.
            max_input_semantic_length (`int`, *optional*, defaults to 256):
                Max length of semantic input vector.
            semantic_rate_hz (`float`, *optional*, defaults to 49.9):
                Semantic rate in Hertz.
            min_eos_p (`float`, *optional*):
                Minimum threshold of the probability of the EOS token for it to be sampled. This is an early stopping
                strategy to mitigate potential unwanted generations at the end of a prompt. The original implementation
                suggests a default value of 0.2.
        )	temperature	do_sampleeos_token_idrenormalize_logitsmax_new_tokensoutput_scoresreturn_dict_in_generateoutput_hidden_statesoutput_attentionsN )
super__init__text_encoding_offsettext_pad_tokensemantic_pad_tokensemantic_infer_tokensemantic_vocab_sizemax_input_semantic_lengthsemantic_rate_hz	min_eos_p)selfr   r   r   r   r   r   r   r   r   r   r    r"   r#   r$   r%   r&   kwargs	__class__s                     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/bark/generation_configuration_bark.pyr   z%BarkSemanticGenerationConfig.__init__   s    B 	 	
#%1)'$;!5/	
 	
 	
 	
 	
 %9!,".$8!#6 )B& 0"    )r
   Tr   FFFFr   Fr   r   r   r
   r   r   N)__name__
__module____qualname__
model_typer   __classcell__r)   s   @r*   r   r      s~        J  %"#$""%#U# U# U# U# U# U# U# U# U# U#r+   r   c                   J     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedef fdZ xZS )BarkCoarseGenerationConfigcoarse_acousticsTFr   /  K      /  r   v  <   max_coarse_historysliding_window_lenc                      t                      j        d|||||||d| || _        |	| _        |
| _        || _        || _        || _        || _        dS )as
  Class that holds a generation configuration for [`BarkCoarseModel`].

        This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the
        documentation from [`GenerationConfig`] for more information.

        Args:
            renormalize_logits (`bool`, *optional*, defaults to `True`):
                Whether to renormalize the logits after applying all the logits processors (including the
                custom ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the
                score logits are normalized but some logit processors break the normalization.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            temperature (`float`, *optional*, defaults to 1.0):
                The value used to modulate the next token probabilities.
            do_sample (`bool`, *optional*, defaults to `False`):
                Whether or not to use sampling ; use greedy decoding otherwise.
            coarse_semantic_pad_token (`int`, *optional*, defaults to 12_048):
                Coarse semantic pad token.
            coarse_rate_hz (`int`, *optional*, defaults to 75):
                Coarse rate in Hertz.
            n_coarse_codebooks (`int`, *optional*, defaults to 2):
                Number of coarse codebooks.
            coarse_infer_token (`int`, *optional*, defaults to 12_050):
                Coarse infer token.
            max_coarse_input_length (`int`, *optional*, defaults to 256):
                Max length of input coarse vector.
            max_coarse_history (`int`, *optional*, defaults to 630):
                Max length of the output of the coarse acoustics model used in the fine generation step.
            sliding_window_len (`int`, *optional*, defaults to 60):
                The coarse generation step uses a sliding window to generate raw audio.
        )r   r   r   r   r   r   r   Nr   )	r   r   coarse_semantic_pad_tokencoarse_rate_hzn_coarse_codebookscoarse_infer_tokenmax_coarse_input_lengthr;   r<   )r'   r   r   r   r   r   r   r   r>   r?   r@   rA   rB   r;   r<   r(   r)   s                   r*   r   z#BarkCoarseGenerationConfig.__init__y   s    r 	 		
#1'$;!5/		
 		
 		
 		
 		
 *C&,"4"4'>$"4"4r+   )TFFFFr   Fr5   r6   r7   r8   r   r9   r:   )r,   r-   r.   r/   intr   r0   r1   s   @r*   r3   r3   v   s        #J   %""(! #"%"$J5 J5  J5  J5 J5 J5 J5 J5 J5 J5 J5 J5 J5r+   r3   c                   2     e Zd ZdZ	 	 	 	 d fd	Zd Z xZS )	BarkFineGenerationConfigfine_acousticsr            c                 v    t                                          |           || _        || _        || _        dS )a  Class that holds a generation configuration for [`BarkFineModel`].

        [`BarkFineModel`] is an autoencoder model, so should not usually be used for generation. However, under the
        hood, it uses `temperature` when used by [`BarkModel`]

        This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the
        documentation from [`GenerationConfig`] for more information.

        Args:
            temperature (`float`, *optional*):
                The value used to modulate the next token probabilities.
            max_fine_history_length (`int`, *optional*, defaults to 512):
                Max length of the fine history vector.
            max_fine_input_length (`int`, *optional*, defaults to 1024):
                Max length of fine input vector.
            n_fine_codebooks (`int`, *optional*, defaults to 8):
                Number of codebooks used.
        )r   N)r   r   max_fine_history_lengthmax_fine_input_lengthn_fine_codebooks)r'   r   rK   rL   rM   r(   r)   s         r*   r   z!BarkFineGenerationConfig.__init__   s>    4 	[111'>$%:" 0r+   c                     dS )z
        Overrides GenerationConfig.validate because BarkFineGenerationConfig don't use any parameters outside
        temperature.
        Nr   )r'   r(   s     r*   validatez!BarkFineGenerationConfig.validate   s	    
 	r+   )r   rG   rH   rI   )r,   r-   r.   r/   r   rO   r0   r1   s   @r*   rE   rE      s`        !J  #"1 1 1 1 1 1@      r+   rE   c                       e Zd ZdZ	 	 	 	 	 ddee         dee         dee         fdZedede	de
fd	            Zd
 ZdS )BarkGenerationConfigbarkN]  rH   semantic_configcoarse_acoustics_configfine_acoustics_configc                 <   |i }t                               d           |i }t                               d           |i }t                               d           t          di || _        t	          di || _        t          di || _        || _        || _	        dS )a$  Class that holds a generation configuration for [`BarkModel`].

        The [`BarkModel`] does not have a `generate` method, but uses this class to generate speeches with a nested
        [`BarkGenerationConfig`] which uses [`BarkSemanticGenerationConfig`], [`BarkCoarseGenerationConfig`],
        [`BarkFineGenerationConfig`].

        This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the
        documentation from [`GenerationConfig`] for more information.

        Args:
            semantic_config (`Dict`, *optional*):
                Semantic generation configuration.
            coarse_acoustics_config (`Dict`, *optional*):
                Coarse generation configuration.
            fine_acoustics_config (`Dict`, *optional*):
                Fine generation configuration.
            sample_rate (`int`, *optional*, defaults to 24_000):
                Sample rate.
            codebook_size (`int`, *optional*, defaults to 1024):
                Vector length for each codebook.
        NzMsemantic_config is None. initializing the semantic model with default values.zScoarse_acoustics_config is None. initializing the coarse model with default values.zOfine_acoustics_config is None. initializing the fine model with default values.r   )
loggerinfor   rT   r3   rU   rE   rV   sample_ratecodebook_size)r'   rT   rU   rV   rZ   r[   r(   s          r*   r   zBarkGenerationConfig.__init__   s    < " OKKghhh"*&(#KKmnnn ($&!KKijjj;NNoNN'A'\'\D['\'\$%=%V%V@U%V%V"&*r+   c                      | d|                                 |                                 |                                 d|S )z
        Instantiate a [`BarkGenerationConfig`] (or a derived class) from bark sub-models generation configuration.

        Returns:
            [`BarkGenerationConfig`]: An instance of a configuration object
        )rT   rU   rV   r   )to_dict)clsrT   rU   rV   r(   s        r*   from_sub_model_configsz+BarkGenerationConfig.from_sub_model_configs'  sY     s 
+3355$;$C$C$E$E"7"?"?"A"A
 
 	
 
 	
r+   c                     t          j        | j                  }| j                                        |d<   | j                                        |d<   | j                                        |d<   | j        j        |d<   |S )z
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        rT   rU   rV   r/   )	copydeepcopy__dict__rT   r]   rU   rV   r)   r/   )r'   outputs     r*   r]   zBarkGenerationConfig.to_dict<  sw     t}--$($8$@$@$B$B !,0,H,P,P,R,R()*.*D*L*L*N*N&'#~8|r+   )NNNrS   rH   )r,   r-   r.   r/   r   dictr   classmethodr   r3   rE   r_   r]   r   r+   r*   rQ   rQ      s        J +/2604/+ /+!$/+ "*$/+  (~	/+ /+ /+ /+b 
5
 "<
  8	
 
 
 [
(    r+   rQ   )__doc__ra   typingr   generation.configuration_utilsr   utilsr   
get_loggerr,   rX   r   r3   rE   rQ   r   r+   r*   <module>rl      s3   * )        > > > > > >       
	H	%	%X# X# X# X# X##3 X# X# X#vM5 M5 M5 M5 M5!1 M5 M5 M5`( ( ( ( (/ ( ( (VY Y Y Y Y+ Y Y Y Y Yr+   