
     `i                         d Z ddlZddlmZmZ ddlZddlmZm	Z	m
Z
mZ ddlmZ ddlmZ  ej        e          Z G d d	e          Zd	gZdS )
zXcodec model configuration    N)OptionalUnion)
AutoConfig	DacConfigHubertConfigWavLMConfig   )PretrainedConfig)loggingc                       e Zd ZdZdZeedZdddddgddgddgdddd	ddfd
ee	e
                  dedede	e
         de	e         de	e         dededee         de
deeef         deeef         f fdZedefd            Zedefd            Zedefd            Zedefd            Zedefd            Zedefd            Z xZS )XcodecConfiga
  
    This is the configuration class to store the configuration of an [`XcodecModel`]. It is used to instantiate a
    Xcodec model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [Manel/X-Codec](https://huggingface.co/Manel/X-Codec) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        target_bandwidths (`List[float]`, *optional*, defaults to `[0.5, 1, 1.5, 2, 4]`):
            The range of different bandwidths (in kbps) the model can encode audio with.
        sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio waveform should be digitalized, in hertz (Hz).
        kernel_size (`int`, *optional*, defaults to 3):
            Kernel size for the initial semantic convolution.
        channel_ratios (`List[float]`, *optional*, defaults to `[1, 1]`):
            Expansion factors for the number of output channels in each semantic block.
        strides (`List[int]`, *optional*, defaults to `[1, 1]`):
            Strides for each semantic encoder block.
        block_dilations (`List[int]`, *optional*, defaults to `[1, 1]`):
            Dilation factors for the residual units in semantic blocks.
        unit_kernel_size (`int`, *optional*, defaults to 3):
            Kernel size inside each ResidualUnit in semantic blocks.
        codebook_size (`int`, *optional*, defaults to 1024):
            Number of entries in each residual quantizer's codebook.
        codebook_dim (`int`, *optional*):
            Dimensionality of each codebook vector. Defaults to sum of hidden size of acoustic and semantic models.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated normal initializer for all weight matrices.
        acoustic_model_config (`Union[Dict, DacConfig]`, *optional*):
            An instance of the configuration for the acoustic (DAC) model.
        semantic_model_config (`Union[Dict, HubertConfig, WavLMConfig]`, *optional*):
            An instance of the configuration object for the semantic (HuBERT) model.

    Example:

    ```python
    >>> from transformers import XcodecModel, XcodecConfig

    >>> # Initializing configuration
    >>> configuration = XcodecConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = XcodecModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```xcodec)acoustic_model_configsemantic_model_configNi>  r	         g{Gz?target_bandwidthssample_ratekernel_sizechannel_ratiosstridesblock_dilationsunit_kernel_sizecodebook_sizecodebook_diminitializer_ranger   r   c                     t                      j        di | |t          dg ddg dd          | _        nct	          |t
                    rt          di || _        n<t	          |t                    r|| _        nt          dt          |                     |t                      | _	        nt	          |t
                    rPd|v r t          j        |d                   | _	        n}t                              d           t          di || _	        nQt	          |t                    st	          |t                    r|| _	        nt          d	t          |                     |g d
}|| _        || _        || _        || _        || _        || _        || _        || _        |
| _        |	| j        j        | j	        j        z   }	|	| _        d S )N@   )            r      )encoder_hidden_sizedownsampling_ratiosdecoder_hidden_sizeupsampling_ratioshidden_sizezDacoustic_model_config must be a dict or DacConfig instance, but got _name_or_pathz_Could not determine semantic model type from config architecture. Defaulting to `HubertConfig`.zUsemantic_model_config must be a dict, HubertConfig, or WavLMConfig instance, but got )g      ?r   g      ?r"   r!    )super__init__r   r   
isinstancedict
ValueErrortyper   r   r   from_pretrainedloggerwarningr   r   r   r   r   r   r   r   r   r   r(   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                 /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/xcodec/configuration_xcodec.pyr,   zXcodecConfig.__init__Y   sK     	""6""" ()2$& %1LL$(".,,* * *D&& -t44 	)2)K)K5J)K)KD&&-y99 	)>D&&tW[\qWrWrtt   !()5D&&-t44 	"777-7-GH]^mHn-o-o** u   .:-R-R<Q-R-R**-{;; 	zJ_am?n?n 	)>D&& Fhl  nC  iD  iD  F  F   $ 3 3 3!2&&,. 0*!25ADD^DjjL(    returnc                 D    t          j        | j        | j        z            S N)mathceilr   
hop_lengthr4   s    r7   
frame_ratezXcodecConfig.frame_rate   s    y)DO;<<<r8   c                     | j         j        S r;   )r   r(   r?   s    r7   semantic_hidden_sizez!XcodecConfig.semantic_hidden_size   s    )55r8   c                 X    t          t          j        | j        j                            S r;   )intnpprodr   r%   r?   s    r7   r>   zXcodecConfig.hop_length   s     2745IJJKKKr8   c                 X    t          j        t          j        | j                            S r;   )r<   r=   log2r   r?   s    r7   codebook_nbitszXcodecConfig.codebook_nbits   s    y4#566777r8   c                 4    | j         j        | j        j        z   S r;   )r   r(   r   r?   s    r7   r(   zXcodecConfig.hidden_size   s    )58R8^^^r8   c                 \    t          d| j        d         z  | j        | j        z  z            S )Ni  )rD   r   r@   rI   r?   s    r7   num_quantizerszXcodecConfig.num_quantizers   s,    4$0444K^9^_```r8   )__name__
__module____qualname____doc__
model_typer   r   sub_configsr   listfloatrD   r   r.   r   r,   propertyr@   rB   r>   rI   r(   rM   __classcell__)r6   s   @r7   r   r      sM       0 0d J "+!+ K 48 '(!fV&'V !!&*#'8<;?F) F)#DK0F) F) 	F)
 UF) cF) cF) F) F) smF) !F)  %T9_5F)  %T<%78F) F) F) F) F) F)P =C = = = X= 6c 6 6 6 X6 LC L L L XL 8 8 8 8 X8 _S _ _ _ X_ a a a a Xa a a a ar8   r   )rQ   r<   typingr   r   numpyrE   transformersr   r   r   r   configuration_utilsr
   utilsr   
get_loggerrN   r2   r   __all__r*   r8   r7   <module>r_      s    !    " " " " " " " "     I I I I I I I I I I I I 3 3 3 3 3 3       
	H	%	%Xa Xa Xa Xa Xa# Xa Xa Xav 
r8   