
    .`iP                     @    d dl mZ d dlZ G d dej                  ZdS )    )AnyNc                        e Zd ZU dZej        ed<   dZdZdZ		 	 	 	 	 	 	 	 	 	 	 	 dde
eef         dz  de
eef         dz  dedz  dedz  dedededededededef fdZ fdZedej        fd            Z xZS )UltravoxConfiga  
    This is the configuration class to store the configuration of a
    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
    Ultravox model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom audio config or dict.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone.
        audio_model_id (`str`, *optional*):
            The model ID of the audio backbone.
        text_model_id (`str`, *optional*):
            The model ID of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        projector_ln_mid (`bool`, *optional*, defaults to `False`):
            Whether to apply layer normalization at the middle of the
            projector or at the end. Versions v0.4.1 and below
            use `False`, but v0.5 and above use `True`.
    wrapped_model_configultravoxz	<|audio|>FN }        皙?swiglur   audio_configtext_configaudio_model_idtext_model_idignore_indexaudio_token_indexhidden_sizestack_factor	norm_initprojector_actprojector_ln_midnum_projector_layersc                    || _         || _        || _        || _        |	| _        |
| _        || _        || _        || _        |4|pi }t          j
        |                    dd                   di || _        || _        |;d | _        |pi }t          j
        |                    dd                   di || _         t                      j        di | d S )N
model_typellamawhisper )r   r   r   r   r   r   r   r   r   transformersCONFIG_MAPPINGgetr   r   r   super__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                 |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/transformers_utils/configs/ultravox.pyr#   zUltravoxConfig.__init__3   s     )!2&("* 0$8! + %+K(4(Cg66) ) )) )D%
 -!"&D'-2L , ;  y99! ! !! !D 	""6"""""    c                     |dk    r|ddl m}  ||d          | _        n |dk    r|ddl m}  ||d          | _        t	                                          ||          S )Nr   r   )
get_configF)trust_remote_coder   )vllm.transformers_utils.configr*   r   r   r"   __setattr__)r$   keyvaluer*   r&   s       r'   r-   zUltravoxConfig.__setattr__`   s     /!!e&7AAAAAA(2
5E(R(R(RD%%$$$):AAAAAA *
5E J J JDww""3...r(   returnc                 4    | j                                         S )N)r   get_text_config)r$   s    r'   r   zUltravoxConfig.text_configs   s    
 (88:::r(   )NNNNr   r	   r
   r   r   r   Fr   )__name__
__module____qualname____doc__r   PretrainedConfig__annotations__r   audio_tokenis_compositiondictstrr   intfloatboolr#   r-   propertyr   __classcell__)r&   s   @r'   r   r   
   sq        ! !F '7777JKN /3-1%)$( !&%!&$%+# +#38nt++# #s(^d*+# d
	+#
 Tz+# +# +# +# +# +# +# +# "+# +# +# +# +# +#Z/ / / / /& ;\: ; ; ; X; ; ; ; ;r(   r   )typingr   r   r7   r   r   r(   r'   <module>rC      sf   
          n; n; n; n; n;\2 n; n; n; n; n;r(   