
    .`i(                     ^    d Z ddlmZ ddlmZ  ej        e          Z G d de          ZdS )zJAIS configuration    )PretrainedConfig)loggingc                   t     e Zd ZdZdZdgZdddddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Zd Z xZ	S )
JAISConfigaw  
    This is the configuration class to store the configuration of a
    [`JAISModel`]. It is used to instantiate a JAIS model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the JAIS model. Defines the number of different
            tokens that can be represented by the
            `inputs_ids` passed when calling [`JAISModel`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used
            with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set
            it to 4 times n_embd
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list
            `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in
            the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size)..
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, default `True`):
            Whether to additionally scale attention weights
            by `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention
            (dot-product)
            and upcast attention dot-product/softmax to float() when training
            with mixed precision.
        position_embedding_type (`str`, *optional*, defaults to `"learned"`):
            Positional embedding can be either `"alibi"` or `"learned"`.
        mup_width_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale learning rate and initializers. Calculated
            as (`d_model,0 / d_model`), where
            `d_model` is the model's width and `d_model,0` is the proxy
            model's width.
        mup_embeddings_scale (`float`, *optional*, defaults to 1.0):
            muP parameter to scale token and position embeddings.
        mup_output_alpha (`float`, *optional*, defaults to 1.0):
            muP parameter to scale output logits
            (`output_logits_scale = mup_output_alpha * mup_width_scale`).
        mup_scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
            Scale attention weights by dividing by hidden_size instead of
            sqrt(hidden_size). Need to set scale_attn_weights to `True` as
            well.
        alibi_scaling (`dict`, *optional*):
            Dictionary containing the scaling configuration for ALiBi
            embeddings. Currently only supports linear
            scaling strategy. Can specify either the scaling `factor` (must be
            a float greater than 1) for fixed scaling
            or `train_seq_len` for dynamic scaling on input samples with
            sequence length > `train_seq_len`. The expected
            formats are `{"type": strategy name, "factor": scaling factor}` or
            `{"type": strategy name,
            "train_seq_len": training sequence length}`.
        architectures (`list`, *optional*, defaults to ['JAISLMHeadModel']):
            architecture names for Jais.

    Example:

    ```python
    >>> from transformers import JAISConfig, JAISModel

    >>> # Initializing a JAIS configuration
    >>> configuration = JAISConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = JAISModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```jaispast_key_valuesn_embdn_positionsn_headn_layer)hidden_sizemax_position_embeddingsnum_attention_headsnum_hidden_layersQ           Ngelu_new皙?h㈵>{Gz?TP  Flearned      ?c                    || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |                                  |dg} t3                      j        d|||d| d S )NJAISLMHeadModel)bos_token_ideos_token_idarchitectures )
vocab_sizer
   r	   r   r   n_inneractivation_functionresid_pdrop
embd_pdrop
attn_pdroplayer_norm_epsiloninitializer_rangescale_attn_weights	use_cachescale_attn_by_inverse_layer_idxreorder_and_upcast_attnr   r   position_embedding_typemup_width_scalemup_embeddings_scalemup_output_alphamup_scale_qk_dot_by_dalibi_scaling_alibi_scaling_validationsuper__init__)selfr"   r
   r	   r   r   r#   r$   r%   r&   r'   r(   r)   r*   r+   r   r   r,   r-   r.   r/   r0   r1   r2   r3   r    kwargs	__class__s                              x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/transformers_utils/configs/jais.pyr6   zJAISConfig.__init__   s   : %&#6 &$$"4!2"4"/N,'>$(('>$.$8! 0%:"*&&((( ./M 	
%%'	
 	
 		
 	
 	
 	
 	
    c                 H   | j         dS t          | j         t                    rt          | j                   dk    rt	          d| j                    | j                             dd          }| j                             dd          }| j                             dd          }||dk    rt	          d|           |t          |t                    r||d	k    rt	          d
|           |t          |t                    r||dk    rt	          d|           dS dS )z=
        Validate the `alibi_scaling` configuration.
        N   zm`alibi_scaling` must be a dictionary with two fields, `type` and `factor` or `type` and `train_seq_len`, got typefactortrain_seq_lenlinearz3`alibi_scaling`'s type field must be 'linear', got r   z:`alibi_scaling`'s factor field must be a float > 1.0, got    zD`alibi_scaling`'s `train_seq_len` field must be an integer > 1, got )r3   
isinstancedictlen
ValueErrorgetfloatint)r7   alibi_scaling_typealibi_scaling_factoralibi_dynamic_scalings       r:   r4   z$JAISConfig._alibi_scaling_validation   s    %F$,d33 	s4;M7N7NRS7S7S,), ,  
 "/33FDAA#155hEE $ 2 6 6 M M%);x)G)G,), ,  
 !,3U;; -$05IS5P5P.+. .  
 "-4c:: .%16Kq6P6P<$9< <   216P6Pr;   )r   r   r   r   r   Nr   r   r   r   r   r   TTr   r   FFr   r   r   r   FNN)
__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapr6   r4   __classcell__)r9   s   @r:   r   r      s        c cJ J#4"5#0'&	 M &(- % ) #5A
 A
 A
 A
 A
 A
F& & & & & & &r;   r   N)	rP    transformers.configuration_utilsr   transformers.utilsr   
get_loggerrM   loggerr   r!   r;   r:   <module>rY      s   &   = = = = = = & & & & & &		H	%	%X X X X X! X X X X Xr;   