
    .`ix3                     f    d Z ddlZddlmZ ddlmZ  ej        e          Z	 G d de          Z
dS )zNemotronH model configuration    N)PretrainedConfig)loggingc            3            e Zd ZdZdZdgZdddddd	d
ddddddddddddddddddddddddddddd ed          fdddddddddddddddf3 fd 	Zed!             Z	 xZ
S )"NemotronHConfigaz  
    This is the configuration class to store the configuration of a
    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
    to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to
    that of the NemotronH-v0.1 model.
    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the NemotronH model. Defines the number of
            different tokens that can be represented by the `inputs_ids`
            passed when calling [`NemotronHModel`]
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be
            tied. Note that this is only relevant if the model has an output
            word embedding layer.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 52):
            Number of hidden layers in the Transformer encoder.
        hybrid_override_pattern (`str`, *optional*, defaults to
            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
            The pattern of the hybrid model. The pattern is a string of
            characters where each character represents
            M: Mamba2, *: Attention, -: MLP
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        attention_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA) otherwise GQA is used.
        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
            The non-linear activation function in the MLP layers.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in attention layers.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in MLP layers.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
            Whether or not residuals should be in `float32`. If set to `False`
            residuals will keep the same `dtype` as the rest of the model.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
            Number of prompt logits to calculate during generation. If `None`,
            all logits will be calculated. If an integer value, only last
            `num_logits_to_keep` logits will be calculated.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        sliding_window (`int`, *optional*, defaults to None):
            Sliding window attention window size.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the hidden states.
        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
            Flag indicating whether or not to use the fast mamba kernels.
            These are available only if `mamba-ssm` and `causal-conv1d`
            are installed, and the mamba modules are running on a CUDA device.
        ssm_state_size (`int`, *optional*, defaults to 128):
            The dimension of the mamba state space latents.
        mamba_num_heads (`int`, *optional*, defaults to 128):
            Number of heads in Mamba layers.
        mamba_n_groups (`int`, *optional*, defaults to 8):
            Number of groups in Mamba layers.
        mamba_head_dim (`int`, *optional*, defaults to 64):
            Dimension of each Mamba head.
        mamba_d_conv (`int`, *optional*, defaults to 4):
            The size of the mamba convolution kernel.
        mamba_expand (`int`, *optional*, defaults to 2):
            Expanding factor used to determine the mamba intermediate size.
        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
            The non-linear activation function in the Mamba layers.
        mamba_dt_min (`float`, *optional*, defaults to 0.001):
            Minimum value for the time step in Mamba.
        mamba_dt_max (`float`, *optional*, defaults to 0.1):
            Maximum value for the time step in Mamba.
        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
            Limits for the time step in Mamba.
        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
            Floor value for time step initialization in Mamba.
        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the convolution layer of the mamba mixer
            block.
        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the input and output projections of the
            mamba mixer block.
        mamba_chunk_size (`int`, *optional*, defaults to 256):
            Size of chunks for Mamba processing.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
            Whether to rescale the pre-normalization residual connections.
    
nemotron_hpast_key_valuesi   Fi   i T  4   z4M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-          relu2g{Gz?gh㈵>T   r      Ng        @      silugMbP?g?infg-C6?   i  g      ?c4                    || _         || _        || _        || _        || _        || _        || _        || _        || _        || _	        || _
        || _        t          | j                  | j        k    s
J d            t          j        d| j                  s
J d            |	|}	|	| _        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        | | _        |!| _         |"| _!        |#| _"        |$| _#        |%| _$        |&| _%        |'| _&        |(| _'        |)| _(        |*| _)        |+| _*        |,| _+        |-| _,        |.| _-        |/| _.        |0| _/        |1| _0        |2| _1        |3| _2         tg                      j4        d||||d|4 d S )NzBhybrid_override_pattern must have same length as num_hidden_layersz^[*-M]+$zEhybrid_override_pattern must only contain characters 'M', '*', or '-')pad_token_idbos_token_ideos_token_idtie_word_embeddings )5
vocab_sizer   hidden_sizeintermediate_sizenum_hidden_layershybrid_override_patternnum_attention_headshead_dimsliding_windowmax_position_embeddingsattention_dropouthidden_dropoutlenrematchnum_key_value_headsmlp_hidden_actattention_biasmlp_biasuse_biasinitializer_rangelayer_norm_epsilonresidual_in_fp32	use_cachenum_logits_to_keepuse_mamba_kernelsn_groupsmamba_head_dimssm_state_sizemamba_num_headsconv_kernelexpandmamba_hidden_acttime_step_mintime_step_maxtime_step_limittime_step_flooruse_conv_biasmamba_proj_bias
chunk_sizerescale_prenorm_residualn_routed_expertsn_shared_expertsmoe_intermediate_size#moe_shared_expert_intermediate_sizemoe_latent_sizenum_experts_per_tokrouted_scaling_factorn_group
topk_groupnorm_topk_probsuper__init__)6selfr   r   r   r   r   r   r    r!   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r   r   r   r"   r#   r$   r%   r3   r6   r7   mamba_n_groupsr5   mamba_d_convmamba_expandr:   mamba_dt_minmamba_dt_maxmamba_dt_limitmamba_dt_init_floormamba_conv_biasr@   mamba_chunk_sizerB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   kwargs	__class__s6                                                        ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/transformers_utils/configs/nemotron_h.pyrN   zNemotronHConfig.__init__   s*   n %#6 &!2!2'>$#6  ,'>$!2, 4/00D4JJJJP KJJ xT%ABB 	
 	
S	
 	
B
 &"5#6 ,,  !2"4 0""4!2&,,.'" 0))-2,.*(@% 0 0%:"3V0.#6 %:"$, 	
%%% 3		
 	

 	
 	
 	
 	
 	
    c                 D      fdt           j                  D             S )Nc                     g | ]=}j         |         d k    rdn'j         |         dk    rdnj         |         dk    rdnd>S )Mmamba*	attention-mlpmoe)r   ).0irO   s     r[   
<listcomp>z5NemotronHConfig.layers_block_type.<locals>.<listcomp>  s}     	
 	
 	
  +A.#55 G +A.#55  +A.#55 	
 	
 	
r\   )ranger   )rO   s   `r[   layers_block_typez!NemotronHConfig.layers_block_type  s;    	
 	
 	
 	
 4122	
 	
 	
 		
r\   )__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencefloatrN   propertyrj   __classcell__)rZ   s   @r[   r   r      s%       p pd J#4"5 ! V $UU5\\* !%",0!i~
 ~
 ~
 ~
 ~
 ~
@ 

 

 X

 

 

 

 

r\   r   )rn   regexr'    transformers.configuration_utilsr   transformers.utilsr   
get_loggerrk   loggerr   r   r\   r[   <module>ry      s   $ $ #     = = = = = = & & & & & &		H	%	%A
 A
 A
 A
 A
& A
 A
 A
 A
 A
r\   