
    .`ic                     H   U d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZ erd dlmZmZ  ee          Z G d d          Z G d de          Z G d de          Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d de          Z# G d de          Z$ G d de          Z% G d  d!e%          Z& G d" d#e          Z' G d$ d%e          Z( G d& d'e          Z) G d( d)e          Z* G d* d+e          Z+ G d, d-e          Z, G d. d/e          Z-i d0e(d1ed2ed3ed4e!d5e!d6e"d7e#d8e$d9e%d:e&d;e d<e'd=ed>e)d?e*d@e*e*e,e-dAZ.e/e0e1e         f         e2dB<   dCS )D    )deepcopy)lcm)TYPE_CHECKING)init_logger)ModelRegistry)current_platform)cdivround_up)STR_DTYPE_TO_TORCH_DTYPE)AttentionBackendEnum)FullAttentionSpec	MambaSpecMLAAttentionSpec)ModelConfig
VllmConfigc                   >    e Zd Zed	d            Zed
d            ZdS )VerifyAndUpdateConfigvllm_configr   returnNc                     d S N )r   s    u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/config.pyverify_and_update_configz.VerifyAndUpdateConfig.verify_and_update_config           model_configr   c                     d S r   r   r   s    r   verify_and_update_model_configz4VerifyAndUpdateConfig.verify_and_update_model_config   r   r   r   r   r   Nr   r   r   N)__name__
__module____qualname__staticmethodr   r    r   r   r   r   r      sR           \    \  r   r   c                   &    e Zd Zedd            ZdS )Gemma3TextModelConfigr   r   r   Nc                 .    | j         }|j         |_        d S r   )	hf_configuse_bidirectional_attention	is_causal)r   r*   s     r   r    z4Gemma3TextModelConfig.verify_and_update_model_config    s     *	"+"GG	r   r"   r#   r$   r%   r&   r    r   r   r   r(   r(      s8        H H H \H H Hr   r(   c                   &    e Zd Zedd            ZdS )GteNewModelConfigr   r   r   Nc                     | j         }|j        j        dk    sJ |j        dk    sJ d|_        |j        |j        z  }t          |d|          }||z  |j        d<   ||j        |j        d|_	        d S )N	NewConfiggelugeglurotary_emb_dimpartial_rotary_factor	head_sizemax_positionrope_parameters
r*   	__class__r#   
hidden_acthidden_sizenum_attention_headsgetattrr9   max_position_embeddingsrotary_kwargsr   confighead_dim
rotary_dims       r   r    z0GteNewModelConfig.verify_and_update_model_config'       '(K7777 F****#%)CCV%5x@@
:Dx:O67!":%5 
  
r   r"   r-   r   r   r   r/   r/   &   2        
 
 
 \
 
 
r   r/   c                   &    e Zd Zedd            ZdS )$JambaForSequenceClassificationConfigr   r   r   Nc                 4    | j         }|j        	d|_        d S d S NF)pooler_configuse_activationr   rL   s     r   r    zCJambaForSequenceClassificationConfig.verify_and_update_model_config;   s)    $2'/+0M((( 0/r   r"   r-   r   r   r   rI   rI   :   s2        1 1 1 \1 1 1r   rI   c                   &    e Zd Zedd            ZdS )JinaRobertaModelConfigr   r   r   Nc                    | j         }|j        dk    ro|j        j        dk    sJ |j        |j        z  }|j        }| j        st          |d          }t          |d|          }||z  |j
        d<   |||j
        d|_        d S d S )NrotaryXLMRobertaFlashConfig   r4   r5   r6   )r*   position_embedding_typer;   r#   r=   r>   r@   enforce_eagerr
   r?   r9   rA   )r   rC   rD   r8   rE   s        r   r    z5JinaRobertaModelConfig.verify_and_update_model_configC   s    ')X55#,0GGGGG)V-GGH!9L  - 9'a88 )98DDJ>H8>SF"#:; & ,#)#9$ $F   ! 65r   r"   r-   r   r   r   rP   rP   B   s2           \  r   rP   c                   &    e Zd Zedd            ZdS )LlamaBidirectionalConfigr   r   r   Nc                     ddl m} | j        }d|_        dddd}|                    |j        d           }|t          d|j        d	          || j        _        d S )
Nr   )SequencePoolingTypeFMEANCLSLAST)avgclslastz
pool_type z not supported)	vllm.config.poolerrZ   r*   r,   getpooling
ValueErrorrL   seq_pooling_type)r   rZ   r*   pooling_type_mappooling_types        r   r    z7LlamaBidirectionalConfig.verify_and_update_model_config_   s    :::::: *	#	 <
 <
 (++I,=tDDM)*;MMMNNN6B"333r   r"   r-   r   r   r   rX   rX   ^   s8        C C C \C C Cr   rX   c                   &    e Zd Zedd            ZdS )NomicBertModelConfigr   r   r   Nc                    | j         }|j        j        dk    sJ |j        dv sJ t	          |dd          |_        |j        dk    rd|_        n|j        |_        |j        |j        cxk    r|j	        k    sn J |j	        |_
        |j        J |j        rJ |j        |_        |j        |_        |j        |_        |j        |_        |j        | j        _        |j        | j        _        |j        |j        z  }t	          |dd          }|||j        d	|_        | j        sm| j        f| j        }t;          | j        |          }|                     |          | _        | j        |k    r#t>                               d
|| j                   d S d S | j!        }tE          | j        tF                    r!| j        $                    d| j                  }n| j        }tK          |d          rtM          |d           ||_'        |j        d         |_        tQ          |          df| j        _)        tU          | j+                  }|,                    dd            || _+        |                     |          | _        d S )NNomicBertConfig)swiglur2   rU   roperl   silumax_trained_positionsi   r6   zNomic context extension is disabled. Changing max_model_len from %s to %s. To enable context extension, see: https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.htmlmax_model_lenr9   r@   max_seq_length)-r*   r;   r#   activation_functionr?   rU   r<   mlp_fc1_biasmlp_fc2_biasqkv_proj_biasbiasrotary_emb_scale_baserotary_emb_interleavedlayer_norm_epsilonlayer_norm_epsn_innerintermediate_sizen_embdr=   n_layernum_hidden_layersmodel_arch_configtotal_num_hidden_layersr>   r9   rA   hf_overridesoriginal_max_model_lenrp   minget_and_verify_max_lenloggerwarninghf_text_config
isinstancedictrb   hasattrdelattrr@   floatderived_max_model_len_and_keyr   encoder_configpop)r   rC   rD   ro   max_model_len_beforerp   r   r   s           r   r    z3NomicBertModelConfig.verify_and_update_model_configt   s   '(,=====)-?????)0-v*
 *
& %11 &F & :F"f&9QQQQV=QQQQQQQ*+3330000 & 9#)> #]#)> 5;5G&2$ 	&> %)CC '0G N N "1%5 
  
 );	3; $0#=  :<QRRM)5)L)L* *L& )-AAAw ) .     BA *8N,3T:: ; , 9 = =#\%?! ! !- : ~77 98885JN2-3-ABS-TN* +,,)LL*H &l&ABBN/666*8L')5)L)L* *L&&&r   r"   r-   r   r   r   ri   ri   s   s8        f f f \f f fr   ri   c                   &    e Zd Zedd            ZdS ) Qwen2ForProcessRewardModelConfigr   r   r   Nc                 4    | j         }|j        	d|_        d S d S )NicP )rL   step_tag_idrN   s     r   r    z?Qwen2ForProcessRewardModelConfig.verify_and_update_model_config   s)    $2$,(.M%%% -,r   r"   r-   r   r   r   r   r      s2        / / / \/ / /r   r   c                   &    e Zd Zedd            ZdS )Qwen2ForRewardModelConfigr   r   r   Nc                 4    | j         }|j        	d|_        d S d S rK   )rL   softmaxrN   s     r   r    z8Qwen2ForRewardModelConfig.verify_and_update_model_config   s)    $2 ($)M!!! )(r   r"   r-   r   r   r   r   r      s2        * * * \* * *r   r   c                   &    e Zd Zedd            ZdS )$Qwen3ForSequenceClassificationConfigr   r   r   Nc                     | j         }t          |dd          }|sd S t          |dd           }|t          |          dk    s
J d            |                                }d|_        ||_        d S )Nis_original_qwen3_rerankerFclassifier_from_token   zTry loading the original Qwen3 Reranker?, see: https://github.com/vllm-project/vllm/tree/main/examples/pooling/score/qwen3_reranker_offline.pyfrom_2_way_softmax)r*   r?   lenget_text_configmethodr   )r   rC   r   tokenstext_configs        r   r    zCQwen3ForSequenceClassificationConfig.verify_and_update_model_config   s    '%,0%&
 &
" * 	F!8$??!c&kkQ&6&6&6n '7&66 ,,..1,2)))r   r"   r-   r   r   r   r   r      s2        3 3 3 \3 3 3r   r   c                       e Zd ZdS )&Qwen3VLForSequenceClassificationConfigN)r#   r$   r%   r   r   r   r   r     s        Dr   r   c                   &    e Zd Zedd            ZdS )%JinaVLForSequenceClassificationConfigr   r   r   Nc                 P    | j         }d|_        | j        }|j        	d|_        d S d S )N   g333333@)r*   
num_labelsrL   
logit_bias)r   rC   rL   s      r   r    zDJinaVLForSequenceClassificationConfig.verify_and_update_model_config  s9    '$2#+'+M$$$ ,+r   r"   r-   r   r   r   r   r   
  s2        , , , \, , ,r   r   c                   &    e Zd Zedd            ZdS )SnowflakeGteNewModelConfigr   r   r   Nc                     | j         }|j        j        dk    sJ |j        dk    sJ d|_        |j        |j        z  }t          |d|          }||z  |j        d<   ||j        |j        d|_	        d S )N	GteConfigr2   r3   r4   r5   r6   r:   rB   s       r   r    z9SnowflakeGteNewModelConfig.verify_and_update_model_config  rF   r   r"   r-   r   r   r   r   r     rG   r   r   c                   &    e Zd Zedd            ZdS )GptOssForCausalLMConfigr   r   r   Nc                     | j         }|j        dk    rd|_        | j        }|j        +|j        &d|_        t
                              dd           d S d S d S )N openai_gptossi   z=Overriding max cuda graph capture size to %d for performance.)structured_outputs_configreasoning_parsercompilation_configcudagraph_capture_sizesmax_cudagraph_capture_sizer   info)r   r   r   s      r   r   z0GptOssForCausalLMConfig.verify_and_update_config)  s~    $/$I!$5;;9H%6
 ); 6>"=E<@9KKOQU    	 ?>EEr   r!   r#   r$   r%   r&   r   r   r   r   r   r   (  s2           \  r   r   c                   &    e Zd Zedd            ZdS )MambaModelConfigr   r   r   Nc                 l   |j         }|j        }|j        r|j        dk    r6|j        rdnd|_        t
                              d|j        |j                   |j        dk    r(|j        s!d|_        t
                              d           |j        dk    r'|j        j	        s
J d            |j
        r
J d            t
                              d|j                   |j        |j        |_        d	S d	S |j        dk    r!d|_        t
                              d
           |j        |j        |_        d	S d	S )z
        Enable FULL_AND_PIECEWISE cuda graph mode by default (required
        to get good performance for mamba layers in V1).

        Args:
            vllm_config: vLLM Config
        noneallalignzPMamba cache mode is set to '%s' for %s by default when prefix caching is enabledzHybrid or mamba-based model detected without support for prefix caching with Mamba cache 'all' mode: falling back to 'align' mode.z9Chunked prefill is required for mamba cache mode 'align'.zOMamba cache mode 'align' is currently not compatible with speculative decoding.zWarning: Prefix caching in Mamba cache '%s' mode is currently enabled. Its support for Mamba layers is experimental. Please report any issues you may observe.NzAMamba cache mode is set to 'none' when prefix caching is disabled)r   cache_configenable_prefix_cachingmamba_cache_modesupports_mamba_prefix_cachingr   r   architecturescheduler_configenable_chunked_prefillspeculative_configr   mamba_block_size
block_sizerp   )r_   r   r   r   s       r   r   z)MambaModelConfig.verify_and_update_config@  s    #/"/- 0	K,66)GTEEW - 5 1 -	   -66$B 7 18-4  
 ,77"3J  O J '9  1 9 KK< -   ,40<0G--- 54 ,6606-W   ,40<0J--- 54r   r!   r#   r$   r%   classmethodr   r   r   r   r   r   ?  s8        ;K ;K ;K [;K ;K ;Kr   r   c                   &    e Zd Zedd            ZdS )HybridAttentionMambaModelConfigr   r   r   Nc                    |j         j        }t                              |           |j        }|j         }|j        }|j        }|j        dk    r|j        }nt          |j                 }|j
        rY|j        t          j        k    }|rdnd}	t          d|                    |          |                                |          j        }
nd}	t%          j        d          r6|                                dk    r|j        |j        t          j        k    rd
}	t+          d|                    |          |                                |          j        }
t-          j        |j        |          \  }}t3          |                    |          |                    |          d          j        }|dk    rd	S |j        dk    rQ|p|                                }t=          ||
          }t?          ||	          }|t=          ||          z  }||_        n|	t=          ||	|
z            z  }|j         |j         |k     r"||_         tB          "                    d|           |j        dk    r|j         |_        |j         |
z  }||k    sJ ||k    rd	S |j#        |j#        |k    r/||_#        d||z
  z  |z  }tB          "                    d|           d	S d	S )a~  
        Ensure that page size of attention layers is greater than or
        equal to the mamba layers. If not, automatically set the attention
        block size to ensure that it is. If the attention page size is
        strictly greater than the mamba page size, we pad the mamba page size
        to make them equal.

        Args:
            vllm_config: vLLM Config
        auto   @   r   )r   num_kv_headsr7   dtype   d      N    r   )shapesdtypesr   r   r   zcSetting attention block size to %d tokens to ensure that attention page size is >= mamba page size.r   zkPadding mamba page size by %.2f%% to ensure that mamba page size and attention page size are exactly equal.)$r   r   r   r   attention_configr   parallel_configcache_dtyper   r   use_mlabackendr   CUTLASS_MLAr   get_num_kv_headsget_head_sizepage_size_bytesr   is_device_capability_family
FLASHINFERr   r   resolve_model_clsr   r   !get_mamba_state_shape_from_config!get_mamba_state_dtype_from_configr   get_mamba_chunk_sizer	   r   r   r   r   mamba_page_size_padded)r_   r   r   r   r   r   r   kv_cache_dtypeuse_cutlass_mlakernel_block_alignment_sizeattn_page_size_1_token	model_cls_mamba_page_sizebase_chunk_sizeattn_tokens_per_mamba_state
chunk_sizeattn_block_sizeattn_page_sizemamba_padding_pcts                       r   r   z8HybridAttentionMambaModelConfig.verify_and_update_config  sw    '3D11+>>>&7"/"/%5#v--)/NN5l6NON  	 (,@,LL  2A*H##b'%5)::?KK&4466$	& & &
  #" +-' <SAA
1 ..00C77$,4'/3G3RRR
 /1+%6)::?KK&4466$	& & &
  # %6%%
 
 
	1 $>>{KK>>{KK
 
 
 	 	 aF(E11" /U,2S2S2U2UO*.@V*W*W'_.IJJJ(40KZ+X+XXO,;L)) :D!<?U!U= = O "*l.E.W.W&5L#KKL   (G33,8,CL) &03II0000_,,F /72nDD2@L/~78?J  KK! "	     EDr   r!   r   r   r   r   r   r     s8        [ [ [ [[ [ [r   r   c                   &    e Zd Zedd            ZdS )DeepseekV32ForCausalLMr   r   r   Nc                 &   |j         j        }t          |d          }|sJ |j        }|j                            d          r!d|_        t                              d           |j        dk    r#d|_        t                              d           dS dS )	zQ
        Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32
        
index_topkfp8
fp8_ds_mlaz1Using custom fp8 kv-cache format for DeepSeekV3.2bfloat16r   z(Using bfloat16 kv-cache for DeepSeekV3.2N)r   r*   r   r   r   
startswithr   r   )r_   r   r*   is_v32r   s        r   r   z/DeepseekV32ForCausalLM.verify_and_update_config   s    
  ,6	 L11v #/#..u55 	M'3L$KKKLLL#z11'-L$KKBCCCCC 21r   r!   r   r   r   r   r   r     s8        D D D [D D Dr   r   c                   &    e Zd Zedd            ZdS )NemotronHForCausalLMConfigr   r   r   Nc                     | j         }|j        dk    rA| j        j        }t	          |dd          }t
                              d|           ||_        dS dS )zUpdate mamba_ssm_cache_dtype for NemotronH models when set to 'auto'
        (or not explicitly set), to the value specified in the HF config, or to
        float16 if not specified.
        r   mamba_ssm_cache_dtypefloat16z:Updating mamba_ssm_cache_dtype to '%s' for NemotronH modelN)r   r  r   r*   r?   r   r   )r   r   r*   r  s       r   r   z3NemotronHForCausalLMConfig.verify_and_update_config6  st     #/-77#0:I$+2I% %! KKL%   2GL... 87r   r!   r   r   r   r   r  r  5  s8        G G G \G G Gr   r  GteModelGteNewModelGteNewForSequenceClassificationGemma3TextModel+LlamaBidirectionalForSequenceClassificationLlamaBidirectionalModelNomicBertModelQwen2ForProcessRewardModelQwen2ForRewardModelQwen3ForSequenceClassification Qwen3VLForSequenceClassificationXLMRobertaModelJinaVLForRankingJambaForSequenceClassificationGptOssForCausalLMMambaForCausalLMMamba2ForCausalLM)FalconMambaForCausalLMr   NemotronHForCausalLMMODELS_CONFIG_MAPN)3copyr   mathr   typingr   vllm.loggerr   vllm.model_executor.modelsr   vllm.platformsr   vllm.utils.math_utilsr	   r
   vllm.utils.torch_utilsr   #vllm.v1.attention.backends.registryr   vllm.v1.kv_cache_interfacer   r   r   vllm.configr   r   r#   r   r   r(   r/   rI   rP   rX   ri   r   r   r   r   r   r   r   r   r   r   r  r  r   strtype__annotations__r   r   r   <module>r,     sc                            # # # # # # 4 4 4 4 4 4 + + + + + + 0 0 0 0 0 0 0 0 ; ; ; ; ; ; D D D D D D U U U U U U U U U U 433333333	X		       H H H H H1 H H H
 
 
 
 
- 
 
 
(1 1 1 1 1+@ 1 1 1    2   8C C C C C4 C C C*h h h h h0 h h hV/ / / / /'< / / /* * * * * 5 * * *3 3 3 3 3+@ 3 3 3,	 	 	 	 	-Q 	 	 	, , , , ,,A , , ,
 
 
 
 
!6 
 
 
(    3   .=K =K =K =K =K, =K =K =K@] ] ] ] ]&; ] ] ]@D D D D D2 D D D,G G G G G!6 G G G(=*=$= &'8= ,	=
 23K= 7= *= !"B= 4= %&J= '(N= -= == %&J= 0=  (!=" )#=$ /46)= = = 4T"7889     r   