
    -`i                        U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlZd d	lmZmZmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z= ddl>m?Z? ddl@mAZA ddlBmCZC ddlDmEZE ddlFmGZG ddlHmIZI ddlJmKZK dd lLmMZM dd!lNmOZOmPZP erd d"lQmRZR d d#lSmTZT d d$lUmVZV neZReZTeZV e%eW          ZX G d% d&e          ZYd'ZZd'Z[d(d)d*e\fd+Z]d(d)d*e\fd,Z^d-d'd'd'd'd'd'd'd.e3j_        d'd/iZ`d-d0e]e^d'd'd'd'd.e3ja        d'd/iZbd-d0e]e^d'eZe[e[d.e3jc        d'd/iZdd-d0e]e^d'eZe[e[d.e3jc        d'd/iZeeYjf        e`eYjg        ebeYjh        edeYji        eeiZjeP e ed01          2           G d3 d)                                  Zkdalekdz  emd4<   daneodz  emd5<   e	 dAd6ekd7eodz  fd8            Zp ed9          d:             Zqd*ekfd;Zrd*ekdz  fd<Zs ed=          Zt	 dBd6ekd>euet         d?eveo         dz  d*eweoetf         fd@ZxdS )C    N)contextmanager)is_dataclassreplace)datetime)IntEnum)	lru_cache)Path)TYPE_CHECKINGAnyTypeVarget_args)
ConfigDictFieldmodel_validator)	dataclass)EagleModelTypes)enable_trace_function_callinit_logger)is_runai_obj_uri)random_uuid)	safe_hash   )AttentionConfig)CacheConfig)CompilationConfigCompilationModeCUDAGraphMode)DeviceConfig)ECTransferConfig)KVEventsConfig)KVTransferConfig)
LoadConfig)
LoRAConfig)ModelConfig)ObservabilityConfig)ParallelConfig)ProfilerConfig)SchedulerConfig)SpeculativeConfig)StructuredOutputsConfig)SupportsHashconfig)PretrainedConfig)QuantizationConfig)KVCacheConfigc                   (    e Zd ZdZdZ	 dZ	 dZ	 dZdS )OptimizationLevelzOptimization level enum.r   r         N)__name__
__module____qualname____doc__O0O1O2O3     d/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/config/vllm.pyr1   r1   ?   s7        ""	
B2	
B	
BO	
B))r=   r1   Fcfg
VllmConfigreturnc                 j    | j                             d          p| j                             d          S )zbEnable if either RMS norm or quant FP8 custom op is active;
    otherwise Inductor handles fusion.rms_norm	quant_fp8compilation_configis_custom_op_enabledr?   s    r>   enable_norm_fusionrI   X   s=     !66  B			4	4[	A	ABr=   c                 j    | j                             d          p| j                             d          S )zbEnable if either SiLU+Mul or quant FP8 custom op is active;
    otherwise Inductor handles fusion.silu_and_mulrD   rE   rH   s    r>   enable_act_fusionrL   a   s=     !66  B			4	4[	A	ABr=   rF   )eliminate_noopsfuse_norm_quantfuse_act_quantfuse_allreduce_rmsfuse_attn_quant	enable_spfuse_gemm_comms)pass_configcudagraph_modeuse_inductor_graph_partitionT)arbitrary_types_allowed)r,   c                      e Zd ZU dZ ed          Zeed<   	  ee          Z	eed<   	  ee
          Ze
ed<   	  eej                  Zeed<   	  ee          Zeed	<   	  ee          Zeed
<   	  ee          Zeed<   	 dZedz  ed<   	 dZedz  ed<   	  ee          Zeed<   	  ee          Zeed<   	 dZedz  ed<   	  ee          Z eed<   	  ee!          Z"e!ed<   	 dZ#e$dz  ed<   	 dZ%e&dz  ed<   	 dZ'e(dz  ed<   	  ee)          Z*e)e+z  ed<   	 dZ,e-ed<   	 e.j/        Z0e.ed<   	 de-fdZ1de2de2fdZ3e4de5fd            Z6d8dZ7e8ded
ededz  fd             Z9e8ded
ededz  fd!            Z:	 d9d"e;d#e<e-         dz  dd fd$Z=d%e>d&e-d'e>ddfd(Z?d)e)e-e>f         ddfd*Z@d8d+ZAd, ZBd-e<de<fd.ZCd/ ZDd0 ZEd1 ZFdeGdz  fd2ZHd3 ZId4 ZJ eKd56          d:d7            ZLdS );r@   zDataclass which contains all vllm-related configuration. This
    simplifies passing around the distinct configurations in the codebase.
    N)defaultmodel_config)default_factorycache_configparallel_configscheduler_configdevice_configload_configattention_configlora_configspeculative_configstructured_outputs_configobservability_configquant_configrF   profiler_configkv_transfer_configkv_events_configec_transfer_configadditional_config instance_idoptimization_levelrA   c                 F   g }g }ddl m} |                    |           | j        r|                    | j                                                   | j        rSt          | j        dd          r=| j        j        r1|                    | j        j                                                   n|                    d           | j        r-|                    | j                                                   n|                    d           | j	        r-|                    | j	                                                   n|                    d           | j
        r-|                    | j
                                                   n|                    d           | j        r-|                    | j                                                   n|                    d           | j        r-|                    | j                                                   n|                    d           | j        r-|                    | j                                                   n|                    d           | j        r-|                    | j                                                   n|                    d           | j        r-|                    | j                                                   n|                    d           | j        r,|                    | j                                                   | j        r-|                    | j                                                   n|                    d           |                    | j                                                   | j        r	 | j        r-|                    | j                                                   n|                    d           | j        r-|                    | j                                                   n|                    d           | j        r-|                    | j                                                   n|                    d           | j        rt/          | j        x}t0                    rJt3          t5          j        |d                                          d                                          }n|                                }|                    |           n|                    d           |                    |           t3          t=          |                                          d                                          d	d
         }|S )a  
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        r   )__version__compile_mm_encoderFNoneT)	sort_keys)usedforsecurityN
   )vllmrp   appendrZ   compute_hashrF   getattrmultimodal_configr\   r]   r^   r_   r`   ra   rb   rc   rd   rg   re   rf   rh   rj   rk   
isinstancedictr   jsondumpsencode	hexdigeststr)selffactorsvllm_factorsrp   rk   additional_config_hashhash_strs          r>   rx   zVllmConfig.compute_hash   sv      #%$$$$$$K((( 		( 1 > > @ @AAA'XD35I5QQX %7X
 ##D$5$G$T$T$V$VWWW''' 	( 1 > > @ @AAAA''' 	( 4 A A C CDDDD'''  	( 5 B B D DEEEE''' 	( 2 ? ? A ABBBB''' 	( 0 = = ? ?@@@@'''  	( 5 B B D DEEEE''' 	( 0 = = ? ?@@@@'''" 	( 7 D D F FGGGG''') 	O > K K M MNNN 	( 4 A A C CDDDD'''D5BBDDEEE 	" 	( 7 D D F FGGGG'''" 	( 7 D D F FGGGG'''" 	( 7 D D F FGGGG'''! 
	(t/EE+tLL J)2J0DAAAHHJJ$)* * * )++ '&
 *;)G)G)I)I& 67777'''|$$$S\\0022EJJJTTVVRC
 r=   
batch_sizec                 &    | j         j        |         S N)rF   bs_to_padded_graph_size)r   r   s     r>   pad_for_cudagraphzVllmConfig.pad_for_cudagraphY  s    
 &>zJJr=   c                 f    | j         j        dk    o!| j        du p| j        j        p| j         j         S )a  
        Determine if the DPCoordinator process is needed.

        The DPCoordinator is needed in two cases:
        1. For MoE models with DP > 1: to handle wave coordination
           (even in external LB mode, since wave coordination runs in the coordinator)
        2. For non-MoE models in internal/hybrid LB mode: to collect and publish
           queue stats for load balancing across DP ranks

        Returns:
            True if DPCoordinator process is needed, False otherwise.
        r   N)r]   data_parallel_sizerZ   is_moedata_parallel_external_lbr   s    r>   needs_dp_coordinatorzVllmConfig.needs_dp_coordinator`  sH    " #6: 
% B 'B'AA	
r=   c                 *   t           j        rt          j                    }t          j                            |t          j                              }dt	          j	                     dt          j                     dt          j                     d                    dd          }t          j                            |dd| j         |          }t	          j        t          j                            |          d	
           t%          |           dS dS )z
        Set up function tracing for the current thread,
        if enabled via the `VLLM_TRACE_FUNCTION` environment variable.
         VLLM_TRACE_FUNCTION_for_process__thread__at_z.log _rv   zvllm-instance-T)exist_okN)envsVLLM_TRACE_FUNCTIONtempfile
gettempdirospathjoingetpassgetusergetpid	threading	get_identr   nowr   rm   makedirsdirnamer   )r   tmp_dirfilenamelog_paths       r>   %enable_trace_function_call_for_threadz0VllmConfig.enable_trace_function_call_for_threadw  s   
 # 	1)++Ggll7GO,=,=>>GK29;; K K$.00K K6>lnnK K Kgc3  w||3!133	 H K11DAAAA&x00000	1 	1r=   c           	         ddl m} | j        ddlm}  || |          }|                                }|\|                                }||                                k     r0t          d| j         d|                                 d| d          |	                                }| j
        |vr!t          | j
         d	| j         d
|           |                    | j                   |S dS )zGet the quantization config.r   current_platformN)get_quant_configzThe quantization method z; is not supported for the current GPU. Minimum capability: z. Current capability: .z* is not supported for quantization method z. Supported dtypes: )vllm.platformsr   quantization-vllm.model_executor.model_loader.weight_utilsr   get_device_capabilityto_intget_min_capability
ValueErrorget_supported_act_dtypesdtypemaybe_update_configmodel)rZ   r`   r   r   rf   capability_tuple
capabilitysupported_dtypess           r>   _get_quantization_configz#VllmConfig._get_quantization_config  s\   
 	433333$0VVVVVV++L+FFL/EEGG+-4466
 ? ? A AAA$=<3L = ='3'F'F'H'H= = 0:= = =    ,DDFF!)999 #) * **7* *'* *  
 ,,\-?@@@tr=   c                 \    dd l }t                               |j        |           |          S )Nr   )copyr@   r   deepcopy)rZ   r`   r   s      r>   get_quantization_configz"VllmConfig.get_quantization_config  s8     	 22DM,''
 
 	
r=   	hf_configarchitecturesc                     |t          j        |          }||_        t          j        | j                  }||_        |                                |_        t          | |          S )N)rZ   )r   r   r   rZ   r   get_model_arch_configmodel_arch_configr   )r   r   r   rZ   s       r>   with_hf_configzVllmConfig.with_hf_config  sb    
 $i00I&3I#}T%677!*)5)K)K)M)M&t,7777r=   
config_objkeyvaluec                     t          ||          -t          ||t          |          r ||           n|           dS dS )zSet config attribute to default if not already set by user.

        Args:
            config_obj: Configuration object to update.
            key: Attribute name.
            value: Default value (static or callable).
        N)ry   setattrcallable)r   r   r   r   s       r>   _set_config_defaultzVllmConfig._set_config_default  sM     :s##+
 JHUOO%NUU4[[[OOOOO ,+r=   defaultsc                 t     dt           dt          t          t           f         ddf fd  |           dS )a5  Apply optimization level defaults using self as root.

        Recursively applies values from defaults into nested config objects.
        Only fields present in defaults are overwritten.

        If the user configuration does not specify a value for a default field
        and if the default field is still None after all user selections are
        applied, then default values will be applied to the field. User speciied
        fields will not be overridden by the default.

        Args:
            defaults: Dictionary of default values to apply.
        r   config_defaultsrA   Nc                    |                                 D ]n\  }}t          | |          st          | |          }t          |t                    rt          |          r ||           W                    | ||           odS )z=Recursively apply defaults to config_obj, using self as root.N)itemshasattrry   r{   r|   r   r   )r   r   r   r   currentapply_recursiver   s        r>   r   zFVllmConfig._apply_optimization_level_defaults.<locals>.apply_recursive  s    -3355 E E
Uz3// !*c22eT** E|G/D/D E#OGU3333,,ZeDDDDE Er=   )r   r|   r   )r   r   r   s   ` @r>   "_apply_optimization_level_defaultsz-VllmConfig._apply_optimization_level_defaults  si    
	E 
	Ed38n 
	EQU 
	E 
	E 
	E 
	E 
	E 
	E 
	E 	h'''''r=   c                 v   | j         j        x}dS | j         j        }| j        t	                      | _        | j        j        | j        j        z  }|dk    r1d| j        _        | j        j	        
                    d|dz  i           n&|dk    r d| j        _        ||z  }d|d	| j        _	        d
| j        _        dS )zUpdate KVTransferConfig based on top-level configs in VllmConfig.

        Right now, this function reads the offloading settings from
        CacheConfig and configures the KVTransferConfig accordingly.
        NnativeOffloadingConnectorcpu_bytes_to_usei   @lmcacheLMCacheConnectorV1T)zlmcache.local_cpuzlmcache.max_local_cpu_sizekv_both)r\   kv_offloading_sizekv_offloading_backendrh   r!   r]   tensor_parallel_sizepipeline_parallel_sizekv_connectorkv_connector_extra_configupdatekv_role)r   r   r   num_kv_rankskv_gb_per_ranks        r>   _post_init_kv_transfer_configz(VllmConfig._post_init_kv_transfer_config  s     #'"3"FFOF $ 1 G "*&6&8&8D# 5"9: 	
 !H,,3HD#0#=DD#%77%CD    #i//3GD#0/,>N%).<A AD#= +4'''r=   c                     t          j                      _                                           j        T j                             j                    j                             j                    j        j	         j        _
         j                             j                    j         j                             j                    j        1 j        *t                               j         j                   _         j        j        }|dv } j        j        rg j        J j        j        t-          t.                    vrt1          d           j        j        rt1          d          |st1          d| d          nۉ j        j        ω j        T j        j        t-          t.                    vr4t4                              d j        j        d	           d
 j        _        nt j        5 j        j        r)t4                              dd	           d
 j        _        n8|s*t4                              d|d	           d
 j        _        nd j        _        t4                              d j        j        rdnd            j        j        d j        j        rL j        j        dk    r/ j         j        j	        rt4                              dd	           d j        _        nd
 j        _        ddlm }  j        X j        j!        rL j        j"        tF          j$        k    r2|%                                dk    rt4                              d            j&        tN          j(        k    r> j        7 j        j)        r+t4          *                    d           tN          j(         _&         j+        j,        dk    s& j+        j-        4 j+        j-        t\          j/        k    rt4          *                    d            fd} |            r% j+        j0        }d|vr|1                    d            j+        j-        B j&        tN          j(        k    rt\          j/         j+        _-        nt\          j2         j+        _-        tg           fddD                       ri j+        j,        dk    r: j+        j-        t\          j2        k    r  j+        j0        1                    d            n j+        j0        1                    d!           th           j&                 } 5                    |            j+        j6        7                                r` j+        j-        t\          j/        k    rFt4          8                    d" j+        j6         j+        j-                   tr          j2         j+        _6         j+        j:        j;        rd j+        j:        _<         j+        j:        j<        rHd# j+        j0        v rt4          *                    d$           n j+        j0        1                    d%           |=                                rG j        x}r j+        j6        >                                r8|j?        1t4                              d&           tr          j@         j+        _6        nk|jA        rd j+        j6        tr          j2        tr          jB        fvr@t4                              d' j+        j6        jC                   tr          jB         j+        _6         j        U j        j)        rIt4          8                    d(           tr          j2         j+        _6        d j+        _D        g  j+        _E        nd j+        _F         G                                 ntr          j2         j+        _6         j        jH        rI j        ( j        I                                rt1          d)          t4                              d*            J                                  j        rM j        jK        d+k    r=t          jM        N                    d,          d-k    rt4          *                    d.            jO        2 jO        jP        r& j        jQ        st4          *                    d/            jO        6 jO        jR        d0k    r& jO        jP        st4          *                    d1           |S                                 j        jT        dk    r j        jU        dk    rJ j        jV         j        jU        k    r0 j        jU         j        _V        t4                              d2            j        jV         j        jW        k    r j        jW         j        jV        z  dk    s%J d3 j        jW         d4 j        jV         d5             j         j        j	        r j        j        nd} j+        X                     j        jY        |6            j+        j:        j<        rɉ j+        j-        t\          j/        k    r%t4          *                    d7 j+        j-                    j+        jZ        pt           j+        j\                  dk    }	 j        j]        dk    s|	sOd# j+        j0        vr  j+        j0        1                    d%           n!|	sd8nd9}
t4                              d:|
           |^                                r j+        j6        >                                rK j        D j        j_        s8 j+        j6        `                                st4                              d;            j+        j6        7                                r1 j+        j-        t\          j/        k    sJ d< j+        j6                      j        ja        rP j        jY        }|d=v sJ d>| d?             j        j_        s&d j        __        t4                              d@            j        st                      ddA          _        d
}|c                                sd} jO        d} j        W j        jd        K j         j        I                                rd}n(t          jf        st4          *                    dB           d} j        jg        0 jh        d}t4          *                    dC           | j        _g        n j        jg        d
u r|rt1          dD           j        jg        d
 j        _g         j        ji        dEk    rB j        jj        dk    r j        jj         j        jW        k    sJ  j        jk        r
J dF             j+        jl        r: j+        jl        m                                n                                 j+        _l        t          jo        pt          t          jo                  m                                n                                } j+        jl        rt4          *                    dG|           | j+        _l         fdH} |            r% j+        j0        }d|vr|1                    d            q                                 dS )Iz6Verify configs are valid & consistent with each other.N)mpuniexternal_launcherzZCurrently, async scheduling is only supported with EAGLE/MTP kind of speculative decoding.zJAsync scheduling is not compatible with disable_padded_drafter_batch=True.z{Currently, async scheduling only supports `mp`, `uni`, or `external_launcher` distributed executor backend, but you chose `z`.zWAsync scheduling not supported with %s-based speculative decoding and will be disabled.local)scopeFz_Async scheduling is not compatible with disable_padded_drafter_batch=True and will be disabled.zAsync scheduling will be disabled because it is not supported with the `%s` distributed executor backend (only `mp`, `uni`, and `external_launcher` are supported).TzAsynchronous scheduling is %s.enableddisabledr   zBDisabling NCCL for DP synchronization when using async scheduling.r   r   )      zTuring devices tensor cores do not support float32 matmul. To workaround this limitation, vLLM will set 'ieee' input precision for chunked prefill triton kernels.z7Enforce eager set, overriding optimization level to -O0eagerzInductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.c                       j         Qt           j         d          r j         j        d uS t           j         d          r j                                         S dS Nweight_block_sizehas_blocked_weightsFrf   r   r   r   r   s   r>   r   z5VllmConfig.__post_init__.<locals>.has_blocked_weights  e     ,4,.ABB C,>dJJT.0EFF C,@@BBB5r=   z
-quant_fp8z
+quant_fp8c              3   4   K   | ]}|j         j        vV  d S r   )rF   
custom_ops).0sr   s     r>   	<genexpr>z+VllmConfig.__post_init__.<locals>.<genexpr>  s.      TTqq/::TTTTTTr=   )allnoneinductorr  r  zPCudagraph mode %s is not compatible with compilation mode %s.Overriding to NONE.z	-rms_normz9RMS norm force disabled, sequence parallelism might breakz	+rms_normzVPooling models do not support full cudagraphs. Overriding cudagraph_mode to PIECEWISE.zXEncoder-decoder models do not support %s. Overriding cudagraph_mode to FULL_DECODE_ONLY.z&Cudagraph is disabled under eager modezFast prefill optimization for KV sharing is not compatible with EAGLE as EAGLE requires correct logits for all tokens while fast prefill gives incorrect logits for prompt tokens.zh--kv-sharing-fast-prefill requires changes on model side for correctness and to realize prefill savings.WhisperForConditionalGenerationVLLM_WORKER_MULTIPROC_METHODspawnzWhisper is known to have issues with forked workers. If startup is hanging, try setting 'VLLM_WORKER_MULTIPROC_METHOD' to 'spawn'.zaKV cache events are on, but prefix caching is not enabled. Use --enable-prefix-caching to enable.nullzKV cache events are disabled, but the scheduler is configured to publish them. Modify KVEventsConfig.enable_kv_cache_events to True to enable.zcp_kv_cache_interleave_size is overridden by dcp_kv_cache_interleave_size. And dcp-kv-cache-interleave-size will be deprecated when PCP is fully supported.zBlock_size(zS) should be greater than or equal to and divisible by cp_kv_cache_interleave_size (z).)all2all_backendr   zLSequence parallelism is enabled, but running in wrong vllm compile mode: %s.zDynamo partitionzpipeline parallelismziSequence parallelism not supported with native rms_norm when using %s, this will likely lead to an error.zNo piecewise cudagraph for executing cascade attention. Will fall back to eager execution if a batch runs into cascade attentions.zyCompilation mode should be CompilationMode.VLLM_COMPILE when cudagraph_mode piecewise cudagraphs is used, cudagraph_mode=)deepep_low_latencydeepep_high_throughputziMicrobatching currently only supports the deepep_low_latency and deepep_high_throughput all2all backend. z is not supported. To fix use --all2all-backend=deepep_low_latency or --all2all-backend=deepep_high_throughput and install the DeepEP kernels.z0Disabling cascade attention when DBO is enabled.r   zThere is a latency regression when using chunked local attention with the hybrid KV cache manager. Disabling it, by default. To enable it, set the environment VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1.a  Turning off hybrid kv cache manager because `--kv-transfer-config` is set. This will reduce the performance of vLLM on LLMs with sliding window attention or Mamba attention. If you are a developer of kv connector, please consider supporting hybrid kv cache manager for your connector by making sure your connector is a subclass of `SupportsHMA` defined in kv_connector/v1/base.py and use --no-disable-hybrid-kv-cache-manager to start vLLM.zHybrid KV cache manager was explicitly enabled but is not supported in this configuration. Consider omitting the --no-disable-hybrid-kv-cache-manager flag to let vLLM decide automatically.alignzChunked MM input is required because we need the flexibility to schedule a multiple of block_size tokens even if they are in the middle of a mm inputzLConfig-specified debug dump path is overridden by VLLM_DEBUG_DUMP_PATH to %sc                       j         Qt           j         d          r j         j        d uS t           j         d          r j                                         S dS r   r   r   s   r>   r   z5VllmConfig.__post_init__.<locals>.has_blocked_weights	  r   r=   )rtimetime_nsrm   try_verify_and_update_configrZ   verify_with_parallel_configr]   "verify_dual_chunk_attention_configr`   r   is_moe_modelr\   rb   verify_with_model_configrf   r@   r   distributed_executor_backendr^   async_schedulingrc   methodr   r   r   disable_padded_drafter_batchloggerwarning_once	info_once#disable_nccl_for_dp_synchronizationr   r   r   enable_chunked_prefillr   torchfloat32r   rn   r1   r8   enforce_eagerwarningrF   backendmoder   VLLM_COMPILEr   rw   NONEr  OPTIMIZATION_LEVEL_TO_CONFIGr   rU   requires_piecewise_compilationinfor   rT   rS   rR   support_static_graph_modehas_full_cudagraphspooler_config	PIECEWISEis_encoder_decoderFULL_DECODE_ONLYnamemax_cudagraph_capture_sizecudagraph_capture_sizescudagraph_num_of_warmups_set_cudagraph_sizeskv_sharing_fast_prefill	use_eagle_set_compile_rangesarchitecturer   environgetri   enable_kv_cache_eventsenable_prefix_caching	publishercheck_and_update_configdecode_context_parallel_sizedcp_kv_cache_interleave_sizecp_kv_cache_interleave_size
block_sizeset_splitting_ops_for_v1r	  rV   lensplitting_opsr   is_cuda_alikedisable_cascade_attnhas_piecewise_cudagraphsuse_ubatchingr   support_hybrid_kv_cacheattention_chunk_sizer   2VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHEdisable_hybrid_kv_cache_managerrh   mamba_cache_modelong_prefill_token_thresholddisable_chunked_mm_inputdebug_dump_pathabsolute
expanduserVLLM_DEBUG_DUMP_PATHr	   r   )r   executor_backendexecutor_supports_async_schedr   r   r   default_configrZ   effective_dp_sizeis_fullgraphregimea2a_backend$need_disable_hybrid_kv_cache_managerenv_paths   `             r>   __post_init__zVllmConfig.__post_init__  s    #lnn.))+++(99$:NOOO@@AQRRR040A0HD -55d6JKKK'55d6GHHH$):)F * C C!4#3! !D  /L(8 =
 )
%  1 6	> &2*1/9R9RRR$G   *G $=   1  -(- - -   "3; '3+2(?:S:SSS##A+2!	 $    :?%66'3+H 4 ##N! $   
 :?%662 
>##: %! $    :?%669=%6,.?OIIZ	
 	
 	

 CK$5 Q':Q>>%-1B1I-$$7% %   
 LP$HHKP$H333333 )%< *!'5=88 6688FBB@   #&7&:::!-!/ . NNTUUU&7&:D#"*g55#(4',0LLLNN8  	 	 	 	 	    	00;J:--!!,///"'/&):)===/>/K',,/>/C',TTTTOTTTTT 	A'/:==+0O4HHH'299&AAAA'299%@@@5d6MN//??? #2QQSS
	H',0LLLKK&'6',	   6C5GD#2 ".> 	A<@D#/9".8 	Gd5@@@O    '299+FFF5577 &	H#00| +:NNPP$2>''B   >K=TD+:: 3/>).0NOP P $$I/>C   &6 +:
  ,1B1P,DEEE9F9K'6EF'BBD'??CD'@%%''''5B5GD#24 	'3+5577 4 !)   >  
 	  """ 
	!.2SSS
=>>'IINN   !-%< .%; . NN9  
 !-%/699)@ : NN%   	00666 <q@@#@1DD$@'DE E (E $@ ##>   $@$/0 0%0&BC  Id/: I I(DI I I	   (D,=,D(  33 	
 	88 0@0 	9 	
 	
 	

 ".8 	 &+/KKK-+0   'D Ct.<==B  #:Q>>l>d&=&HHH+6==kJJJJ  ,4**3 
 ''= 	   ))++ 	'6JJLL
%1)> 2/>WWYY 2 ##/   &5TTVV .37SSSSO&*&=&LO O TSS - 	X.>K #   ;F  	   $9 X9=!6##$VWWW 	1*}}RaR0D 05,  7799 	8370 ,370)!6B '3+5577 4
 8<44L 	<L   8<4 @H&27;4O	 	 	 5 !AA !AUJJ4 K "    @HDID!A-88$AAEE)F(34 4 4 4 ,E  ' E
 "2 	'7@@BBMMOO #3 $0D566??AALLNNH&6 5  
 7?D#3	 	 	 	 	    	00;J:--!!,/// 	**,,,,,r=   possible_sizesc                       fd|D             }|r&t                               d| j        j                    fd|D             S )Nc                 :    g | ]}|j         j        z  d k    |S r   r]   r   r   sizer   s     r>   
<listcomp>zDVllmConfig.update_sizes_for_sequence_parallelism.<locals>.<listcomp>   s9     
 
 
d*??1DD DDDr=   zkBatch sizes %s are removed because they are not multiple of tp_size %d when sequence parallelism is enabledc                 :    g | ]}|j         j        z  d k    |S ra  rb  rc  s     r>   re  zDVllmConfig.update_sizes_for_sequence_parallelism.<locals>.<listcomp>.  s9     
 
 
d*??1DD DDDr=   )r  r!  r]   r   )r   r^  removed_sizess   `  r>   %update_sizes_for_sequence_parallelismz0VllmConfig.update_sizes_for_sequence_parallelism  s    
 
 
 
&
 
 

  	NN2 $9  
 
 
 
&
 
 
 	
r=   c           
         | j         | j         j        s| j        j        t          j        k    r| j        j        Dd}| j        r| j        j        r|| j        j        z  }t          | j
        j        |z  dz  d          | j
        j        t                    dk    s
J d            | j        j        pt          | j        j                  dk    s
J d            t          t!          | j        j                            }fd|D             }|                                 npfd	d
D             }dk    r2|t          t%          dt          dz   d          d                    z  }dk    r$|t          t%          ddz   d                    z  }| j        j        dk    r&| j        j        j        r|                     |          }|r|d         nd}| j        j        W| j        j        |k    rG| j        j         t1          d| j        j         d| d          t2                              d|           || j        _        | j        j        Pt          |          t          | j        j                  k     r&t2                              d| j        j        |           || j        _        nd| j        _        g | j        _        | j                                         dS )a  
        vLLM defines the default candidate list of batch sizes for CUDA graph
        capture as:

        ```python
        max_graph_size = min(max_num_seqs * 2, 512)
        # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
        # up to max_graph_size
        cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
            range(256, max_graph_size + 1, 16))

        In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
        will be the final sizes to capture cudagraph (in ascending order).

        These sizes are used to capture and reuse CUDA graphs for
        performance-critical paths (e.g., decoding). Capturing enables
        significantly faster kernel dispatch by avoiding Python overhead. The
        list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
        most GPUs), which controls the total allowed number of tokens in a
        batch. Since each sequence may have a variable number of tokens, the
        maximum usable batch size will depend on actual sequence lengths.

        Example:
            With `max_num_batched_tokens = 8192`, and typical sequences
            averaging ~32 tokens, most practical batch sizes fall below 256.
            However, the system will still allow capture sizes up to 512 if
            shape and memory permit.

        Note:
            If users explicitly specify cudagraph capture sizes in the
            compilation config, those will override this default logic.
            At runtime:

            - If batch size <= one of the `cudagraph_capture_sizes`, the closest
            padded CUDA graph will be used.
            - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
            not be used.
        Nr   r2   i   zRMaximum cudagraph size should be greater than or equal to 1 when using cuda graph.r   zRcudagraph_capture_sizes should contain at least one element when using cuda graph.c                      g | ]
}|k    |S r<   r<   )r   imax_num_tokenss     r>   re  z3VllmConfig._set_cudagraph_sizes.<locals>.<listcomp>  s*     + + +a>.A.AA.A.A.Ar=   c                      g | ]
}|k    |S r<   r<   )r   rk  r0  s     r>   re  z3VllmConfig._set_cudagraph_sizes.<locals>.<listcomp>  s+     + + +A1K,K,KA,K,K,Kr=   )r   r2               z'customized max_cudagraph_capture_size(=zF) should be consistent with the max value of cudagraph_capture_sizes(=)z+Truncating max_cudagraph_capture_size to %dzUcudagraph_capture_sizes specified in compilation_config %s is overridden by config %s)rZ   r   rF   rU   r   r%  r0  rc   num_speculative_tokensminr^   max_num_seqsmax_num_batched_tokensr1  rC  listsetsortranger]   r   rT   rR   rh  r   r  r!  post_init_cudagraph_sizes)r   decode_query_lendedup_sizesr1  valid_max_sizer0  rl  s        @@r>   r3  zVllmConfig._set_cudagraph_sizes4  s   R )%3 *'6-:LLL 'B ' *1#$ +W/FW %(?(VV$-0)69IIAMs. .* "2IN),^=W)X)X&-222) 322 &>J42JKKaOOO- POO
 #3t'>'V#W#WXX+ + + +*+ + +' (,,....+ + + +(+ + +' .22+ta%?!%CS!I!I1MM0 0 + .44+tc#=#A2FF0 0 +
 $9A==+7A > +/*T*T++ +' 0GM'++A  'BN+F.XX *BN$F!4OF F 5CF F F   A"  
 BPD#>&>Js'P PD+CDDPE PE 9 +C+   ?VD#;; BCD#>>@D#; 	99;;;;;r=   c                    | j         }g }| j        j        }|H| j        duo| j                                        }|r|| j        j        z  }|                    |           |j        j        r| j	        j
        }|j                            |          }|f|| j                                        | j        j        j        z  z  }|||k     r|                    |           nt                               d           |j        W|j        D ]O}t'          |t(                    sJ |dk    sJ d|             |!||k     r|dk    r|                    |           Pt+          |          |_        dS )zD
        Set the compile ranges for the compilation config.
        NzuMax num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens.r   z#Invalid compile range split point: r   )rF   r^   rw  rc   uses_draft_modelrv  rw   rT   rP   r]   r   flashinfer_max_sizerZ   get_hidden_sizer   itemsizer  debugcompile_ranges_split_pointsr{   intsorted)	r   rF   $computed_compile_ranges_split_pointscompile_range_end	do_extendtp_sizemax_sizemax_token_numxs	            r>   r6  zVllmConfig._set_compile_ranges  s    "4/1,
 !1H('t3 ?+<<>>   H!T%:%GG!0778IJJJ )< 	*?G)5II'RRH# (%5577'-67! %0]EV5V5V8??NNNNLLS  
 9E'C C C!!S)))))1uuuGAGGuuu$0Q9J5J5JqSTuu8??BBB9?0:
 :
666r=   c                    | j         d S t          | j         dd          rd S d| j         _        | j         j        }|d S ddlm}m} |                    |d           }||                    |            | j         j	        r|                    |            | j         j
        dk    rddlm} |                    |            t          | j         d          rt          | j         j                  rq| j        j        d	k    r(t$                              d
           d| j        _        d S | j        j        dvr-t)          d| j        j         d| j         j                   d S d S d S )Nconfig_updatedFTr   )MODELS_CONFIG_MAPHybridAttentionMambaModelConfigclassify)SequenceClassificationConfigmodel_weightsautozJDetected Run:ai model config. Overriding `load_format` to 'runai_streamer'runai_streamer)r  runai_streamer_shardedzfTo load a model from S3, 'load_format' must be 'runai_streamer' or 'runai_streamer_sharded', but got 'z
'. Model: )rZ   ry   r  r7  !vllm.model_executor.models.configr  r  r9  verify_and_update_config	is_hybridconvert_type#vllm.model_executor.models.adaptersr  r   r   r  r`   load_formatr  r(  r   r   )r   r7  r  r  clsr  s         r>   r  z'VllmConfig.try_verify_and_update_config  s   $F 4$&6>> 	F+/((5F	
 	
 	
 	
 	
 	
 	
 	

  ##L$77?((...& 	K+DDTJJJ)Z77XXXXXX(AA$GGG4$o66 	;K+<
 <
 	 +v55C   0@ ,,,!- 6   !8 $ 0 <8 8 #/58 8  	 	 	 	 r=   c                     | j         j        dS | j        j        }| j        j        }d| d| }| j         j        |z  }|S )zWReturns a rank-aware path for dumping
        torch.compile debug information.
        Nrank__dp_)rF   rP  r]   rankdata_parallel_index)r   tp_rankdp_rankappend_pathr   s        r>   compile_debug_dump_pathz"VllmConfig.compile_debug_dump_path1  sU     "2:4&+&:4g44744&6Dr=   c                     t          | fi |S )z
        Replace attributes of the config, and 'recompute' the config.
        dataclass.replace() calls __init__() and __post_init__(), source:
        https://docs.python.org/3/library/dataclasses.html#dataclasses.replace
        )r   )r   kwargss     r>   r   zVllmConfig.replace=  s     t&&v&&&r=   c                 j   d                     g d| j        j        d| j        d| j        j        d| j        j         d| j        j         d| j        j         d| j        j         d	| j        j	         d
| j        j
         d| j        j         d| j        j        d| j        j         d| j        j         d| j        j         d| j        j         d| j        j         d| j        j         d| j        j         d| j        j         d| j        j         d| j        j         d| j        d| j        d| j        j         d| j        j         d| j        j         d| j         j!         d| j        j"        d| j#                  S )Nrl   zmodel=z, speculative_config=z, tokenizer=z, skip_tokenizer_init=z, tokenizer_mode=z, revision=z, tokenizer_revision=z, trust_remote_code=z, dtype=z, max_seq_len=z, download_dir=z, load_format=z, tensor_parallel_size=z, pipeline_parallel_size=z, data_parallel_size=z, disable_custom_all_reduce=z, quantization=z, enforce_eager=z, enable_return_routed_experts=z, kv_cache_dtype=z, device_config=z, structured_outputs_config=z, observability_config=z, seed=z, served_model_name=z, enable_prefix_caching=z, enable_chunked_prefill=z, pooler_config=z, compilation_config=)$r   rZ   r   rc   	tokenizerskip_tokenizer_inittokenizer_moderevisiontokenizer_revisiontrust_remote_coder   max_model_lenr`   download_dirr  r]   r   r   r   disable_custom_all_reducer   r   enable_return_routed_expertsr\   cache_dtyper_   devicerd   re   seedserved_model_namer;  r^   r  r+  rF   r   s    r>   __str__zVllmConfig.__str__E  s   > > > > > >T&, > > > >"&"9> > > >*4> > > > $(#4#H> > > > #/>	> > > >
 )2> > > > #'"3"F> > > > "&!2!D> > > > &,> > > >  ,:> > > > !,9> > > >  +7> > > > %)$8$M> > > > '+&:&Q> > > > #'"6"I> > > > *.)=)W> > > >  !-:!> > > >" ".<#> > > >$ -1,=,Z%> > > >& #/;'> > > >( "/6)> > > >* *.)G+> > > >, %)$=-> > > >. %*/> > > >0 "&!2!D1> > > >2 &*%6%L3> > > >4 '+&;&R5> > > >6 ".<7> > > >8 #'"99> > > >	
r=   after)r#  c                     | j         | S | j        j        d uo| j        j        | j         j        k    }|r| j        j        st          d          | S )Nz?--mamba-block-size can only be set with --enable-prefix-caching)rZ   r\   mamba_block_sizer  r;  r   )r   mamba_block_size_is_sets     r>   validate_mamba_block_sizez$VllmConfig.validate_mamba_block_sizef  sp    $K.d: V!2d6G6UU 	  # 	4+<+R 	Q   r=   )rA   Nr   )rA   r@   )Mr4   r5   r6   r7   r   rZ   r$   __annotations__r   r\   r&   r]   r(   r[   r^   r   r_   r"   r`   r   ra   rb   r#   rc   r)   r*   rd   r%   re   rf   r.   r   rF   r'   rg   rh   r!   ri   r    rj   r   r|   rk   r+   rm   r   r1   r:   rn   rx   r  r   propertyboolr   r   staticmethodr   r   r-   rx  r   r   r   r   r   r]  rh  r3  r6  r  r	   r  r   r  r   r  r<   r=   r>   r@   r@      s=          !&d 3 3 3L+333 %k B B BL+BBB&+eN&K&K&KO^KKK!(-'7) ) )o    #"'%"E"E"EM<EEE#eJ???K???(-o(N(N(NoNNN"%)Kd")))37)D0777-9>/: : :6    ,05+1 1 1-    '.2L$t+222%,1EBS,T,T,T)TTT ',eN&K&K&KO^KKK"26(4/666?.2nt+222226(4/666? .3U4-H-H-Htl*HHH1 K&,=,@)@@@
bc b b b bHKC KC K K K K 
d 
 
 
 X
,1 1 1 1, !0:	d	"   \@ 	
!	
0:	
	d	"	
 	
 	
 \	
 +/8 8#8 Cy4'8 
	8 8 8 8Pc P PC PD P P P P(4S> (d ( ( ( (:"4 "4 "4 "4HB- B- B-H
D 
T 
 
 
 
.X< X< X<t.
 .
 .
`1 1 1f
 
 
 
 
' ' '
 
 
B _'"""   #"  r=   _current_vllm_config_current_prefixvllm_configprefixc              #     K   t           }t          }ddlm} |j        }	 t
                                           | a |adV  |r| j                                         |rJ| j        j	        t          j        k    r0|j        |k    r%t                              d| j        j                   n# t           $ r  w xY w|a |at
                                           dS # |a |at
                                           w xY w)a  
    Temporarily set the current vLLM config.
    Used during model initialization.
    We save the current vLLM config in a global variable,
    so that all modules can access it, e.g. custom ops
    can access the vLLM config to determine how to dispatch.
    r   )compilation_counterNz`torch.compile` is turned on, but the model %s does not support it. Please open an issue on GitHub if you want it to be supported.)r  r  vllm.compilation.counterr  num_models_seenget_cached_compilation_configcache_clearrF   custom_op_log_checkr#  r   r$  r  r!  rZ   r   	Exception)r  check_compiler  old_vllm_config
old_prefixr  r  s          r>   set_current_vllm_configr  y  s9      +O J<<<<<<)9O#4 	&11333*   	A*>>@@@ 	.37SSS#3FF NN3 (.	  !    .  /$%1133333  /$%113333s   !B(  A(C (B44C C5)maxsizec                  (    t                      j        S )zACache config to avoid repeated calls to get_current_vllm_config())get_current_vllm_configrF   r<   r=   r>   r  r    s     #$$77r=   c                  <    t           t          d          t           S )Nab  Current vLLM config is not set. This typically means get_current_vllm_config() was called outside of a set_current_vllm_config() context, or a CustomOp was instantiated at module import time or model forward time when config is not set. For tests that directly test custom ops/modules, use the 'default_vllm_config' pytest fixture from tests/conftest.py.)r  AssertionErrorr<   r=   r>   r  r    s(    #K
 
 	
  r=   c                      t           S r   )r  r<   r=   r>   get_current_vllm_config_or_noner    s    r=   T
layer_typelayer_namesc                     |+t          | j        j                                                  }| j        j        fd|D             S )z
    Get layers from the vLLM config.

    Args:
        vllm_config: The vLLM config.
        layer_type: The type of the layer to get.
        layer_names: The names of the layers to get. If None, return all layers.
    Nc                 N    i | ]!}t          |                   ||         "S r<   )r{   )r   
layer_nameforward_contextr  s     r>   
<dictcomp>z/get_layers_from_vllm_config.<locals>.<dictcomp>  sE       oj1:>>OJ/  r=   )rx  rF   static_forward_contextkeys)r  r  r  r  s    ` @r>   get_layers_from_vllm_configr    sf     ;9PUUWWXX!4KO    %   r=   )FNr   )yr   r   r}   r   r   r   r  
contextlibr   dataclassesr   r   r   enumr   	functoolsr   pathlibr	   typingr
   r   r   r   r  pydanticr   r   r   pydantic.dataclassesr   	vllm.envsr   vllm.config.speculativer   vllm.loggerr   r   #vllm.transformers_utils.runai_utilsr   
vllm.utilsr   vllm.utils.hashingr   	attentionr   cacher   compilationr   r   r   r  r   ec_transferr   	kv_eventsr    kv_transferr!   loadr"   lorar#   r   r$   observabilityr%   parallelr&   profilerr'   	schedulerr(   speculativer)   structured_outputsr*   utilsr+   r,   transformersr-   3vllm.model_executor.layers.quantization.base_configr.   vllm.v1.kv_cache_interfacer/   r4   r  r1   IS_QUANTIZEDIS_DENSEr  rI   rL   r%  OPTIMIZATION_LEVEL_00r,  OPTIMIZATION_LEVEL_01FULL_AND_PIECEWISEOPTIMIZATION_LEVEL_02OPTIMIZATION_LEVEL_03r8   r9   r:   r;   r&  r@   r  r  r  r   r  r  r  r  r  typerx  r|   r  r<   r=   r>   <module>r     s      				       % % % % % % - - - - - - - -                         8 8 8 8 8 8 8 8 8 8 8 8  7 7 7 7 7 7 7 7 7 7 * * * * * *       3 3 3 3 3 3 ? ? ? ? ? ? ? ? @ @ @ @ @ @ " " " " " " ( ( ( ( ( ( & & & & & &       J J J J J J J J J J             ) ) ) ) ) ) % % % % % % ) ) ) ) ) )                   . . . . . . $ $ $ $ $ $ $ $ $ $ $ $ & & & & & & * * * * * * 7 7 7 7 7 7 ' ' ' ' ' ' ' ' 
------VVVVVV8888888M	X		* * * * * * * * BL BT B B B BB< BD B B B B $$#"'$$
 
 (,(-    #1/"'$$
 
 (1(-    #1/"'+!'
 
 (:(-    #1/"'+!'
 
 (:(-  " ////	   
**T:::;;;B B B B B B B <; BJ& +/ j4' . . ."t " " " GK34 3434:=*34 34 34 34l 18 8 8

  
  
  
  
  d):         GCLL %) Q cT! 
#q&\	     r=   