
    .`i                         d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ erdd	lmZ dd
lmZ  ee          ZddZddZdS )z
Warmup kernels used during model execution.
This is useful specifically for JIT'ed kernels as we don't want JIT'ing to
happen during model execution.
    )TYPE_CHECKINGN)init_logger)deep_gemm_warmup)current_platform)is_deep_gemm_supported)has_flashinfer)GPUModelRunner)Workerworkerr
   c                 *   t           j        ot                      ot           j        dk    }|r0|                                 }| j        j        }t          ||           t                      r(t          j
        d          rt          | j                   d | j        j        sl| j        j        rbt          fd| j        j        D                       r?t                               d           | j                            ddddd           d S d S d S d S )	NskipZ   c                 V    	 |                                  dk    S # t          $ r Y dS w xY w)N
FLASHINFERF)get_nameNotImplementedErrorbackends    |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/warmup/kernel_warmup.py_is_flashinfer_backendz-kernel_warmup.<locals>._is_flashinfer_backend.   s@    	##%%55" 	 	 	55	s    
((c              3   B   K   | ]}|D ]} |j                   V  d S )Nr   ).0groupsgroupr   s      r   	<genexpr>z kernel_warmup.<locals>.<genexpr>:   s[       
 

 
  #"5=11
 
 
 
 
 
 
    z Warming up FlashInfer attention.   T)
num_tokens	skip_eplb
is_profileforce_attentioncreate_mixed_batch)envsVLLM_USE_DEEP_GEMMr   VLLM_DEEP_GEMM_WARMUP	get_modelscheduler_configmax_num_batched_tokensr   r   r   has_device_capabilityflashinfer_autotunemodel_runneris_pooling_modelattn_groupsallloggerinfo
_dummy_run)r   do_deep_gemm_warmupmodel
max_tokensr   s       @r   kernel_warmupr5      ss    	 	1"$$	1&&0 
  ,  "",C

+++  1,B2FF 1F/000
   0
+
  
 
 
 
 -9
 
 
 
 

 	6777 	&& # 	' 	
 	
 	
 	
 	

 
 
 
 
 
r   runnerr	   returnc                     ddl m} t          j                    5   |            5  |                     | j        j        dd           ddd           n# 1 swxY w Y   ddd           dS # 1 swxY w Y   dS )a  
    Autotune FlashInfer operations.
    FlashInfer have many implementations for the same operation,
    autotuning runs benchmarks for each implementation and stores
    the results. The results are cached transparently and
    future calls to FlashInfer will use the best implementation.
    Without autotuning, FlashInfer will rely on heuristics, which may
    be significantly slower.
    r   )autotuneT)r   r    N)vllm.utils.flashinferr9   torchinference_moder1   r'   r(   )r6   r9   s     r   r*   r*   L   s    /.....				 	
 	
 	
 	

 	#: 	 	
 	
 	
	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
s4   A,#AA,A	A,A	A,,A03A0)r   r
   )r6   r	   r7   N)__doc__typingr   r;   	vllm.envsr#   vllm.loggerr   +vllm.model_executor.warmup.deep_gemm_warmupr   vllm.platformsr   vllm.utils.deep_gemmr   r:   r   vllm.v1.worker.gpu_model_runnerr	   vllm.v1.worker.gpu_workerr
   __name__r/   r5   r*    r   r   <module>rH      s    !                  # # # # # # H H H H H H + + + + + + 7 7 7 7 7 7 0 0 0 0 0 0 1>>>>>>000000	X		.
 .
 .
 .
b
 
 
 
 
 
r   