
    .`ic                         d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ  e
e          Z G d	 d
e          Zed             Zedefd            ZdS )    )contextmanager)AnyN)
VllmConfig)init_logger)	get_model)CpuGpuBuffer)GPUModelRunnerc                        e Zd Zdedej        f fdZddZddeddfd	Z	de
j        fd
ZddZddZddZdedeeej        dz  f         fdZ xZS )CPUModelRunnervllm_configdevicec                 2   t                      5  t                                          ||           d d d            n# 1 swxY w Y   |t          j        d          k    sJ | j        
J d            d| _        d| _        |                                  d S )Ncpuzspec decode is not supported.F)	_torch_cuda_wrappersuper__init__torchr   speculative_configuse_cuda_graphcascade_attn_enabled_postprocess_tensors)selfr   r   	__class__s      s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/worker/cpu_model_runner.pyr   zCPUModelRunner.__init__   s     "" 	2 	2GG[&111	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 e,,,,,,&..0O...#$)!!!#####s   #?AAreturnNc                 T   dt           dt          dd fd}t          |                                           D ]#}t	          |t
                    r|j        |_        $t          | j                  	                                D ]N\  }}|
                    d          r4t	          |t          j                  r || j        ||d d                    O| j        j        j        D ]G}t          |                                          D ]#}t	          |t
                    r|j        |_        $Hd S )Nobjcpu_attr_namer   c                     t          | |d           }t          | |d           }|M|Mt          |t          j                  sJ t          |t          j                  sJ t	          | ||           d S d S d S N)getattr
isinstancer   Tensorsetattr)r   r   device_attr_name
cpu_tensordevice_tensors        r   replace_tensorz;CPUModelRunner._postprocess_tensors.<locals>.replace_tensor!   s     mT::J#C)94@@M%-*C!*el;;;;;!->>>>>-z::::: &%*C*C    _cpu_tensori)r   strvarsvaluesr"   r   r   gpuinput_batchitemsendswithr   r#   block_tableblock_tables)r   r(   vkr2   s        r   r   z#CPUModelRunner._postprocess_tensors   sA   	; 	;C 	;d 	; 	; 	; 	; d""$$ 	 	A!\** )**0022 	= 	=DAqzz-(( =Z5<-H-H =t/AdsdG<<<+7D 	" 	"K+&&--// " "a.. "EAE"	" 	"r)   Feep_scale_upc                     t                               d| j        j                   t	          | j                  | _        | j        r-|                     | j        | j        | j                  | _        d S d S )NzStarting to load model %s...)r   )	loggerinfomodel_configmodelr   r   lora_configload_lora_modelr   )r   r6   s     r   
load_modelzCPUModelRunner.load_model6   sn    2D4E4KLLL4+;<<<
 	Y--dj$:JDKXXDJJJ	Y 	Yr)   c                     | j         S r    )r;   r   s    r   r   zCPUModelRunner.get_model=   s
    zr)   c           	      F   t                               d           t          | j                  5  |                     t          t          d| j                  | j        j	                             d d d            n# 1 swxY w Y   t                               d           d S )Nz'Warming up model for the compilation...   zWarming up done.)
r8   r9    _set_global_compilation_settingsr   
_dummy_runminmaxmax_num_reqsscheduler_configmax_num_batched_tokensr@   s    r   warming_up_modelzCPUModelRunner.warming_up_model@   s    =>>>-d.>?? 	 	OOD-..)@   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	&'''''s   AA<<B B c                     d S r     r@   s    r   _init_device_propertiesz&CPUModelRunner._init_device_propertiesM       r)   c                     d S r    rL   r@   s    r   _sync_devicezCPUModelRunner._sync_deviceP   rN   r)   
num_tokensc                     dS )N)r   NrL   )r   rQ   s     r   get_dp_paddingzCPUModelRunner.get_dp_paddingS   s    wr)   r   N)F)__name__
__module____qualname__r   r   r   r   r   boolr>   nnModuler   rJ   rM   rP   inttupler#   rS   __classcell__)r   s   @r   r   r      s       
$J 
$ 
$ 
$ 
$ 
$ 
$ 
$" " " ".Y Yt Y Y Y Y Y29    ( ( ( (       sEL4<O7O1P        r)   r   c               #   :  K    G d d          }  G d d          }t           j        }t           j        j        }	 | t           _        |t           j        _        d V  |t           _        |t           j        _        d S # |t           _        |t           j        _        w xY w)Nc                       e Zd ZddZdS )._torch_cuda_wrapper.<locals>._EventPlaceholderr   Nc                 &    d | _         d | _        d S )Nc                      d S r    rL   rL   r)   r   <lambda>zI_torch_cuda_wrapper.<locals>._EventPlaceholder.__init__.<locals>.<lambda>\   s    $ r)   c                      d S r    rL   rL   r)   r   rc   zI_torch_cuda_wrapper.<locals>._EventPlaceholder.__init__.<locals>.<lambda>]   s    t r)   )recordsynchronizer   argskwargss      r   r   z7_torch_cuda_wrapper.<locals>._EventPlaceholder.__init__[   s    &,DK+|Dr)   rT   rU   rV   rW   r   rL   r)   r   _EventPlaceholderr`   Z   s(        	, 	, 	, 	, 	, 	,r)   rk   c                       e Zd ZddZdS )/_torch_cuda_wrapper.<locals>._StreamPlaceholderr   Nc                     d S r    rL   rg   s      r   r   z8_torch_cuda_wrapper.<locals>._StreamPlaceholder.__init__`   s    Dr)   rT   rj   rL   r)   r   _StreamPlaceholderrm   _   s(        	 	 	 	 	 	r)   ro   )r   EventcudaStream)rk   ro   
cuda_eventcuda_streams       r   r   r   X   s      , , , , , , , ,
        J*#K('.
 '
 !'
''''s   !A; ;Bconfigc              #      K   dd l mc m} | j        j        }|j        }	 |                    dd          rd|_        d V  ||_        d S # ||_        w xY w)Nr   max_autotuneFT)torch._inductor.config	_inductorru   compilation_configinductor_compile_configfreezingget)ru   torch_inductor_configinductor_configfreezing_values       r   rC   rC   n   s      :::::::::/GO*3N8~u55 	2-1!*)7&&&&7777s   !A
 
	A)
contextlibr   typingr   r   torch.nnrY   vllm.configr   vllm.loggerr    vllm.model_executor.model_loaderr   vllm.v1.utilsr   vllm.v1.worker.gpu_model_runnerr	   rU   r8   r   r   rC   rL   r)   r   <module>r      s.   & % % % % %              " " " " " " # # # # # # 6 6 6 6 6 6 & & & & & & : : : : : :	X		C C C C C^ C C CL ( ( (* 8Z 8 8 8 8 8 8r)   