
    .`i\                        d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZ d dl	m	Z	 d dl
mZ d dlmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZDmEZE d dlFmGZG d dlHmIZImJZJ d d lKmLZL d d!lMmNZN d d"lOmPZP d d#lQmRZRmSZSmTZT d d$lUmVZV d d%lWmXZX  eeY          ZZe G d& d'                      Z[ G d( d)e\          Z] G d* d+e          Z^dS ),    N)AsyncGeneratorIterableMapping)copy)	dataclass)Any)TokensPrompt)
VllmConfig)AsyncEngineArgs)EngineClient)_validate_truncation_size)
PromptType)init_logger)LoRARequest)MULTIMODAL_REGISTRYMultiModalRegistry)STREAM_FINISHEDPoolingRequestOutputRequestOutput)get_io_processor)PoolingParams)RendererLike)RequestOutputKindSamplingParams)SupportedTask)TokenizerLike)init_tracer)(maybe_register_config_serialize_by_value)UsageContext)cancel_task_threadsafe)as_list)EngineCoreRequest)EngineCoreClient)EngineDeadErrorEngineGenerateError)InputProcessor)OutputProcessorRequestOutputCollector)ParentRequest)get_prompt_text)Executor)StatLoggerFactoryStatLoggerManager!load_stat_logger_plugin_factories)shutdown_prometheus)IterationStatsc                   2    e Zd ZU dZeed<   dZedz  ed<   dS )StreamingInputzInput data for a streaming generation request.

    This is used with generate() to support multi-turn streaming sessions
    where inputs are provided via an async generator.
    promptNsampling_params)__name__
__module____qualname____doc__r   __annotations__r4   r        l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/engine/async_llm.pyr2   r2   8   s?           -1O^d*11111r;   r2   c                   (     e Zd ZdZdef fdZ xZS )InputStreamErrorzWrapper for errors from the input stream generator.

    This is used to propagate errors from the user's input generator
    without wrapping them in EngineGenerateError.
    causec                 r    || _         t                                          t          |                     d S N)r?   super__init__str)selfr?   	__class__s     r<   rC   zInputStreamError.__init__K   s.    
U$$$$$r;   )r5   r6   r7   r8   	ExceptionrC   __classcell__)rF   s   @r<   r>   r>   D   sN         %i % % % % % % % % % %r;   r>   c                      e Zd Zej        eddddddddf
dedee         de	d	ed
e
de	de	de	dee         dz  de	deeef         dz  dededdfdZedej        dddddddf	dede	d	edee         dz  de	de	de	deeef         dz  dededd fd            Zedej        dfdede	d	edee         dz  dd f
d            Zd Zd Zdeedf         fdZ	 	 	 	 	 	 	 dededeez  eedf         z  d eez  d!e dz  d"e!dz  d#eee"f         dz  d$e#eef         dz  d%ed&edz  d'edz  de$fd(Z%d)ededz  d*e&dz  d+ed,e$f
d-Z'	 	 	 	 	 	 dfded.eedf         d/eez  d!e dz  d"e!dz  d#eee"f         dz  d$e#eef         dz  d%ed&edz  de$fd0Z(e)d eez  fd1            Z*ddddddd2deez  eedf         z  d/eded'edz  d"e!dz  d#eee"f         dz  d$e#eef         dz  d%ed&edz  dee+df         fd3Z,d4 Z-	 dgdee.e         z  d5e	ddfd6Z/ddd7d8e	d9e	ddfd:Z0dhd;Z1de	fd<Z2	 	 	 	 	 dided=eded"e!dz  d$e#eef         dz  d%ed>edz  d#eee"f         dz  dee3df         fd?Z4e5de6dz  fd@            Z7de6fdAZ8e5de9fdB            Z:de	fdCZ;dhdDZ<dhdEZ=dhdFZ>dhdGZ?dhdHZ@	 djdIe	dJe	de	fdKZAdkdLeddfdMZBdldNee         dz  ddfdOZCde	fdPZDd"e!de	fdQZEdRede	fdSZFdeGe         fdTZHdRede	fdUZI	 	 	 dmdWedXe dz  dYedZedz  fd[ZJdnd]efd^ZK	 dnd_ed]efd`ZLe5de	fda            ZMe5de	fdb            ZNe5de	fdc            ZOe5dePfdd            ZQdS )oAsyncLLMFTN   r   vllm_configexecutor_class	log_statsusage_contextmm_registryuse_cached_outputslog_requestsstart_engine_loopstat_loggersaggregate_engine_loggingclient_addressesclient_countclient_indexreturnc           	         t                       |j        | _        || _        |j        | _        || _        t          |	pg           }|                    t                                 t          |          }|p|| _	        |s|rt                              d           t          | j                  | _        t          | j        | j        j                  | _        t#          | j        | j	        | j        j        j                  | _        | j        j        }|t/          d|          }|| j        _        t3          j        ||| j	        |||          | _        d| _        | j	        r=t;          || j        j        ||||
          | _        | j                                         tA          j!                    | _"        d| _#        d| _$        	 tA          j%                     | &                                 n# tN          $ r Y nw xY w|j(        j)        dk    r|j(        j*        s|j(        j+        }t                              d	|           tY          j-                     d
t]          j/                     d}t`          j)        1                    t`          j)        j2        j3        g|j(        j4        t`          j)        5                    |||j(        j6                            | _)        dS d| _)        dS )a  
        Create an AsyncLLM.

        Args:
            vllm_config: global configuration.
            executor_class: an Executor impl, e.g. MultiprocExecutor.
            log_stats: Whether to log stats.
            usage_context: Usage context of the LLM.
            mm_registry: Multi-modal registry.
            use_cached_outputs: Whether to use cached outputs.
            log_requests: Whether to log requests.
            start_engine_loop: Whether to start the engine loop.
            stat_loggers: customized stat loggers for the engine.
                If not provided, default stat loggers will be used.
                PLEASE BE AWARE THAT STAT LOGGER IS NOT STABLE
                IN V1, AND ITS BASE CLASS INTERFACE MIGHT CHANGE.

        Returns:
            None
        zyAsyncLLM created with log_stats=False, but custom stat loggers were found; enabling logging without default stat loggers.)rN   stream_intervalNzvllm.llm_engine)rL   rM   rN   rV   rW   rX   )rL   engine_idxscustom_stat_loggersenable_default_loggersrW   rU   FtorchzFTorch profiler enabled. AsyncLLM CPU traces will be collected under %s_z
.async_llm)worker_nameuse_gzip)
activities
with_stackon_trace_ready)7r   model_configrL   observability_configrR   listextendr.   boolrN   loggerinfor&   input_processorr   io_processor_pluginio_processorr'   	tokenizerscheduler_configr[   output_processorotlp_traces_endpointr   tracerr#   make_async_mp_clientengine_corelogger_managerr-   engine_ranks_managedlog_engine_initializedasyncio	Condition_pause_cond_pausedoutput_handlerget_running_loop_run_output_handlerRuntimeErrorprofiler_configprofilerignore_frontendtorch_profiler_dirsocketgethostnameosgetpidr_   profileProfilerActivityCPUtorch_profiler_with_stacktensorboard_trace_handlertorch_profiler_use_gzip)rE   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   r]   has_custom_loggersendpointrt   profiler_dirra   s                       r<   rC   zAsyncLLM.__init__Q   s   J 	1222'4&$/$D!("<#5266""#D#F#FGGG!"566"8&8 	/ 	KKA    .d.>??,1
 
 !0Nn ,=M!
 !
 !

 ,A !2H==F+1D!( ,@#)n-%%
 
 
 9=> 		9"3' ,A$7'0))A# # #D 66888 #,..37	$&&&$$&&&& 	 	 	D	 '0G;;/? < '6ILKKX   $/11KKBIKKKKKK!N22N37 '6P$~GG  +(8P  H     3 
 
DMMM !DMMMs   
'G2 2
G?>G?enable_log_requestsdisable_log_statsc                 T     | |t          j        |          |||| ||||	|
          S )N)rL   rM   rS   rT   rR   rN   rU   rO   rV   rW   rX   )r+   	get_class)clsrL   rS   rO   rT   r   rU   r   rV   rW   rX   s              r<   from_vllm_configzAsyncLLM.from_vllm_config   sL     s##-k::/%,++%='-%%
 
 
 	
r;   engine_argsc           	          |                     |          }t          j        |          } | |||j        |j         |||          S )z'Create an AsyncLLM from the EngineArgs.)rL   rM   rR   rN   rS   rO   rT   )create_engine_configr+   r   r   r   )r   r   rS   rO   rT   rL   rM   s          r<   from_engine_argszAsyncLLM.from_engine_args   s`     "66}EE!+K88 s#)$8%77/'%
 
 
 	
r;   c                 .    |                                   d S rA   )shutdownrE   s    r<   __del__zAsyncLLM.__del__  s    r;   c                    t                       t          | dd          x}r|                                 t          | dd          x}r|                                 t          | dd          }|t	          |           dS dS )z2Shutdown, cleaning up the background proc and IPC.rv   Nrm   r~   )r/   getattrr   closer    )rE   rv   rm   handlers       r<   r   zAsyncLLM.shutdown  s     	!$t<<<; 	#  """%d,=tDDD? 	$!!###$ 0$77"7+++++ r;   .c                 D   K   | j                                          d {V S rA   )rv   get_supported_tasks_asyncr   s    r<   get_supported_taskszAsyncLLM.get_supported_tasks  s-      %??AAAAAAAAAr;   
request_idr3   paramsarrival_timelora_requesttokenization_kwargstrace_headersprioritydata_parallel_rankprompt_textc                 P   K    j         rt                      t          |t                    } j        j        j        r|s|j        rt          d          |i }t           j
        j        |j        |           t          |t                    r#                     |||||||||		  	         d{V S t          |t                    r(|}||j        k    rt"                              d           nB|
t          d           j                            |||||||||		  	        }t+          |          }
 j                            |                                              j        4 d{V   j                             fd           d{V  ddd          d{V  n# 1 d{V swxY w Y   t5          |j        |j                  }|j        }|s|j        dk    r!                     ||
dd|           d{V  |S |}t          |t>                    sJ tA          |          }tC          |j                  D ]f}|"                    |          \  }}||j        dz
  k    r|ntG          |          }||_        ||_$                             ||
|||           d{V  g|S )z Add new request to the AsyncLLM.z--kv-sharing-fast-prefill produces incorrect logprobs for prompt tokens, please disable it when the requests need prompt logprobsNzAsyncLLM.add_request() was passed a request_id parameter that does not match the EngineCoreRequest.request_id attribute. The latter will be used, and the former will be ignored.z6should only provide prompt_text with EngineCoreRequestc                       j          S rA   )r}   r   s   r<   <lambda>z&AsyncLLM.add_request.<locals>.<lambda>y  s    4D r;   rK   r   )%erroredr$   
isinstancer   rL   cache_configkv_sharing_fast_prefillprompt_logprobs
ValueErrorr   rf   max_model_lentruncate_prompt_tokensr   _add_streaming_input_requestr"   r   rk   warning_oncerm   process_inputsr*   assign_request_idr   r|   wait_forr(   output_kindr   n_add_requestr   r)   rangeget_child_infor   r4   )rE   r   r3   r   r   r   r   r   r   r   r   
is_poolingrequestqueueparent_paramsparent_requestidxchild_paramschild_requests   `                  r<   add_requestzAsyncLLM.add_request   s&      < 	$!###66
 )A				 &		
 "   &"$!+)	
 	
 	
 fn-- 	::#"
 
 
 
 
 
 
 
 
 f/00 	2GW///##K   & L   *99#"
 
G *&11K..w777
 	  """ # 	F 	F 	F 	F 	F 	F 	F 	F"++,D,D,D,DEEEEEEEEE	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 'v'97;MNN  	Q##G[$5IIIIIIIIIL-88888 'w//)) 	 	C'5'D'DS'I'I$J'*mo.A'A'AGGtG}}M'1M$,8M)##{NC          s   -$F##
F-0F-r   
parent_reqindexr   c                    K   | j                             |||||           | j                            |           d {V  | j        r"t
                              d|j                   d S d S )NzAdded request %s.)rr   r   rv   add_request_asyncrR   rk   rl   r   )rE   r   r3   r   r   r   s         r<   r   zAsyncLLM._add_request  s       	))'6:ueTTT 00999999999 	AKK+W-?@@@@@	A 	Ar;   input_streamr4   c
                    K                                    t          ||||||	          j        s                                d_          j        j        dt          dg          d j                                       j        t          j
                   fd}
                                  t          j         |
                      _        S )N)r   r   r   r   r   r   Tr   )prompt_token_ids)r   r3   r   c            	        K   d} 	 2 3 d {V }|j         }|r                    |           n} j        j        d	|j        |dd}|_        |j        t          d          t          |j                  }	                    ||d d
           d {V  6 nR# t          j        t          f$ r d} Y n8t          $ r,}
                    t          |                     Y d }~nd }~ww xY wd 
_        | s!	                    d d d
           d {V  d S d S # d 
_        | s 	                    d d d
           d {V  w w xY w)NFT)r   r3   r   	resumablez0prompt_embeds not supported for streaming inputsr   r:   )r4   )_validate_streaming_input_sampling_paramsrm   r   r3   external_req_idprompt_embedsr   r*   r   rz   CancelledErrorGeneratorExitrG   putr>   _input_stream_task)	cancelledinput_chunkspreqr   error	final_reqr   inputsinternal_req_idr   r   r4   rE   s         r<   handle_inputsz<AsyncLLM._add_streaming_input_request.<locals>.handle_inputs  s.     I M)5 N N N N N N N+$4B -FFrJJJJ,=$.= #2*1!"&	 
 ! C +5C'(4(N   #2+2D"E"EK++CdAuMMMMMMMMMM' *6( *M: ! ! ! 			 3 3 3 		*511222222223
 ,0(  M ++ItT1eLLLLLLLLLLLM M ,0(  M ++ItT1eLLLLLLLLLLMsF   B  BBB  D  C/8D :	C/"C*%D *C//D +E	r:   )r   dict
skip_cloneclonerm   r   r	   r   r   r(   r   r   rz   create_taskr   )rE   r   r   r4   r   r   r   r   r   r   r   r   r   r   r   s   ````       @@@@r<   r   z%AsyncLLM._add_streaming_input_request  sa      	66GGG%% 3'1
 
 
 ) 	.-3355O)-O& 8D(7 
!!555"
 
 	
 
	 	..y999#.&'BOTT"	M "	M "	M "	M "	M "	M "	M "	M "	M "	M "	M "	MJ 	  """#*#6}}#G#G r;   c                     t          | t                    r'| j        dk    s| j        t          j        k    s| j        rt          d          d S )NrK   zrInput streaming not currently supported for pooling models, n > 1, request_kind = FINAL_ONLY or with stop strings.)r   r   r   r   r   
FINAL_ONLYstopr   )r   s    r<   r   z2AsyncLLM._validate_streaming_input_sampling_params  s`    
 6>22
	x!||!%6%AAA{ B (   BAr;   )r   r   r   r   r   r   c                 K   d}
	 |                      ||||||||	|	  	         d{V }
d}|s\|
                                p|
                                 d{V }t          |t                    sJ |j        }|t          ur|W V  |\n# t          j        t          f$ rH |
"| 
                    |
j        d           d{V  | j        rt                              d|            t          $ r$ | j        rt                              d|            t           $ r)}| j        rt                              d||            d}~wt"          $ rT}|
"| 
                    |
j        d           d{V  | j        rt                              d	||           |j        |d}~wt&          $ r}|
"| 
                    |
j        d           d{V  | j        re	 |j        j         d
| }n6# t&          $ r)}|j        j         d
dz   |j        j        z   }Y d}~nd}~ww xY wt                              d||           t-                      |d}~ww xY w|
|
                                 dS dS # |
|
                                 w w xY w)aj  
        Main function called by the API server to kick off a request
            * 1) Making an AsyncStream corresponding to the Request.
            * 2) Processing the Input.
            * 3) Adding the Request to the Detokenizer.
            * 4) Adding the Request to the EngineCore (separate process).

        A separate output_handler loop runs in a background AsyncIO task,
        pulling outputs from EngineCore and putting them into the
        per-request AsyncStream.

        The caller of generate() iterates the returned AsyncGenerator,
        returning the RequestOutput back to the caller.
        N)r   r   r   r   r   r   FTinternalRequest %s aborted. Request %s failed (engine dead).z$Request %s failed (bad request): %s.z$Request %s failed (input error): %s.z: z+error during printing an exception of classzRequest %s failed due to %s.)r   
get_nowaitgetr   r   finishedr   rz   r   r   abortr   rR   rk   rl   r$   r   r>   r?   rG   rF   r5   r%   r   )rE   r3   r4   r   r   r   r   r   r   r   qr   outese2s                   r<   generatezAsyncLLM.generate  si     8 ,0K	&&)$7+!#5' ' 
 
 
 
 
 
 
 
A H 
 llnn5aeegg "#}55555<o--IIII  
 &6 	 	 	}jjj=========  ?1:>>>  	 	 	  L>
KKK  	 	 	  SBJPQRRR   	! 	! 	!}jjj=========  SBJPQRRR'q   	/ 	/ 	/}jjj=========  	K;/66166AA    ;/333GH,/0 AAAAA :JJJJ%''Q.	/ }					 }q}				 sn   BB 
I, BI$EIAFI++I
G)(I
)
H3HI
H.I

II, ,Jc                     | j         dS | j        | j        | j        | j        | j        t          j        fd}t          j	         |                      | _         dS )zBBackground loop: pulls from EngineCore and pushes to AsyncStreams.Nc                    K   	 	 
                                  d {V } t          | j                  }r|rt                      nd }| j        }t	          d|	          D ]}}|	z   }|||         }                    || j        |          }|j        rJ ||k     rt          j	        d           d {V  |j
        r 
                    |j
                   d {V  ~                    | j                   r5                    | j        | j        |                                           +# t"          $ r:}t$                              d                               |           Y d }~d S d }~ww xY w)NTr   )
engine_idxscheduler_statsiteration_statsmm_cache_statszAsyncLLM output_handler failed.)get_output_asynclenoutputsr0   r   process_outputs	timestamprequest_outputsrz   sleepreqs_to_abortabort_requests_asyncupdate_scheduler_statsr  recordengine_indexstat_mm_cacherG   rk   	exceptionpropagate_error)r  num_outputsr  engine_core_outputsstartendoutputs_sliceprocessed_outputsr   
chunk_sizerv   rm   rN   rw   rr   s            r<   r~   z4AsyncLLM._run_output_handler.<locals>.output_handler  s
     04,$/$@$@$B$BBBBBBBG"%go"6"6K .7Q;Q(((T $ +2/'!&q+z!B!B  #j0(;E#I(F,<,L,L)7+<o- -) $5#DDDD ,,")-"2"22222222 -: "-"B"B 1 ?# #        %;;G<STTT
 & &--'.';,3,C,;+:+H+H+J+J	 .   O,Z  4 4 4  !BCCC 003333333334s   D,D1 1
E5;/E00E5)
r~   rv   rr   rN   rw   rm   envsVLLM_V1_OUTPUT_PROC_CHUNK_SIZErz   r   )rE   r~   r  rv   rm   rN   rw   rr   s     @@@@@@r<   r   zAsyncLLM._run_output_handlerv  s     *F &0N	,.8
1	4 1	4 1	4 1	4 1	4 1	4 1	4 1	4 1	4 1	4f &1..2B2BCCr;   r   c                 <  K   t          |t                    r|fnt          |          }| j                            ||          }| j                            |           d{V  | j        r0t          	                    dd
                    |                     dS dS )z2Abort RequestId in OutputProcessor and EngineCore.NzAborted request(s) %s.,)r   rD   r!   rr   abort_requestsrv   r  rR   rk   rl   join)rE   r   r   request_idsall_request_idss        r<   r   zAsyncLLM.abort  s       (
C88QZMMgj>Q>Q 	 />>{HUU33ODDDDDDDDD 	IKK0#((;2G2GHHHHH	I 	Ir;   )wait_for_inflight_requestsclear_cacher"  r#  c                (  K   | j         4 d{V  | j        r	 ddd          d{V  dS d| _        ddd          d{V  n# 1 d{V swxY w Y   |sJt          | j        j                                                  }|r|                     |d           d{V  | j                                        r| j                                         d{V  |r6| 	                                 d{V  | 
                                 d{V  dS dS )a4  
        Pause generation to allow model weight updates.

        New generation/encoding requests are blocked until resume.

        Args:
            wait_for_inflight_requests: When ``True`` waits for in-flight
                requests to finish before pausing. When ``False`` (default),
                immediately aborts any in-flight requests.
            clear_cache: Whether to clear KV cache and prefix cache after
                draining. Set to ``False`` to preserve cache for faster resume.
                Default is ``True`` (clear caches).
        NTr   )r|   r}   rh   rr   request_stateskeysr   has_unfinished_requestswait_for_requests_to_drainreset_prefix_cachereset_mm_cache)rE   r"  r#  r   s       r<   pause_generationzAsyncLLM.pause_generation  s     ( # 	  	  	  	  	  	  	  	 | 	  	  	  	  	  	  	  	  	  	  	  	  	  	   DL	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 
 * 	=t4CHHJJKKK =jjtj<<<<<<<<<  88:: 	E'BBDDDDDDDDD  	())+++++++++%%'''''''''''	( 	(s   	AA
AAc                    K   | j         4 d{V  d| _        | j                                          ddd          d{V  dS # 1 d{V swxY w Y   dS )z1Resume generation after :meth:`pause_generation`.NF)r|   r}   
notify_allr   s    r<   resume_generationzAsyncLLM.resume_generation  s       # 	* 	* 	* 	* 	* 	* 	* 	* DL'')))	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	* 	*s   !A
AAc                 z   K   | j         4 d{V  | j        cddd          d{V  S # 1 d{V swxY w Y   dS )z.Return whether the engine is currently paused.N)r|   r}   r   s    r<   	is_pausedzAsyncLLM.is_paused  s       # 	  	  	  	  	  	  	  	 <	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	  	 s   *
44pooling_paramsr   c	           	       K   d}		 |t          j        dt          d           |                     |||||||           d{V }	d}
|
sS|	                                p|	                                 d{V }t          |t                    sJ |j        }
|W V  |
Sn# t          j
        $ rH |	"|                     |	j        d           d{V  | j        rt                              d	|            t           $ r$ | j        rt                              d
|            t"          $ r$ | j        rt                              d|            t$          $ rZ}|	"|                     |	j        d           d{V  | j        rt                              d|           t'                      |d}~ww xY w|	|	                                 dS dS # |	|	                                 w w xY w)a  
        Main function called by the API server to kick off a request
            * 1) Making an AsyncStream corresponding to the Request.
            * 2) Processing the Input.
            * 3) Adding the Request to the EngineCore (separate process).

        A separate output_handler loop runs in a background AsyncIO task,
        pulling outputs from EngineCore and putting them into the
        per-request AsyncStream.

        The caller of generate() iterates the returned AsyncGenerator,
        returning the RequestOutput back to the caller.

        NOTE: truncate_prompt_tokens is deprecated in v0.14.
        TODO: Remove truncate_prompt_tokens in v0.15.
        NzThe `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` is deprecated and will be removed in v0.15. Please use `pooling_params.truncate_prompt_tokens` instead.   )
stacklevel)r   r   r   r   FTr   r   r   z Request %s failed (bad request).zRequest %s failed.)warningswarnDeprecationWarningr   r   r   r   r   r   rz   r   r   r   rR   rk   rl   r$   r   rG   r%   r   )rE   r3   r1  r   r   r   r   r   r   r   r   r   r   s                r<   encodezAsyncLLM.encode  s     8 ,0?	%1R '     &&)$7+! '        A H  llnn5aeegg!#';<<<<< <				   % 	 	 	}jjj=========  ?1:>>>  	 	 	  L>
KKK  	 	 	  L>
KKK  	/ 	/ 	/}jjj=========  >0*===%''Q.	/ }					 }q}				 s,   BB G B7F0AF++F00G G&c                     | j         j        S rA   )rm   rp   r   s    r<   rp   zAsyncLLM.tokenizerZ  s    #--r;   c                 4    | j                                         S rA   )rm   get_tokenizerr   s    r<   r;  zAsyncLLM.get_tokenizer^  s    #11333r;   c                     | j         j        S rA   )rm   rendererr   s    r<   r=  zAsyncLLM.renderera  s    #,,r;   c                 "   K   | j         j        d uS rA   )rg   rs   r   s    r<   is_tracing_enabledzAsyncLLM.is_tracing_enablede  s      (=TIIr;   c                 N   K   | j         r| j                                          d S d S rA   )rw   logr   s    r<   do_log_statszAsyncLLM.do_log_statsh  s6       	&##%%%%%	& 	&r;   c                 Z   K   t                               d           | j        r| j        d S )NzCalled check_health.)rk   debugr   
dead_errorr   s    r<   check_healthzAsyncLLM.check_healthl  s5      +,,,< 	"/!	" 	"r;   c                    K   | j                             d          g}| j        1|                    t	          j        | j        j                             t	          j        |  d {V  d S )NT)rv   profile_asyncr   appendrz   	to_threadr  gatherrE   coross     r<   start_profilezAsyncLLM.start_profileq  sn      !//556=$LL*4=+>??@@@ne$$$$$$$$$$r;   c                    K   | j                             d          g}| j        1|                    t	          j        | j        j                             t	          j        |  d {V  d S )NF)rv   rH  r   rI  rz   rJ  r   rK  rL  s     r<   stop_profilezAsyncLLM.stop_profilew  sn      !//667=$LL*4=+=>>???ne$$$$$$$$$$r;   c                 z   K   | j                                          | j                                         d {V  d S rA   )rm   clear_mm_cacherv   reset_mm_cache_asyncr   s    r<   r*  zAsyncLLM.reset_mm_cache}  sH      ++---3355555555555r;   reset_running_requestsreset_connectorc                 H   K   | j                             ||           d {V S rA   )rv   reset_prefix_cache_async)rE   rT  rU  s      r<   r)  zAsyncLLM.reset_prefix_cache  sF       %>>"O
 
 
 
 
 
 
 
 	
r;   levelc                    K   |                                   d {V  | j                            |           d {V  | j        | j                            d|           d S d S )NrK   )r)  rv   sleep_asyncrw   record_sleep_state)rE   rX  s     r<   r
  zAsyncLLM.sleep  s      %%'''''''''**5111111111*221e<<<<< +*r;   tagsc                    K   | j                             |           d {V  | j        | j                            dd           d S d S )Nr   )rv   wake_up_asyncrw   r[  )rE   r\  s     r<   wake_upzAsyncLLM.wake_up  s_      ,,T222222222*221a88888 +*r;   c                 D   K   | j                                          d {V S rA   )rv   is_sleeping_asyncr   s    r<   is_sleepingzAsyncLLM.is_sleeping  s-      %77999999999r;   c                 F   K   | j                             |           d{V S )z<Load a new LoRA adapter into the engine for future requests.N)rv   add_lora_async)rE   r   s     r<   add_lorazAsyncLLM.add_lora  s/      %44\BBBBBBBBBr;   lora_idc                 F   K   | j                             |           d{V S )z&Remove an already loaded LoRA adapter.N)rv   remove_lora_asyncrE   rf  s     r<   remove_lorazAsyncLLM.remove_lora  s/      %77@@@@@@@@@r;   c                 D   K   | j                                          d{V S )zList all registered adapters.N)rv   list_loras_asyncr   s    r<   
list_loraszAsyncLLM.list_loras  s-      %66888888888r;   c                 F   K   | j                             |           d{V S )z&Prevent an adapter from being evicted.N)rv   pin_lora_asyncri  s     r<   pin_lorazAsyncLLM.pin_lora  s/      %44W=========r;   r:   methodtimeoutargskwargsc                 L   K   | j                             ||||           d{V S )zB
        Perform a collective RPC call to the given path.
        N)rv   collective_rpc_async)rE   rq  rr  rs  rt  s        r<   collective_rpczAsyncLLM.collective_rpc  sJ       %::GT6
 
 
 
 
 
 
 
 	
r;   ,  drain_timeoutc                   K   t          j                     }t          j                     |z
  |k     r| j                                        st                              d           dS t                              d           t          j        d           d{V  t          j                     |z
  |k     t          d| d          )z$Wait for all requests to be drained.z,Engines are idle, requests have been drainedNz;Engines are still running, waiting for requests to drain...rK   zTimeout reached after z' seconds waiting for requests to drain.)timerv   dp_engines_runningrk   rl   rz   r
  TimeoutError)rE   ry  
start_times      r<   r(  z#AsyncLLM.wait_for_requests_to_drain  s      Y[[
ikkJ&66#6688 JKKKKKUVVV-""""""""" ikkJ&66 -] - - -
 
 	
r;   new_data_parallel_sizec                   K   | j         j        j        }||k    rt                              d|           dS t                              d|           |                     |           d{V  t                              d|           | j                            |           d{V  || j         j        _        ||k    r?| j        r:t          | j         t          t          |                    d          | _        dS dS dS )a  
        Scale up or down the data parallel size by adding or removing
        engine cores.
        Args:
            new_data_parallel_size: The new number of data parallel workers
            drain_timeout:
                Maximum time to wait for requests to drain (seconds)
        z0Data parallel size is already %s, skipping scaleNz@Waiting for requests to drain before scaling up to %s engines...z?Requests have been drained, proceeding with scale to %s engines)rL   r\   r]   )rL   parallel_configdata_parallel_sizerk   rl   r(  rv   scale_elastic_eprN   r-   rh   r   rw   )rE   r  ry  old_data_parallel_sizes       r<   r  zAsyncLLM.scale_elastic_ep  s?      "&!1!A!T!%;;;KKB&   FN"	
 	
 	
 --m<<<<<<<<<M"	
 	
 	
 //0FGGGGGGGGG>T(; "$:::t~:
 #4 , '=!>!>??$(# # #D ;:::r;   c                 H    | j         d u p| j                                          S rA   )r~   doner   s    r<   
is_runningzAsyncLLM.is_running  s)     "d*L$2E2J2J2L2L.LLr;   c                     | j         S rA   )r   r   s    r<   
is_stoppedzAsyncLLM.is_stopped  s
    |r;   c                 4    | j         j        j        p| j         S rA   )rv   	resourcesengine_deadr  r   s    r<   r   zAsyncLLM.errored  s    )5LT_9LLr;   c                     t                      S rA   )r$   r   s    r<   rE  zAsyncLLM.dead_error  s       r;   )NNNNr   NN)NNNNr   N)F)rY   N)NNr   NN)FF)rK   rA   )Nr:   N)rx  )Rr5   r6   r7   r   ENGINE_CONTEXTr   r
   typer+   rj   r   rh   r,   r   rD   intrC   classmethodr   r   r   r   r   tupler   r   r"   r   r   r2   r   r   floatr   r   r   r(   r   r)   r   r   staticmethodr   r   r   r   r   r   r+  r.  r0  r   r8  propertyr   rp   r;  r   r=  r?  rB  rF  rN  rP  r*  r)  r
  r_  rb  re  rj  setrm  rp  rw  r(  r  r  r  r   BaseExceptionrE  r:   r;   r<   rJ   rJ   P   s
        '3&A*=#(!"&7;).26B! B!B! XB! 	B!
 $B! (B! !B! B!  B! ,-4B! #'B! sCx.4/B! B! B! 
B! B! B! B!H  #'&2&A7;$))."'26
 

  
 $	

 ,-4
 "
 #'
  
 sCx.4/
 
 
 

 
 
 [
8  #'&2&A7;
 
$
  
 $	

 ,-4
 

 
 
 [
0  , , ,B51C+D B B B B &*+/5926)-"&r rr "J.PT@T1UUr .	r
 dlr "D(r "#s(^d2r sCx(4/r r  $Jr 4Zr 
 r r r rhA"A d
A "D(	A
 A &A A A A, &*+/5926)-P PP %^T%9:P (-7	P
 dlP "D(P "#s(^d2P sCx(4/P P  $JP 
 P P P Pd .   \4 #'+/5926)-h h h!J.PT@T1UUh (h 	h 4Zh "D(h "#s(^d2h sCx(4/h h  $Jh 
t+	,h h h hTBD BD BDJ AFI I-I9=I	I I I I" ,1 	%( %( %( %)%( 	%(
 
%( %( %( %(N* * * *          ,026-159\ \\ &\ 	\
 "D(\ sCx(4/\ \ !$d
\ "#s(^d2\ 
,d2	3\ \ \ \| .=4/ . . . X.4} 4 4 4 4 -, - - - X-J$ J J J J& & & &" " " "
% % % %% % % %6 6 6 6
 MR
 
&*
EI
	
 
 
 
= = =T = = = =9 9$s)d"2 9d 9 9 9 9:4 : : : :C; C4 C C C CA A A A A A9#c( 9 9 9 9>c >d > > > > !%"
 

 
 	

 t
 
 
 

 
c 
 
 
 
" AD( (&)(:=( ( ( (T MD M M M XM D    X M M M M XM !M ! ! ! X! ! !r;   rJ   )_rz   r   r   r{  r5  collections.abcr   r   r   r   dataclassesr   typingr   r_   	vllm.envsr  vllmr	   vllm.configr
   vllm.engine.arg_utilsr   vllm.engine.protocolr   vllm.entrypoints.utilsr   vllm.inputsr   vllm.loggerr   vllm.lora.requestr   vllm.multimodalr   r   vllm.outputsr   r   r   vllm.plugins.io_processorsr   vllm.pooling_paramsr   vllm.renderersr   vllm.sampling_paramsr   r   
vllm.tasksr   vllm.tokenizersr   vllm.tracingr   vllm.transformers_utils.configr   vllm.usage.usage_libr   vllm.utils.async_utilsr    vllm.utils.collection_utilsr!   vllm.v1.enginer"   vllm.v1.engine.core_clientr#   vllm.v1.engine.exceptionsr$   r%   vllm.v1.engine.input_processorr&   vllm.v1.engine.output_processorr'   r(    vllm.v1.engine.parallel_samplingr)   vllm.v1.engine.utilsr*   vllm.v1.executorr+   vllm.v1.metrics.loggersr,   r-   r.   vllm.v1.metrics.prometheusr/   vllm.v1.metrics.statsr0   r5   rk   r2   rG   r>   rJ   r:   r;   r<   <module>r     s    				    = = = = = = = = = =       ! ! ! ! ! !                    " " " " " " 1 1 1 1 1 1 - - - - - - < < < < < < " " " " " " # # # # # # ) ) ) ) ) ) C C C C C C C C M M M M M M M M M M 7 7 7 7 7 7 - - - - - - ' ' ' ' ' ' B B B B B B B B $ $ $ $ $ $ ) ) ) ) ) ) $ $ $ $ $ $ S S S S S S - - - - - - 9 9 9 9 9 9 / / / / / / , , , , , , 7 7 7 7 7 7 J J J J J J J J 9 9 9 9 9 9 S S S S S S S S : : : : : : 0 0 0 0 0 0 % % % % % %         
 ; : : : : : 0 0 0 0 0 0	X		 2 2 2 2 2 2 2 2	% 	% 	% 	% 	%y 	% 	% 	%o! o! o! o! o!| o! o! o! o! o!r;   