
    .`i=              	          d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d d	lmZ d d
lmZmZm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( erd dl)Z*d dl+m,Z, d dl-m.Z.m/Z/  ee0          Z1 ed          Z2 G d dee2         e          Z3 G d d          Z4d2de5de6de7de6fdZ8 G d d          Z9	 	 d3de9ded         dz  ded          ddfd!Z:d"e;e         fd#Z<d$ej=        d%ej=        d&e7dej=        fd'Z>ej?        fd(eddfd)Z@daAd*e6defd+ZBd,ej=        deCfd-ZDe
 G d. d/                      ZEd0e(deEfd1ZFdS )4    N)CallableSequence)AbstractContextManager)	dataclass)
connection)BaseProcess)TYPE_CHECKINGAnyGenericOptionalTypeVarUnionoverload)record_function)init_logger)UsageContextis_usage_stats_enabledusage_message)get_open_portget_open_zmq_ipc_pathget_tcp_uri)kill_process_tree)SchedulerOutput)DPCoordinatorCoreEngineActorManagerCoreEngineProcManagerTc            	          e Zd Zdee         ddfdZd Zd Zd Zd Z	d	 Z
d
 Zddedededz  defdZededefd            Zededee         fd            Zdeez  deee         z  fdZededefd            Zededefd            Zdeez  deee         z  fdZd Zd Zd Zd Zd Zdee         fdZdS )ConstantListxreturnNc                     || _         d S N_x)selfr!   s     a/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/v1/utils.py__init__zConstantList.__init__-   s        c                      t          d          )Nz Cannot append to a constant list	TypeErrorr'   items     r(   appendzConstantList.append0   s    :;;;r*   c                      t          d          )NzCannot extend a constant listr,   r.   s     r(   extendzConstantList.extend3   s    7888r*   c                      t          d          )Nz"Cannot insert into a constant listr,   r.   s     r(   insertzConstantList.insert6       <===r*   c                      t          d          )NzCannot pop from a constant listr,   r.   s     r(   popzConstantList.pop9   s    9:::r*   c                      t          d          )Nz"Cannot remove from a constant listr,   r.   s     r(   removezConstantList.remove<   r5   r*   c                      t          d          )NzCannot clear a constant listr,   r'   s    r(   clearzConstantList.clear?   s    6777r*   r   r/   startstopc                 f    | j                             ||||nt          | j                             S r$   )r&   indexlen)r'   r/   r=   r>   s       r(   r@   zConstantList.indexB   s+    w}}T5$2B$$DGUUUr*   c                     d S r$    r.   s     r(   __getitem__zConstantList.__getitem__E   s    +.3r*   sc                    d S r$   rC   )r'   rE   s     r(   rD   zConstantList.__getitem__H   s    363r*   c                     | j         |         S r$   r%   r.   s     r(   rD   zConstantList.__getitem__K   s    wt}r*   valuec                     d S r$   rC   r'   r/   rH   s      r(   __setitem__zConstantList.__setitem__N   s    03r*   c                    d S r$   rC   )r'   rE   rH   s      r(   rK   zConstantList.__setitem__Q   s    25#r*   c                      t          d          )Nz"Cannot set item in a constant listr,   rJ   s      r(   rK   zConstantList.__setitem__T   r5   r*   c                      t          d          )Nz'Cannot delete item from a constant listr,   r.   s     r(   __delitem__zConstantList.__delitem__W   s    ABBBr*   c                 *    t          | j                  S r$   )iterr&   r;   s    r(   __iter__zConstantList.__iter__Z   s    DG}}r*   c                     || j         v S r$   r%   r.   s     r(   __contains__zConstantList.__contains__]   s    twr*   c                 *    t          | j                  S r$   )rA   r&   r;   s    r(   __len__zConstantList.__len__`   s    47||r*   c                     d| j          dS )NzConstantList()r%   r;   s    r(   __repr__zConstantList.__repr__c   s    )tw))))r*   c                 4    | j                                         S r$   )r&   copyr;   s    r(   r[   zConstantList.copyf   s    w||~~r*   )r   N)__name__
__module____qualname__listr   r)   r0   r2   r4   r7   r9   r<   intr@   r   rD   slicerK   rO   rR   rT   rV   rY   r[   rC   r*   r(   r    r    ,   s'       $q' d    < < <9 9 9> > >; ; ;> > >8 8 8V V! VC V3: V V V V V ..... X.6U6$q'666 X6e DG     33A333 X35U51555 X5>e >AQK > > > >C C C      * * *d1g      r*   r    c                       e Zd ZdZdddeej        z  dej        dej        de	de	d	d
fdZ
dded
z  d	ej        fdZdded
z  d	ej        fdZd
S )CpuGpuBufferz2Buffer to easily copy tensors between CPU and GPU.T)
with_numpysizedtypedevice
pin_memoryrd   r"   Nc                    t          j        ||d|d| _        t          j        | j        |          | _        |  |r?|t           j        k    rt          d          | j                                        | _        d S d S )Ncpu)rf   rg   rh   )rg   zkBfloat16 torch tensors cannot be directly cast to a numpy array, so call CpuGpuBuffer with with_numpy=False)	torchzerosrj   
zeros_likegpubfloat16
ValueErrornumpynp)r'   rf   rg   rh   rd   re   s         r(   r)   zCpuGpuBuffer.__init__m   s     ;E%JWWW#DHV<<<  	'&& N   hnn&&DGGG	' 	'r*   nc                     |!| j                             | j        d          S | j         d |                             | j        d |         d          S )NTnon_blocking)rn   copy_rj   r'   rs   s     r(   copy_to_gpuzCpuGpuBuffer.copy_to_gpu   sN    98>>$(>>>>x|!!$(2A2,T!BBBr*   c                     |!| j                             | j        d          S | j         d|                             | j        d|         d          S )zzNOTE: Because this method is non-blocking, explicit synchronization
        is needed to ensure the data is copied to CPU.NTru   )rj   rw   rn   rx   s     r(   copy_to_cpuzCpuGpuBuffer.copy_to_cpu   sP     98>>$(>>>>x|!!$(2A2,T!BBBr*   r$   )r\   r]   r^   __doc__r`   rk   SymIntrf   rg   boolr)   Tensorry   r{   rC   r*   r(   rc   rc   j   s        <<  ' ' 'U\!' {' 	'
 ' ' 
' ' ' ',C CS4Z C5< C C C C
C CS4Z C5< C C C C C Cr*   rc   
local_onlyhostportr"   c                 ^    | rt                      nt          ||pt                                S )a  Assign a new ZMQ socket address.

    If local_only is True, participants are colocated and so a unique IPC
    address will be returned.

    Otherwise, the provided host and port will be used to construct a TCP
    address (port == 0 means assign an available port).)r   r   r   )r   r   r   s      r(   get_engine_client_zmq_addrr      s3     	:$ 788r*   c                   n    e Zd ZdZ	 ddedededej        de	de
e         d	e
e         d
edz  fdZddZdS )APIServerProcessManagerzManages a group of API server processes.

    Handles creation, monitoring, and termination of API server worker
    processes. Also monitors extra processes to check if they are healthy.
    Ntarget_server_fnlisten_addresssockargsnum_serversinput_addressesoutput_addressesstats_update_addressc	           	      
   || _         || _        || _        t          j        d          }	g | _        t          t          |          ||          D ]a\  }
}}||||
d}|||d<   |	                    |d|
 ||||f          }| j        	                    |           |
                                 bt                              dt          | j                             t          j        | t           | j                  | _        dS )a7  Initialize and start API server worker processes.

        Args:
            target_server_fn: Function to call for each API server process
            listen_address: Address to listen for client connections
            sock: Socket for client connections
            args: Command line arguments
            num_servers: Number of API server processes to start
            input_addresses: Input addresses for each API server
            output_addresses: Output addresses for each API server
            stats_update_address: Optional stats update address
        spawn)input_addressoutput_addressclient_countclient_indexNr   
ApiServer_)targetnamer   zStarted %d API server processes)r   r   r   multiprocessingget_context	processesziprangeProcessr0   r=   loggerinforA   weakreffinalizeshutdown
_finalizer)r'   r   r   r   r   r   r   r   r   spawn_contextiin_addrout_addrclient_configprocs                  r(   r)   z APIServerProcessManager.__init__   s(   . -		 (3G<<,.$'+1A%
 %
 	 	 Aw ")"* + !	 M $/8L45 (('%!%%$dD-@ )  D
 N!!$'''JJLLLL5s4>7J7JKKK "*44>JJr*   r"   c                 .    |                                   d S r$   )r   r;   s    r(   closezAPIServerProcessManager.close   s    r*   r$   )r"   N)r\   r]   r^   r|   r   strr
   argparse	Namespacer`   r_   r)   r   rC   r*   r(   r   r      s          ,07K 7K"7K 7K 	7K
  7K 7K c7K s)7K "Dj7K 7K 7K 7Kr     r*   r   api_server_managerengine_manager)r   r   coordinatorr   c           	      ~   ddl m}m} 	 t                              d           d | j        D             }|r|j        ||j        j        <   g }t          ||          r|j        D ]}|||j        <   n$t          ||          r|	                                }|s|rt          j        |d          }|D ]I}	|                    |	          }|j        dk    r't          d|j         d|j         d	|j                   J|rdd
l}
|
                    |d          \  }}||n`# t$          $ r t                              d           Y n:t&          $ r.}t                              dt+          |                      d
}~ww xY wt                              d           |                                  |r|                                 |r|                                 d
S d
S # t                              d           |                                  |r|                                 |r|                                 w w xY w)a  Wait for all processes to complete or detect if any fail.

    Raises an exception if any process exits with a non-zero status.

    Args:
        api_server_manager: The manager for API servers.
        engine_manager: The manager for engine processes.
            If CoreEngineProcManager, it manages local engines;
            if CoreEngineActorManager, it manages all engines.
        coordinator: The coordinator for data parallel.
    r   r   z'Waiting for API servers to complete ...c                     i | ]
}|j         |S rC   )sentinel).0r   s     r(   
<dictcomp>z2wait_for_completion_or_failure.<locals>.<dictcomp>   s)     4
 4
 4
$(DM44
 4
 4
r*      )timeoutzProcess z (PID: z) died with exit code Nz8Received KeyboardInterrupt, shutting down API servers...z0Exception occurred while running API servers: %sz#Terminating remaining processes ...)vllm.v1.engine.utilsr   r   r   r   r   r   r   
isinstanceget_run_refsr   waitr7   exitcodeRuntimeErrorr   pidrayKeyboardInterrupt	Exception	exceptionr   r   )r   r   r   r   r   sentinel_to_procactor_run_refsr   ready_sentinelsr   r   _es                r(   wait_for_completion_or_failurer      s    $ SRRRRRRR2#=>>>4
 4
,>,H4
 4
 4
  	K:E:J[-67n&;<< 	;&0 7 726 //7(>?? 	;+88::N  	H. 	H)39IST)U)U)UO ,  '++H55 =A%%&?49 ? ?TX ? ?/3}? ?   &  H


$'HH^QH$G$G!>%  	H. 	H(  P P PNOOOOO   KSQRVVTTT 	9:::  """ 	  	#  """""	# 	#	 	9:::  """ 	  	#  """"	#s7   DD!  G !$E>G 	E>)E99E>>G AH<procsc                    | D ]*}|                                 r|                                 +t          j                    dz   }| D ]I}|t          j                    z
  }|dk    r n*|                                 r|                    |           J| D ].}|                                 r|j        x}t          |           /d S )Nr   r   )is_alive	terminatetime	monotonicjoinr   r   )r   r   deadline	remainingr   s        r(   r   r   /  s      ==?? 	NN ~!#H ! !t~///	>>E==?? 	!IIi    # #==?? 	#tx<c"""# #r*   from_tensor	to_tensorlengthc                 P    |d|                              | d|         d          S )z
    Copy the first length elements of a tensor into another tensor in a
    non-blocking manner.

    Used to copy pinned CPU tensor data to pre-allocated GPU tensors.

    Returns the sliced target tensor.
    NTru   )rw   )r   r   r   s      r(   
copy_slicer   C  s/     WfW##K$8t#LLLr*   usage_contextc           	      H   t                      sdS ddlm} | j        }d}| j        | j        j        }t          j         || j                  |i dt          | j        j
                  d| j        j        d| j        j        d| j        j        d| j        j        d	t          | j        j                  d
t#          | j                  d| j        j        d| j        j        d|j        d|j        d|j        d|j        d|j        d|j        d|           dS )z#Report usage statistics if enabled.Nr   )get_architecture_class_namerf   
block_sizegpu_memory_utilizationkv_cache_memory_bytesquantizationkv_cache_dtypeenable_loraenable_prefix_cachingenforce_eagerdisable_custom_all_reducetensor_parallel_sizedata_parallel_sizepipeline_parallel_sizeenable_expert_parallelall2all_backendkv_connector)	extra_kvs)r    vllm.model_executor.model_loaderr   parallel_configkv_transfer_configr   r   report_usagemodel_configr   rf   cache_configr   r   r   r   cache_dtyper~   lora_configr   r   r   r   r   r   r   r   )vllm_configr   r   r   r   s        r(   report_usage_statsr   Q  s   
 "## LLLLLL!1O L%1"5B##K$<==
S1788
 +2=
 %k&>&U	

 $[%=%S
 K4A
 c+":"FGG
 4 788
 $[%=%S
 [5C
 ()R
 #O$H
  !/"D!
" %o&L#
$ %o&L%
( >)
, L-
     r*   r   c                     t           t          |           S t          j        }t          j        rt
          }nt          j        rdd l}|j        }|a  ||           S )Nr   )	_PROFILER_FUNC
contextlibnullcontextenvs VLLM_CUSTOM_SCOPES_FOR_PROFILINGr   VLLM_NVTX_SCOPES_FOR_PROFILINGnvtxannotate)r   funcr   s      r(   record_function_or_nullcontextr     s`     !d###!D, 		, }N4::r*   tensorc                     |                                                                                      t          j                                                  j        S )zGet the raw data of a tensor as a uint8 memoryview, useful for
    serializing and hashing.

    Args:
        tensor: The input tensor.

    Returns:
        A memoryview of the tensor data as uint8.
    )flatten
contiguousviewrk   uint8rq   data)r   s    r(   tensor_datar    s?     >>&&((--ek::@@BBGGr*   c                   D    e Zd ZU eed<   eed<   eed<   eed<   defdZdS )IterationDetailsnum_ctx_requestsnum_ctx_tokensnum_generation_requestsnum_generation_tokensr"   c           	      H    d| j          d| j         d| j         d| j         d	S )Nz"IterationDetails(num_ctx_requests=z!,                 num_ctx_tokens=z+,                  num_generation_requests=z),                  num_generation_tokens=rX   )r
  r  r  r  r;   s    r(   rY   zIterationDetails.__repr__  s]    FD4I F F!%!4F F*.*FF F )-(BF F F 	Fr*   N)r\   r]   r^   r`   __annotations__r   rY   rC   r*   r(   r	  r	    sh             F# F F F F F Fr*   r	  scheduler_outputc                     d}d}d}d}d | j         D             }| j                                        D ]8\  }}| j                            |          s||v r|dz  }||z  }.|dz  }||z  }9t          ||||          S )a  
    Compute the number of context/generation requests and tokens
    for the current iteration's scheduler output. A requests is regarded
    as a context request if its output tokens are still 0, an extended chunk
    of chunked prefill falls into this category.

    Args:
        scheduler_output: The scheduler output for the current iteration.

    Returns:
        An IterationDetails object containing the number of
        context/generation requests and tokens.
    r   c                     h | ]	}|j         
S rC   )req_id)r   new_reqs     r(   	<setcomp>z,compute_iteration_details.<locals>.<setcomp>  s    UUUg7>UUUr*      )scheduled_new_reqsnum_scheduled_tokensitemsscheduled_cached_reqsis_context_phaser	  )r  num_context_requestsnum_context_tokensr  r  new_req_idsr  
num_tokenss           r(   compute_iteration_detailsr     s     UU1A1TUUUK.CIIKK 0 0
1BB6JJ 	0k!! A% *,#q(#!Z/!!	  r*   )r   )NN)Gr   r   r   r   r   collections.abcr   r   r   dataclassesr   r   multiprocessing.processr   typingr	   r
   r   r   r   r   r   rk   torch.autograd.profilerr   	vllm.envsr   vllm.loggerr   vllm.usage.usage_libr   r   r   vllm.utils.network_utilsr   r   r   vllm.utils.system_utilsr   vllm.v1.core.sched.outputr   rq   rr   vllm.v1.engine.coordinatorr   r   r   r   r\   r   r   r    rc   r~   r   r`   r   r   r   r_   r   r   r   ENGINE_CONTEXTr   r   r   
memoryviewr  r	  r   rC   r*   r(   <module>r/     sk              . . . . . . . . - - - - - - ! ! ! ! ! ! & & & & & & / / / / / /                   3 3 3 3 3 3       # # # # # # T T T T T T T T T T V V V V V V V V V V 5 5 5 5 5 5 5 5 5 5 5 5 S888888RRRRRRRR	X		GCLL; ; ; ; ;71:x ; ; ;|#C #C #C #C #C #C #C #CL 4 s # c     A A A A A A A AN -1	F# F#/F#KL
F# /*	F#
 
F# F# F# F#V#D% # # # #(MM*/,M@CM
\M M M M 0</J, , ,,	, , , ,^  1G    &
H 
H 
H 
H 
H 
H 
F 
F 
F 
F 
F 
F 
F 
F! !DT ! ! ! ! ! !r*   