
    -`iE                     j   d Z ddlZddlZddlZddlZddlZddlmZ ddlZddl	Z	ddl
mZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z&  ee'          Z( G d dej)                  Z)dej*        fdZ+d Z,e'dk    r e,             dS dS )a"  
vLLM gRPC Server

Starts a gRPC server for vLLM using the VllmEngine protocol.

Usage:
    python -m vllm.entrypoints.grpc_server --model <model_path>

Example:
    python -m vllm.entrypoints.grpc_server         --model meta-llama/Llama-2-7b-hf         --host 0.0.0.0         --port 50051
    N)AsyncGenerator)
reflection)SamplingParams
TextPromptTokensPrompt)AsyncEngineArgs)log_version_and_model)vllm_engine_pb2vllm_engine_pb2_grpc)init_logger)RequestOutput)RequestOutputKindStructuredOutputsParams)UsageContext)FlexibleArgumentParser)AsyncLLM)__version__c                       e Zd ZdZdedefdZdej        de	j
        j        deej        df         fd	Zdej        de	j
        j        dej        fd
Zdej        de	j
        j        dej        fdZdej        de	j
        j        dej        fdZdej        de	j
        j        dej        fdZdej        de	j
        j        dej        fdZe	 ddej        de defd            Z!ede"dej        fd            Z#ede"dej        fd            Z$dS )VllmEngineServicera(  
    gRPC servicer implementing the VllmEngine service.

    Handles 6 RPCs:
    - Generate: Streaming text generation
    - Embed: Embeddings (TODO)
    - HealthCheck: Health probe
    - Abort: Cancel requests out-of-band
    - GetModelInfo: Model metadata
    - GetServerInfo: Server state
    	async_llm
start_timec                 V    || _         || _        t                              d           dS )z
        Initialize the servicer.

        Args:
            async_llm: The AsyncLLM instance
            start_time: The server start time, in seconds since epoch
        zVllmEngineServicer initializedN)r   r   loggerinfo)selfr   r   s      p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/entrypoints/grpc_server.py__init__zVllmEngineServicer.__init__;   s+     #$455555    requestcontextreturnNc                x  K   |j         }t                              d|           	 |                    d          dk    r7dt	          |j        j                  i}|j        j        r|j        j        |d<   n	d|j        i}| 	                    |j
        |j                  }| j                            |||          2 3 d{V }|j        r|                     |          W V  |j        r|                     |          W V  F6 dS # t"          $ rC}|                    t&          j        j        t-          |                     d{V  Y d}~dS d}~wt.          $ r^}t                              d	|           |                    t&          j        j        t-          |                     d{V  Y d}~dS d}~ww xY w)
z
        Handle streaming generation requests.

        Args:
            request: The GenerateRequest protobuf
            context: gRPC context

        Yields:
            GenerateResponse protobuf messages (streaming)
        zGenerate request %s received.input	tokenizedprompt_token_idsprompt)stream)r&   sampling_params
request_idNz Error in Generate for request %s)r)   r   debug
WhichOneoflistr$   	input_idsoriginal_texttext_sampling_params_from_protor(   r'   r   generate_chunk_responsefinished_complete_response
ValueErrorabortgrpc
StatusCodeINVALID_ARGUMENTstr	Exception	exceptionINTERNAL)r   r   r    r)   r&   r(   outputes           r   GeneratezVllmEngineServicer.GenerateG   s9      '
4jAAA#	B!!'**k99&W->-H(I(I( $2 G'.'8'FF8$&.%= #>>' ?  O !% 7 7 /% !8 ! ! : : : : : : :f > 7..v666666 ? :11&999999! ! !  	J 	J 	J-- @#a&&IIIIIIIIIIIIIII 	B 	B 	B?LLL-- 8#a&&AAAAAAAAAAAAAAA	Bs2   BD =DAD 
F98EF9AF44F9c                    K   t                               d           |                    t          j        j        d           d{V  dS )z
        Handle embedding requests.

        TODO: Implement in Phase 4

        Args:
            request: The EmbedRequest protobuf
            context: gRPC context

        Returns:
            EmbedResponse protobuf
        zEmbed RPC not yet implementedN)r   warningr6   r7   r8   UNIMPLEMENTED)r   r   r    s      r   EmbedzVllmEngineServicer.Embed~   sd      " 	6777mmO)+J
 
 	
 	
 	
 	
 	
 	
 	
 	
 	
r   c                    K   | j         j         }|rdnd}t                              d||           t	          j        ||          S )z
        Handle health check requests.

        Args:
            request: The HealthCheckRequest protobuf
            context: gRPC context

        Returns:
            HealthCheckResponse protobuf
        HealthzEngine is not alivez+HealthCheck request: healthy=%s, message=%s)healthymessage)r   erroredr   r*   r
   HealthCheckResponse)r   r   r    
is_healthyrH   s        r   HealthCheckzVllmEngineServicer.HealthCheck   sR       //
(C((.CBJPWXXX2:wWWWWr   c                    K   |j         }t                              d|           | j                            |           d{V  t          j                    S )z
        Out-of-band abort requests.

        Args:
            request: The AbortRequest protobuf
            context: gRPC context

        Returns:
            AbortResponse protobuf
        zAbort requests: %sN)request_idsr   r*   r   r6   r
   AbortResponse)r   r   r    rN   s       r   AbortzVllmEngineServicer.Abort   s\       ));777n"";/////////,...r   c                    K   | j         j        }t          j        |j        |j        dk    |j        |                                |j                  S )z
        Handle model info requests.

        Args:
            request: The GetModelInfoRequest protobuf
            context: gRPC context

        Returns:
            GetModelInfoResponse protobuf
        r1   )
model_pathis_generationmax_context_length
vocab_sizesupports_vision)	r   model_configr
   GetModelInfoResponsemodelrunner_typemax_model_lenget_vocab_sizeis_multimodal_model)r   r   r    rW   s       r   GetModelInfozVllmEngineServicer.GetModelInfo   sX       ~23#)&2j@+9#2244(<
 
 
 	
r   c                    K   | j         j                                        }t          j        |dt          j                    t          j                    | j        z
  d          S )z
        Handle server info requests.

        Args:
            request: The GetServerInfoRequest protobuf
            context: gRPC context

        Returns:
            GetServerInfoResponse protobuf
        Fz	vllm-grpc)active_requests	is_pausedlast_receive_timestampuptime_secondsserver_type)r   output_processorget_num_unfinished_requestsr
   GetServerInfoResponsetimer   )r   r   r    num_requestss       r   GetServerInfoz VllmEngineServicer.GetServerInfo   sZ       ~6RRTT4(#'9;;9;;8#
 
 
 	
r   Tparamsr'   c                 T   | j         rt          | j                   nd}| j        rt          | j                  nd}d}|                     d          }|r|dk    rt	          | j                  }n|dk    rt	          | j                  }n|dk    rt	          | j                  }ne|d	k    rt	          | j        
          }nI|dk    rt	          | j	                  }n-|dk    r't	          t          | j
        j                            }t          d+i d|                     d          r| j        ndd| j        dk    r| j        ndd| j        d| j        d| j        d| j        d| j        dk    r| j        ndd|                     d          r| j        ndd| j        d|d|d| j        d| j        d| j        d| j        d k    r| j        nd!d"|                     d"          r| j        ndd#|                     d#          r| j        ndd$|                     d$          r| j        ndd%| j        d&| j        rtA          | j                  ndd'|                     d'          r| j!        ndd(|d)tE          |          d*|rtF          j$        ntF          j%        S ),a  
        Convert protobuf SamplingParams to vLLM SamplingParams.

        Args:
            params: Protobuf SamplingParams message
            stream: Whether streaming is enabled

        Returns:
            vLLM SamplingParams with detokenize=False and structured_outputs
        N
constraintjson_schema)jsonregex)rp   grammar)rq   structural_tag)rr   json_object)rs   choice)rt   temperatureg      ?top_pg        top_kmin_pfrequency_penaltypresence_penaltyrepetition_penalty
max_tokens
min_tokensstopstop_token_idsskip_special_tokensspaces_between_special_tokens
ignore_eosnr      logprobsprompt_logprobsseedinclude_stop_str_in_output
logit_biastruncate_prompt_tokensstructured_outputs
detokenizeoutput_kind )&r~   r,   r   r+   r   rn   rp   rq   rr   rs   rt   choicesr   HasFieldru   rv   rw   rx   ry   rz   r{   r|   r}   r   r   r   r   r   r   r   r   r   dictr   boolr   DELTA
FINAL_ONLY)rk   r'   r~   r   r   constraint_fields         r   r0   z.VllmEngineServicer._sampling_params_from_proto   s    %+K9tFK   T8>8MWf3444SW "!,,\:: 	=00%<&BT%U%U%U""!W,,%<6<%P%P%P""!Y..%<V^%T%T%T""!%555%<#)#8& & &"" "]22%< & 2& & &"" "X--%< 566& & &"  "
 "
 "
.4oom.L.LU**RU"
"(,#"5"5&,,3"
 ,,"
 ,,	"

 %66"
 $44"
 (C//  &88"
 -3OOL,I,ISv((t"
 (("
 "
 *>"
 !' : :"
 +1*N*N"
  ((!"
" !(Q,,fhhA#"
$ )/
(C(CMV__%"
( 011F22+"
, !' 7 7AT-"
. (.'H'H/"
0 392CMtF-...1"
4 788$6#@#@7"
8  219"
< Dzzz="
@ .)//"-C"
 "	
r   r>   c           	      L   | j         r| j         d         nd}|+t          j        t          j        g ddd                    S t          j        t          j        |j        | j        rt          | j                  ndt          |j                  | j                            S )a5  
        Build a streaming chunk response from vLLM output.
        When output_kind=DELTA, vLLM returns only new tokens automatically.

        Args:
            output: vLLM RequestOutput (with delta tokens when output_kind=DELTA)

        Returns:
            GenerateResponse with chunk field set
        r   N)	token_idsprompt_tokenscompletion_tokenscached_tokens)chunk)outputsr
   GenerateResponseGenerateStreamChunkr   r%   lennum_cached_tokensr>   
completions     r   r2   z"VllmEngineServicer._chunk_responseB  s     +1.BV^A&&d
"3%9 "#&'"#	      /!5$.*c&"9:::"%j&:";";$6  	
 	
 	
 		
r   c           
      ^   | j         r| j         d         nd}|,t          j        t          j        g dddd                    S t          j        t          j        |j        |j        pd| j        rt          | j                  ndt          |j                  | j                            S )z
        Build a final completion response from vLLM output.

        Args:
            output: vLLM RequestOutput (finished=True)

        Returns:
            GenerateResponse with complete field set
        r   Nerror)
output_idsfinish_reasonr   r   r   )completer~   )	r   r
   r   GenerateCompleter   r   r%   r   r   r   s     r   r4   z%VllmEngineServicer._complete_responsej  s     +1.BV^A&&d
"3(9!")"#&'"#      /$5%/(6@&*c&"9:::"%j&:";";$6  

 

 

 
	
r   )T)%__name__
__module____qualname____doc__r   floatr   r
   GenerateRequestr7   aioServicerContextr   r   r@   EmbedRequestEmbedResponserD   HealthCheckRequestrJ   rL   AbortRequestrO   rP   GetModelInfoRequestrX   r^   GetServerInfoRequestrg   rj   staticmethodr   r   r0   r   r2   r4   r   r   r   r   r   .   sj       
 

6( 
6 
6 
6 
6 
65B 05B )5B 
8$>	?	5B 5B 5B 5Bn
 -
 )
 
	&	
 
 
 
,X 3X )X 
	,	X X X X,/ -/ )/ 
	&	/ / / /*
 4
 )
 
	-	
 
 
 
2
 5
 )
 
	.	
 
 
 
6 ?CL
 L
.L
8<L
	L
 L
 L
 \L
\ %
 %
/2R %
 %
 %
 \%
N '
= '
_5U '
 '
 '
 \'
 '
 '
r   r   argsc                   K   t          t          t          | j                   t                              d|            t          j                    }t          j        |           }|                    t          j
                  }t          j        |t          j
        | j        | j                  }t          ||          }t           j                            ddg          }t'          j        ||           t*          j        j        d         j        t2          j        f}t3          j        ||           | j         d| j         }|                    |           |                                 d	{V  t                              d
|           t                              d           tA          j!                    }	tA          j"                    fd}
tF          j$        tF          j%        fD ]}|	&                    ||
           	 '                                 d	{V  n*# tP          $ r t                              d           Y nw xY wt                              d           |)                    d           d	{V  t                              d           |*                                 t                              d           t                              d           d	S # t                              d           |)                    d           d	{V  t                              d           |*                                 t                              d           t                              d           w xY w)zW
    Main serving function.

    Args:
        args: Parsed command line arguments
    zvLLM gRPC server args: %s)usage_context)vllm_configr   enable_log_requestsdisable_log_stats)zgrpc.max_send_message_length)zgrpc.max_receive_message_lengthr   )options
VllmEngine:NzvLLM gRPC server started on %sz"Server is ready to accept requestsc                  d    t                               d                                             d S )NzReceived shutdown signal)r   r   set)
stop_events   r   signal_handlerz"serve_grpc.<locals>.signal_handler  s,    .///r   zInterrupted by userz!Shutting down vLLM gRPC server...g      @)gracezgRPC server stoppedzAsyncLLM engine stoppedzShutdown complete)+r	   r   VLLM_VERSIONrY   r   rh   r   from_cli_argscreate_engine_configr   OPENAI_API_SERVERr   from_vllm_configr   disable_log_stats_serverr   r7   r   serverr    add_VllmEngineServicer_to_serverr
   
DESCRIPTORservices_by_name	full_namer   SERVICE_NAMEenable_server_reflectionhostportadd_insecure_portstartasyncioget_running_loopEventsignalSIGTERMSIGINTadd_signal_handlerwaitKeyboardInterruptr~   shutdown)r   r   engine_argsr   r   servicerr   service_namesaddressloopr   sigr   s               @r   
serve_grpcr     s~      &,
;;;
KK+T222J "/55K 22"4 3  K
 )"4 47	  I ")Z88H X__03
   F 9(FKKK 	"3LAKM 'v>>> ((TY((G
W%%% ,,..
KK0':::
KK4555 #%%DJ     . 5 5^4444)oo + + +)*****+ 	7888 kkk$$$$$$$$$)*** 	-...'((((( 	7888 kkk$$$$$$$$$)*** 	-...'((((s+   H# "K' #$I
K' 	I

K' 'BNc                     t          d          } |                     dt          dd           |                     dt          dd	           |                     d
dd           t	          j        |           } |                                 }	 t          j        t          |                     dS # t          $ r:}t                              d|           t          j        d           Y d}~dS d}~ww xY w)zMain entry point.zvLLM gRPC Server)descriptionz--hostz0.0.0.0zHost to bind gRPC server to)typedefaulthelpz--porti  zPort to bind gRPC server toz--disable-log-stats-server
store_truez$Disable stats logging on server side)actionr   zServer failed: %sr   N)r   add_argumentr:   intr   add_cli_args
parse_argsuvlooprunr   r;   r   r<   sysexit)parserr   r?   s      r   mainr     s,   #&  F
 *	     *	     $3     )&11FD
:d##$$$$$   ,a000s   !B1 1
C5;/C00C5__main__)-r   argparser   r   r   rh   collections.abcr   r7   r   grpc_reflection.v1alphar   vllmr   r   r   vllm.engine.arg_utilsr   vllm.entrypoints.utilsr	   	vllm.grpcr
   r   vllm.loggerr   vllm.outputsr   vllm.sampling_paramsr   r   vllm.usage.usage_libr   vllm.utils.argparse_utilsr   vllm.v1.engine.async_llmr   vllm.versionr   r   r   r   r   	Namespacer   r   r   r   r   <module>r
     s  
     



  * * * * * *   . . . . . . 9 9 9 9 9 9 9 9 9 9 1 1 1 1 1 1 8 8 8 8 8 8 ; ; ; ; ; ; ; ; # # # # # # & & & & & & K K K K K K K K - - - - - - < < < < < < - - - - - - 4 4 4 4 4 4	X		d
 d
 d
 d
 d
-@ d
 d
 d
NU)8- U) U) U) U)p# # #L zDFFFFF r   