
    .`i)              	          U d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	 d dl
mZmZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZmZ d d	lm Z  d dl!m"Z" d d
l#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z?m@Z@ d dlAmBZBmCZC d dlDmEZE d dlFmGZGmHZHmIZI d dlJmKZKmLZLmMZMmNZN d dlOmPZPmQZQmRZRmSZS d dlTmUZUmVZVmWZWmXZX d dlYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_ d dl`maZambZbmcZc d dldmeZemfZf d dlgmhZhmiZimjZjmkZk d dllmmZmmnZn d dlompZp d dlqmrZrmsZs d d ltmuZumvZvmwZw d d!lxmyZy d d"lzm{Z{m|Z| d d#l}m~Z~ d d$lmZ d d%lmZmZmZ d d&lmZ d d'lmZmZ d d(lmZ d d)lmZmZ d d*lmZ d d+lmZmZ d d,lmZmZmZ d d-lmZ d d.lmZmZmZ d d/lmZ  G d0 d1e          Z eye          Ze3ejz  ehz  eQz  eLz  eZz  e]z  eWz  Zeed2<   e0eiz  ePz  eKz  eVz  Zeed3<   eGeIz  Zeed4<   eez  ez  eCz  eUz  eez  Zeed5<   e4e1z  eSz  eHz  ekz  eXz  eNz  e^z  efz  Zeed6<    ed7e8          Z ed9:           G d; d<                      Z ed9:           G d= d>                      Z ed9:           G d? d@eeee                               Z ed9:           G dA dBeeM                               Z ed9:           G dC dDeeR                               Z G dE dF          ZdGe|dz  dHe|dz  fdIZdS )J    N)AsyncGeneratorCallableIterableMapping)	dataclassfield)
HTTPStatus)AnyClassVarGeneric	TypeAliasTypeVarcast)Request)ToolChoiceFunction)
ConfigDictTypeAdapter)Headers)BeamSearchSequencecreate_sort_beams_key_function)EngineClient)ChatCompletionMessageParamChatTemplateContentFormatOptionConversationMessage)RequestLogger)"ChatCompletionNamedToolChoiceParamChatCompletionRequestChatCompletionResponse)CompletionRequestCompletionResponse)	ErrorInfoErrorResponseFunctionCallFunctionDefinition)OpenAIServingModels)ConversationContextHarmonyContextParsableContextStreamingHarmonyContext)ResponseInputOutputItemResponsesRequest)construct_input_messages)TranscriptionRequestTranscriptionResponseTranslationRequest)ClassificationChatRequestClassificationCompletionRequestClassificationRequestClassificationResponse)EmbeddingChatRequestEmbeddingCompletionRequestEmbeddingRequestEmbeddingResponse)IOProcessorRequestPoolingChatRequestPoolingCompletionRequestPoolingResponse)RerankRequestScoreDataRequestScoreQueriesDocumentsRequestScoreRequestScoreResponseScoreTextRequest)BaseRendererCompletionRendererRenderConfig)GenerateRequestGenerateResponse)DetokenizeRequestTokenizeChatRequestTokenizeCompletionRequestTokenizeResponse)_validate_truncation_sizesanitize_messageVLLMValidationError)
PromptTypeTokensPrompt)PromptComponentsget_prompt_components"is_explicit_encoder_decoder_prompt)init_logger)LogprobPromptLogprobs)LoRARequest)MultiModalDataDict)CompletionOutputPoolingRequestOutputRequestOutput)PoolingParams)ReasoningParserReasoningParserManager)RendererLike)BeamSearchParamsSamplingParams)TokenizerLike)
ToolParserToolParserManager)contains_trace_headersextract_trace_headerslog_tracing_disabled_warning)random_uuid)AsyncMicrobatchTokenizercollect_from_async_generatormerge_async_iterators)EngineCoreRequestc                   *     e Zd ZdZddef fdZ xZS )GenerationErrorz?raised when finish_reason indicates internal server error (500)Internal server errormessagec                 l    t                                          |           t          j        | _        d S N)super__init__r	   INTERNAL_SERVER_ERRORstatus_code)selfrp   	__class__s     z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/entrypoints/openai/engine/serving.pyrt   zGenerationError.__init__   s,    !!!%;    )ro   )__name__
__module____qualname____doc__strrt   __classcell__rx   s   @ry   rn   rn      sO        II< < < < < < < < < < < <rz   rn   CompletionLikeRequestChatLikeRequestSpeechToTextRequest
AnyRequestAnyResponseRequestT)boundT)kw_onlyc                   H    e Zd ZU dZ ee          Zee         dz  ed<   dS )RequestProcessingMixinzY
    Mixin for request processing,
    handling prompt preparation and engine input.
    default_factoryNengine_prompts)	r{   r|   r}   r~   r   listr   rP   __annotations__ rz   ry   r   r      sE          
 16d0K0K0KND&-KKKKKrz   r   c                       e Zd ZU dZdZeeeee	z  f         df         dz  e
d<    ee          Zeee	z           e
d<    ed          ZdS )ResponseGenerationMixinz`
    Mixin for response generation,
    managing result generators and final batch results.
    Nresult_generatorr   final_res_batchT)arbitrary_types_allowed)r{   r|   r}   r~   r   r   tupleintr[   rZ   r   r   r   r   r   model_configr   rz   ry   r   r      s           	 uS-2F"FFGMNQUU   CH%C C COT-*>>?    :d;;;LLLrz   r   c                   z    e Zd ZU eed<   dZedz  ed<   eed<   eed<    ed           Z	e
ed<   dZedz  ed	<   dS )
ServeContextrequestNraw_request
model_name
request_idc                  B    t          t          j                              S rr   )r   timer   rz   ry   <lambda>zServeContext.<lambda>   s    c$)++6F6F rz   r   created_timelora_request)r{   r|   r}   r   r   r   r   r   r   r   r   r   rW   r   rz   ry   r   r      sz         "&K4&&&OOOOOO.F.FGGGL#GGG'+L+$+++++rz   r   c                       e Zd ZdS )ClassificationServeContextN)r{   r|   r}   r   rz   ry   r   r      s        Drz   r   c                   .    e Zd ZU dZedz  ed<   eed<   dS )EmbeddingServeContextNchat_templatechat_template_content_format)r{   r|   r}   r   r   r   r   r   rz   ry   r   r      s2          $M3:$$$"AAAAAArz   r   c                    N    e Zd ZU dZee         ed<   ddddedede	dz  d	e
d
e
f
 fdZ	 dqdedz  de
deegef         dz  fdZdedeegef         dz  fdZdrdZ	 	 dsdededededz  deeef         dz  deedf         fdZdefdZdedefdZdefdZ de!de"dz  fdZ#de!de$e"z  fdZ%de!de$e"z  fd Z&de!dee$e"z  df         fd!Z'de!de"dz  fd"Z(de!de)e"z  fd#Z*de!de"dz  fd$Z+de!de"dz  fd%Z,d&e-j.        dfd'ee/z  d(ed)e-d*edz  de"f
d+Z0d&e-j.        dfd'ee/z  d(ed)e-d*edz  def
d,Z1d-edz  deddfd.Z2d/e3de"fd0Z4d/e3defd1Z5de6de"dz  fd2Z7de6dedz  fd3Z8	 dtde6d4e
dedz  fd5Z9de6de:e         fd6Z;de6ded7ed8e
de<f
d9Z=de6d:e>e?         d7edz  de<fd;Z@de6d<e>e?         d=ede<fd>ZA	 dude6d7ed@ee>e?         z  d8e
de<f
dAZB	 dude6d7edBeCee>e?         z           d8e
dee<df         f
dCZDdDedz  dEeEeef         dz  dFe
de"dz  fdGZFeG	 	 dsdHeEeef         dz  dIeEeef         dz  deEeef         fdJ            ZH	 	 	 	 	 	 	 	 dvdeIeJz  dKeKdLe>eL         dMedz  dNeMdOe
dPe
dQe>eEeef                  dz  dRe>eEeef                  dz  dEeEeef         dz  dIeEeef         dz  dSeegef         dz  d8e
deNe>eO         e>e<         f         fdTZPddUdedVedeQe)z  dedz  deeef         dz  dWe?dXe?dz  deNeReEeef         f         fdYZSdeJdKeKdLe>eT         dQe>eEeef                  dz  dMedz  dNeMfdZZU	 	 dwdedVe<d\eQd]eVdedz  dWe?fd^ZWdedeXfd_ZYded`edeQe)z  ez  dz  dedz  ddf
daZZdbe[deeef         dz  fdcZ\eG	 dxdde]dz  deedz  dedz  fdf            Z^eGdde]dz  de?dz  fdg            Z_eG	 dxdeJe`z  d7edz  de
dheegef         dz  diedz  deNe>ea         dz  edz  f         fdj            ZbeG	 dtdkecdle?d7edz  dme
def
dn            Zddoedz  de
fdpZe xZfS )yOpenAIServingu   
    A short string prepended to every request’s ID (e.g. "embd", "classify")
    so you can easily tell “this ID came from Embedding vs Classification.”
    request_id_prefixF)return_tokens_as_token_idslog_error_stackengine_clientmodelsrequest_loggerNr   r   c                F   t                                                       || _        || _        || _        || _        i | _        || _        | j        j        | _        | j        j	        | _	        | j        j
        | _
        | j        j        | _        | j        j        | _        d S rr   )rs   rt   r   r   r   r   _async_tokenizer_poolr   input_processorio_processorrendererr   max_model_len)rw   r   r   r   r   r   rx   s         ry   rt   zOpenAIServing.__init__   s     	*,*D'TV".#{: K4, K4!.<rz   tool_parser_nameenable_auto_toolsreturnc                 @   d}|r||S t                               d           	 |dk    r9| j        j                            d          rt                               d           t          j        |          }n&# t          $ r}t          d| d          |d}~ww xY w|S )z&Get the tool parser based on the name.Nz$"auto" tool choice has been enabled.pythoniczmeta-llama/Llama-3.2z>Llama3.2 models may struggle to emit valid pythonic tool callsz7Error: --enable-auto-tool-choice requires tool_parser:'z' which has not been registered)
loggerinfor   model
startswithwarningrd   get_tool_parser	Exception	TypeError)rw   r   r   parseres        ry   _get_tool_parserzOpenAIServing._get_tool_parser
  s       	$4$<M:;;;	:--$2C2I2T2T&3 3- T   '67GHHFF 	 	 	" 0" " "  		 s   AA8 8
BBBreasoning_parser_namec                     d}|sdS 	 t          j        |          }|J n&# t          $ r}t          d|d          |d}~ww xY w|S )z+Get the reasoning parser based on the name.Nzreasoning_parser_name=z has not been registered)r^   get_reasoning_parserr   r   )rw   r   r   r   s       ry   _get_reasoning_parserz#OpenAIServing._get_reasoning_parser#  s    
 $ 	4	X+@AVWWF%%%% 	X 	X 	XO4OOOPPVWW	Xs   ! 
A?Ac                 z   K   | j                                          | j                                         d {V  d S rr   )r   clear_mm_cacher   reset_mm_cacherw   s    ry   r   zOpenAIServing.reset_mm_cache2  sH      ++--- //11111111111rz   promptr   paramsr   trace_headersc                  12K   |j         }|j        }|j        }|j        }	|j        }
|j        }| j        }|j        }|t          ddd          |j	        }t          |          rt          t          |t                    r|}g }d }n@|                    d          }|                    dg           }|                    d          }d }t          |          2t!          ||
          }d|z  }t#          |d	|	
          }t%          |dg |||          g}g }t'          |          D ]}t)          d |D              \  }}g }| dt+                       }t-          t)          ||                    D ]b\  }\  }}| d| } t/          j        t3          | j                            ||| ||                              }!|                    |!           cd t/          j        |  d {V D             }"g }#g }$g }%t-          |"          D ]\  }}&||         1|&j        d         j        dk    r0tA          ||tC          ddg d d d          gd|d           W V    d S |&j        d         j"        |&j        d         j"        d         }'|$#                    tI          |'%                                                     |%#                    1fd|'&                                D                        tO          j(        |$          }$tO          j(        |%          }%|stO          j)        |$|k              d         }(|(D ]})||)|z           1|"|)|z           }&|&j        d         j"        J |&j        d         j"        d         }*|                    t%          |r1j*        |gz   n1j*        1j"        |*gz   tW          |%|)                   d|                     tN          j,         |%|(<   tO          j-        tO          j.        |%          |          d |         }+|+D ]})||)|z           1|"|)|z           }&t_          |$|)                   },|&j        d         j"        J |&j        d         j"        d         }*|#                    t%          1j*        |,gz   1j"        |*gz   1j0        tW          |%|)                   1j1        1j2                             |#}|#                    |           tg          ||d          }-|-d |         }.|.D ]N}/|/j*        d         |k    r|s|/j*        2d         }0n|/j*        2d          }0|4                    |0          |/_5        OtA          ||2fdt-          |.          D             d|d           W V  d S )Nz:You cannot use beam search when `skip_tokenizer_init=True`skip_tokenizer_initT	parametervaluer   prompt_token_idsmulti_modal_data      )logprobs
max_tokenstemperaturer   )tokenscum_logprobr   r   mm_processor_kwargsr   c                 ^    g | ]*}t          |j        |j        |j                   |j        f+S ))r   r   r   )rP   r   r   r   r   ).0beams     ry   
<listcomp>z-OpenAIServing.beam_search.<locals>.<listcomp>  sW     
 
 
  %-1[-1-B040H  
 )
 
 
rz   -z-beam-)r   r   c                     g | ]
}|d          S )r   r   )r   xs     ry   r   z-OpenAIServing.beam_search.<locals>.<listcomp>  s    AAAqadAAArz   error )indextext	token_idscumulative_logprobr   finish_reason)r   r   outputsfinishedr   prompt_logprobsc                 .    g | ]}j         |j        z   S r   )r   logprob)r   objcurrent_beams     ry   r   z-OpenAIServing.beam_search.<locals>.<listcomp>  s3        # )4s{B  rz   stop)r   r   r   r   stop_reason)r   r   r   r   r   r   )keyreversec                     g | ]J\  }}t          |j        |j        |j        d          ||j        |j        |j        nd|j                  KS )Nlength)r   r   r   r   r   r   r   )rY   r   r   r   r   r   r   )r   ir   tokenized_lengths      ry   r   z-OpenAIServing.beam_search.<locals>.<listcomp>  s        Q !'+'7"k*:*;*;<!])5 #'"4"4! $ 0
 
 
  rz   )6
beam_widthr   
ignore_eosr   length_penaltyinclude_stop_str_in_outputr   	tokenizerrN   eos_token_idrS   NotImplementedError
isinstancer   getlenr   ra   r   rangeziprh   	enumerateasynciocreate_taskrj   r   generateappendgatherr   r   r[   rY   r   extendr   keysvaluesnparraywherer   floatinfargpartitionnegativer   r   r   r   sorteddecoder   )3rw   r   r   r   r   r   r  r   r  r   r  r  r   r  r  prompt_textr   r   r   sort_beams_keylogprobs_numbeam_search_params	all_beams	completed_prompts_batchlora_req_batchtasksrequest_id_batchr   individual_promptlora_reqrequest_id_itemtaskoutput	new_beamsall_beams_token_idall_beams_logprobresultr   eos_idxidxlogprobs_entrytopn_idxtoken_idsorted_completed
best_beamsr   r   r   r   s3                                                    @@ry   beam_searchzOpenAIServing.beam_search6  s*      &
&
&
(.%+%F".#-	%L/    &2-f55 	&%%
 fc"" 	> K!# **X..K%zz*<bAA%zz*<==59 /007nUU:~+!#
 
 
 '!1$7)  	
	 	z"" A	" A	"A,/
 
 !*
 
 
-)M> E",>>{}}>>4=M>225 5 # #00%x &6"@"@Q"@"@*0*33-.+)1*7 4   
 
 T""""AA'.%*@$@$@$@$@$@$@AAAFI!# !#&v.. ! !	6(| >!$2g=='#-*,&'%'*,37)-.5  	! "&)9(,      " FFF>!$-9%~a09!<H&--d8==??.C.CDDD%,,   '/'8'8     "$*<!=!= "): ; ; 5(#5#EFFqI"  C#,SL-@#AL#C<$78F!>!,5AAA%+^A%6%?%BN$$*9$5<#6,#G#G!-!4%1%:n=M%M(-.?.D(E(E*0(4  
 
 
 
 /1fW!'* r{3D'E'EzRRH    ()<=| 341#677~a(1===!'!2!;A!>  &+2hZ?!-!6.9I!I%1%>$)*;C*@$A$A)5)F,8,L  	 	 	 	 "II###!)NNN%kzk2
 	1 	1D{2,..z.%5b%89%5%6%67!((00DII!    "+:!6!6   - '
 
 
 	
 	
 	
 	
 	
 	
rz   c                 N    t          | j        | j        j        | j                  S )z
        Get a Renderer instance with the provided tokenizer.
        Uses shared async tokenizer pool for efficiency.
        )r   r  async_tokenizer_pool)rC   r   r   r  r   r   s    ry   _get_completion_rendererz&OpenAIServing._get_completion_renderer%  s/    
 "*m-!%!;
 
 
 	
rz   r   c                     t           )a  
        Build and return a `RenderConfig` for an endpoint.

        Used by the renderer to control how prompts are prepared
        (e.g., tokenization and length handling). Endpoints should
        implement this with logic appropriate to their request type.
        )r  )rw   r   s     ry   _build_render_configz"OpenAIServing._build_render_config0  s
     "!rz   c                 p    | j                             |          }|t          |          }|| j         |<   |S )zh
        Return (and cache) an `AsyncMicrobatchTokenizer` bound to the
        given tokenizer.
        )r   r	  ri   )rw   r  async_tokenizers      ry   _get_async_tokenizerz"OpenAIServing._get_async_tokenizer=  s@    
 488CC"6yAAO4CD&y1rz   ctxc                 
   K   dS )z
        Default preprocessing hook. Subclasses may override
        to prepare `ctx` (classification, embedding, etc.).
        Nr   rw   rC  s     ry   _preprocesszOpenAIServing._preprocessH  s       trz   c                 ,    |                      d          S )z
        Default response builder. Subclass may override this method
        to return the appropriate response object.
        zunimplemented endpoint)create_error_responserE  s     ry   _build_responsezOpenAIServing._build_responseR  s     ))*BCCCrz   c                 v   K   |                      |          }|2 3 d {V }|c S 6 |                     d          S )Nz!No response yielded from pipeline)	_pipelinerH  )rw   rC  
generationresponses       ry   handlezOpenAIServing.handle\  sf      
 ^^C((
( 	 	 	 	 	 	 	(OOO ) ))*MNNNs   %c                  K   |                      |j                   d{V x}r|W V  |                     |          x}r|W V  |                     |           d{V }t	          |t
                    r|W V  |                     |           d{V }t	          |t
                    r|W V  |                     |           d{V }t	          |t
                    r|W V  |                     |          W V  dS )z;Execute the request processing pipeline yielding responses.N)	_check_modelr   _validate_requestrF  r  r"   _prepare_generators_collect_batchrI  )rw   rC  r   preprocess_retgenerators_retcollect_rets         ry   rK  zOpenAIServing._pipelineh  s]     
 ++CK8888888885 	KKKK**3///5 	KKKK#//44444444nm44 	!     #77<<<<<<<<nm44 	!      //44444444k=11 	""3''''''''rz   c                 v    t          |j        dd           }| || j        k    r|                     d          S d S )Ntruncate_prompt_tokenszetruncate_prompt_tokens value is greater than max_model_len. Please, select a smaller truncation size.)getattrr   r   rH  )rw   rC  rX  s      ry   rQ  zOpenAIServing._validate_request  sR    !(6NPT!U!U #.&);;;--=  
 trz   c                     t          |j        d          s|                     d          S |j                                        S )Nto_pooling_paramsz0Request type does not support pooling parameters)hasattrr   rH  r[  rE  s     ry   _create_pooling_paramsz$OpenAIServing._create_pooling_params  sJ     s{$788 	--B   {,,...rz   c                   K   g }	 |j         dn$|                     |j         j                   d{V }|                     |          }t	          |t
                    r|S |j        |                     d          S t          |j                  D ]}\  }}|j	         d| }| 
                    ||||j                   | j                            ||||j        |t          |j        dd                    }|                    |           ~t#          | |_        dS # t&          $ r}	|                     |	          cY d}	~	S d}	~	ww xY w)z2Schedule the request and get the result generator.NEngine prompts not availabler   r   r   priorityr   r   r   ra  )r   _get_trace_headersheadersr]  r  r"   r   rH  r  r   _log_inputsr   r   encoderY  r   r  rk   r   r   )
rw   rC  
generatorsr   pooling_paramsr   engine_promptr,  	generatorr   s
             ry   rR  z!OpenAIServing._prepare_generators  s       	(	1 ?* 223?3JKKKKKKKK  "88==N.-88 &%%!)112PQQQ$-c.@$A$A - - =%(^"9"9a"9"9  #!)!$!1	 !    !.55!"#!$!1"/$S[*a@@ 6  	 !!),,,,#8*#EC 4 	1 	1 	1--a00000000	1s+   AD  D <B D 
E(E<EEc                   K   	 |j         |                     d          S t          |j                   }dg|z  }|j        |                     d          S |j        2 3 d{V \  }}|||<   6 d|v r|                     d          S d |D             |_        dS # t
          $ r}|                     |          cY d}~S d}~ww xY w)z0Collect batch results from the result generator.Nr_  zResult generator not availablez*Failed to generate results for all promptsc                     g | ]}||S rr   r   )r   ress     ry   r   z0OpenAIServing._collect_batch.<locals>.<listcomp>  s    "U"U"U3S_3___rz   )r   rH  r
  r   r   r   )rw   rC  num_promptsr   r   rm  r   s          ry   rS  zOpenAIServing._collect_batch  s,     
	1!)112PQQQc011K#f{2O#+112RSSS # 4 ) ) ) ) ) ) )fa%("" !5 &&11@   #V"U/"U"U"UC4 	1 	1 	1--a00000000	1s:   B 5B B A-##B B 
C$B>8C>CBadRequestErrorrp   err_typerv   paramc                    d }t          |t                    r|}ddlm} t          ||          rd}t          j        }|j        }nt          |t          t          t          t          f          rd}t          j        }d }nWt          |t                    rd}t          j        }d }n1|j        j        dk    rd}t          j        }d }nd}t          j        }d }t!          |          }| j        r@t%          j                    \  }}}|t)          j                     nt)          j                     t/          t1          t3          |          ||j        |                    S )	Nr   rM   ro  r  TemplateErrorInternalServerError)rp   typecoderq  )r   )r  r   vllm.exceptionsrN   r	   BAD_REQUESTr   
ValueErrorr   RuntimeErrorOverflowErrorr  NOT_IMPLEMENTEDrx   r{   ru   r   r   sysexc_info	traceback	print_excprint_stackr"   r!   rL   r   )	rw   rp   rp  rv   rq  excrN   exc_typer%  s	            ry   rH  z#OpenAIServing.create_error_response  su    !%gy)) 	C;;;;;;#233 ,(4C*i}!UVV ,(4C!455 0(8'?::,(40(>#hhG 	( \^^NHa##%%%%%'''(11 &	  
 
 
 	
rz   c                     t          j        |                     ||||                                                    }|S )Nrp   rp  rv   rq  )jsondumpsrH  
model_dump)rw   rp   rp  rv   rq  json_strs         ry   create_streaming_error_responsez-OpenAIServing.create_streaming_error_response"  sK     :&&!'	 '  
 jll
 
 rz   r   c                 f    |dk    r*t                               d|           t          d          dS )z:Raise GenerationError if finish_reason indicates an error.r   z:Request %s failed with an internal error during generationro   N)r   r   rn   )rw   r   r   s      ry   _raise_if_errorzOpenAIServing._raise_if_error3  sB    G##LLL   ""9::: $#rz   r   c                 V    |                      t          |          d|j                  S )z)Convert GenerationError to ErrorResponse.rt  rp  rv   )rH  r   rv   rw   r   s     ry   %_convert_generation_error_to_responsez3OpenAIServing._convert_generation_error_to_response<  s2     ))FF* * 
 
 	
rz   c                 V    |                      t          |          d|j                  S )z4Convert GenerationError to streaming error response.rt  r  )r  r   rv   r  s     ry   /_convert_generation_error_to_streaming_responsez=OpenAIServing._convert_generation_error_to_streaming_responseF  s2     33FF* 4 
 
 	
rz   c                   K   d }|                      |j                  rd S |j        | j        j        v rd S t          j        r{|j        rt| j                            |j                   d {V x}rMt          |t                    rd S t          |t                    r!|j
        j        t          j        j        k    r|}|p+|                     d|j         ddt          j        d          S )NThe model `` does not exist.NotFoundErrorr   r  )_is_model_supportedr   r   lora_requestsenvs VLLM_ALLOW_RUNTIME_LORA_UPDATINGresolve_lorar  rW   r"   r   rv  r	   rx  r   rH  	NOT_FOUND)rw   r   error_responseload_results       ry   rP  zOpenAIServing._check_modelP  s      ##GM22 	4=DK55541	-	- '+k&>&>w}&M&M M M M M M MM	-
 +{33 t;66-%*j.D.JJJ!, 
!;!;B'-BBB$",	 "< "
 "
 	
rz   c                    |                      |          }t                      }| j        j                                        D ] }|j        |v r|                    |           !t          |          dk    r|                                S dS )z;Determine if there are any active default multimodal loras.r   N)	_get_message_typessetr   r  r  	lora_nameaddr
  pop)rw   r   message_typesdefault_mm_lorasloras        ry   _get_active_default_mm_lorasz*OpenAIServing._get_active_default_mm_lorasn  s     //8855K-4466 	+ 	+D
 ~.. $$T***   A%%#'')))trz   supports_default_mm_lorasc                     |j         | j        j        v r| j        j        |j                  S |r|                     |          }||S |                     |j                   rd S t          d|j          d          )Nr  r  )r   r   r  r  r  ry  )rw   r   r  default_mm_loras       ry   _maybe_get_adaptersz!OpenAIServing._maybe_get_adapters  s    
 =DK555;,W];; % 	'"??HHO*&&##GM22 	4 Gw}GGGHHHrz   c                    t                      }t          |d          s|S |j        }|t          |t          t
          f          r|S |D ]y}t          |t                    rbd|v r^t          |d         t                    rC|d         D ]:}d|v r4|                    |d         	                    d          d                    ;z|S )zRetrieve the set of types from message content dicts up
        until `_`; we use this to match potential multimodal data
        with default per modality loras.
        messagesNcontentru  r%  r   )
r  r\  r  r  r   bytesdictr   r  split)rw   r   r  r  rp   content_dicts         ry   r  z OpenAIServing._get_message_types  s    
 #&%%w
++ 	!  #z(S%LAA   	N 	NG7D))N((wy1488 ) %,I$6 N NL--%)),v*>*D*DS*I*I!*LMMMrz   r  add_special_tokensc                   K   |                      |          }| j        j        4| j        j                            dd          r|                                }t          |dd           }| |||           d {V }n6|dk     r |||d| j                   d {V }n |||d|           d {V }|j        }|}	|                     |||	          S )Ndo_lower_caseFrX  r  r   T)r  
truncation
max_length)	rB  r   encoder_configr	  lowerrY  r   	input_ids_validate_input)
rw   r   r   r  r  rA  rX  encodedr  
input_texts
             ry   _normalize_prompt_text_to_inputz-OpenAIServing._normalize_prompt_text_to_input  sf      33I>> ,8!044_eLL 9 \\^^F!(2JD!Q!Q!)+O+=        GG $a''+O#5-	        GG ,O#51	        G %	
##GY
CCCrz   
prompt_idsc                   K   t          |dd           }||}n"|dk     r|| j         d          }n|| d          }|d}n0|                     |          }|                    |           d {V }|                     |||          S )NrX  r   r   )rY  r   rB  r  r  )rw   r   r  r  rX  r  r  rA  s           ry   !_normalize_prompt_tokens_to_inputz/OpenAIServing._normalize_prompt_tokens_to_input  s       ")2JD!Q!Q!)"II#a''"D$6#6#8#89II"$:#:#;#;<IJJ"77	BBO.55i@@@@@@@@J##GY
CCCrz   r  r  c                 l   t          |          }t          |t          t          t          t
          t          t          t          t          f          r|| j
        k    rit          dt
          dt          dt          dt          di}|                    t          |          d          }t          d| j
         d| d| dd|	          t          ||
          S t          |t          t           t"          f          rt          ||
          S t          |t$                    r|j        p|j        }nt+          |dd           }|| j
        k    rt          d| j
         d| dd|	          |=||z   | j
        k    r/t          d| d| j
         d| d| d| j
         d| dd|	          t          ||
          S )Nscoreclassificationzembedding generationz'This model's maximum context length is z  tokens. However, you requested z tokens in the input for z(. Please reduce the length of the input.input_tokensr   )r   r   r   z# tokens. However, your request has z> input tokens. Please reduce the length of the input messages.z6'max_tokens' or 'max_completion_tokens' is too large: z). This model's maximum context length is z tokens and your request has z input tokens (z > z - z).)r
  r  r4   r5   r=   rA   r>   r<   r1   r0   r   r	  ru  rN   rP   rI   rH   rG   r   max_completion_tokensr   rY  )rw   r   r  r  	token_num
operations	operationr   s           ry   r  zOpenAIServing._validate_input  sv    	NN	 $*  ,/)	
 
  	O 4---$g$g0'35E-/?;
 'NN4==:PQQ	)>)> > > >;D> > > -#     zINNNN &(;=NO
 
 	O  zINNNN g455 	> 6L':LJJ ,==J ***%&%& && & & )    !i*&<t?Q&Q&Q%$$ $%$ $ $ $ .8$ $ =A<N$ $  	$ $ $
 '     :	JJJJrz   Tprompt_inputc                 p   K   |                      |||g|          2 3 d{V }|c S 6 t          d          )zP
        A simpler implementation that tokenizes a single prompt input.
        r  Nz$No results yielded from tokenization)_tokenize_prompt_inputs_asyncry  )rw   r   r  r  r  r2  s         ry   _tokenize_prompt_input_asyncz*OpenAIServing._tokenize_prompt_input_asyncH  st       !>>N1	 ? 
 
 	 	 	 	 	 	 	& MMM
 ?@@@s   (prompt_inputsc                   K   |D ][}t          |t                    r#|                     ||||           d{V W V  :|                     |||           d{V W V  \dS )zQ
        A simpler implementation that tokenizes multiple prompt inputs.
        )r   r  r  N)r  r  )r  r   r  r  )rw   r   r  r  r  r   s         ry   r  z+OpenAIServing._tokenize_prompt_inputs_async[  s       $ 	 	F&#&&  @@!''9	 A              !BB%' C             	 	rz   request_chat_templatechat_template_kwargstrust_request_chat_templatec                 f    |s.||r*|                     d          |                     d          S d S )Nr   zChat template is passed with request, but --trust-request-chat-template is not set. Refused request with untrusted chat template.)r	  rH  )rw   r  r  r  s       ry   _validate_chat_templatez%OpenAIServing._validate_chat_templatet  sW     + 	!-$ . ),,_==I --@  
 trz   request_chat_template_kwargsdefault_chat_template_kwargsc                     | pi } || S || z  S )zIHelper to merge server-default and request-specific chat template kwargs.r   )r  r  s     ry   #_prepare_extra_chat_template_kwargsz1OpenAIServing._prepare_extra_chat_template_kwargs  s(     (D'Ir$'///+.JJJrz   r   r  r   r   add_generation_promptcontinue_final_message
tool_dicts	documentstool_parserc                 H  K   |||||	d|
pi }
|                      |
|          }
ddlm}  |j        |f||
                    dd          pt          |j        |          d|
 d {V \  }}d|vrO|}|                     ||                                |d         |	           d {V }|	                    |           n| 
                    ||d         d
           t          t          |          }|j        
|j        |d<   t          |dd           x}||d<   |d uot          |d          o
|j        dk    }|rat          |t"          t$          z            sd}t'          |          |                                } ||                              |          }||gfS )N)r   r  r  toolsr  r   )MistralTokenizertokenizeF)r   r  r   r   r  r   )r   r  r  r   
cache_salttool_choicenonezPTool usage is only supported for Chat Completions API or Responses API requests.r   )r  vllm.tokenizers.mistralr  render_messages_asyncr  r  r  r  get_tokenizerupdater  r   rP   r   rY  r\  r  r   r+   r  adjust_request)rw   r   r   r  r   r   r  r  r  r  r  r  r  r  r  conversationri  
extra_datar  should_parse_toolsmsgr  s                         ry   _preprocess_chatzOpenAIServing._preprocess_chat  s     " +%:&<" 
  
 $)r 
  $GG ( 
  
 	=<<<<<,JH,J-
)E$((U;; Dh02BCC-
 -
 #-
 -
 '
 '
 '
 '
 '
 '
#m ]22&J"&"C"C&&((h'#5	 #D # #      M   ,,,,  '(:; !    \=99&23:3NM/0!'<>>>JK*4M,'
 )4 
G]++M0Cv0M 	  		Mg'<?O'OPP /1  *#... ..00I!k),,;;G;LLGm_,,rz   )data_parallel_rankri  ra  r  c          
         K   i }t          | j        |j        |           | j                            ||||||||          }	|	|fS )z1Use the Processor to process inputs for AsyncLLM.)r   tokenization_kwargsr   ra  r  )rK   r   rX  r   process_inputs)
rw   r   ri  r   r   r   ra  r  r  engine_requests
             ry   _process_inputszOpenAIServing._process_inputs  ss       /1! =?R	
 	
 	
 -<<% 3'1 = 	
 	
 222rz   c           	      t   K   t          |          }|                     |||||||           d {V \  }	}
|
S )N)request_input)r  r  r   r   )r,   r  )rw   r   r   r  r  r  r   r   new_messagesr%  r   s              ry   _render_next_turnzOpenAIServing._render_next_turn
  s{       0"
 
 
 #'"7"7!#')E #8 #
 #
 
 
 
 
 
 
> rz   r   sampling_paramscontextc           	       K   |                      |          \  }}	}	|}
d}	 | d| }|                     ||||           |                    d          }|                     ||||||           d {V \  }} | j        j        |||f||||d|}|2 3 d {V }|                    |           |W V  "6 |                                sd S |                                 d {V }|	                    |           t          |t          t          f          r%|                                }t          |          }nt          |t                    rj|                     |j        |j        |j        j        |j        |j        |j        |j                   d {V }|d         }|                      |          \  }}	}	| j        t5          |d	                   z
  |_        |
d
z
  }|d
z  })Nr   Tr%  r`  r   rb  )r   ra  r  r  )r   r   r   )_get_prompt_componentsre  r	  r  r   r  append_outputneed_builtin_tool_call	call_toolappend_tool_outputr  r'   r)   render_for_completionrP   r(   r  r   r   r   response_messagesr  tool_parser_clsr   r   r   r
  r   )rw   r   ri  r  r  r   ra  kwargsr  r%  orig_prioritysub_requestsub_request_idr   r  r  rj  rm  tool_outputr   r   s                        ry   _generate_with_builtin_toolsz*OpenAIServing._generate_with_builtin_tools#  s      !77FFQ F	 *::[::N&)	     #JJ77M8<8L8L)+! 9M 9 9 3 3 3 3 3 3/N/ 4*3	 *!'$7	 	 	 	I '       c%%c*** '
 1133  !( 1 1 3 3333333K&&{333 'N4K#LMM O#*#@#@#B#B  ,>N O O OG_55 O'+'='=O$N4&+)8( ( " " " " " " !/q 1$($?$?$N$N!Q *.);c01? ? *O& %q(H1KMF	s   B8c                      t          |          S rr   )rR   )rw   r   s     ry   r  z$OpenAIServing._get_prompt_componentsy  s    $V,,,rz   inputsc                     | j         d S |                     |          \  }}}| j                             ||||||           d S )Nr`  )r   r  
log_inputs)rw   r   r  r   r   r   r   prompt_embedss           ry   re  zOpenAIServing._log_inputs|  si     &F262M2Mf2U2U/ -&&% 	' 	
 	
 	
 	
 	
rz   rd  c                    K   | j                                          d {V }|rt          |          S t          |          rt	                       d S rr   )r   is_tracing_enabledrf   re   rg   )rw   rd  r  s      ry   rc  z OpenAIServing._get_trace_headers  sf       $(#5#H#H#J#JJJJJJJ 	2(111!'** 	+(***trz   r   defaultc                 f    | | j                             d          x}|S |t                      n|S )z6Pulls the request id to use from a header, if providedNzX-Request-Id)rd  r	  rh   )r   r  req_ids      ry   _base_request_idzOpenAIServing._base_request_id  s=    
 ""*..~>>>VKM '{}}}W<rz   c                     | dS | j                             d          }|dS 	 t          |          S # t          $ r Y dS w xY w)z7Pulls the data parallel rank from a header, if providedNzX-data-parallel-rank)rd  r	  r   ry  )r   rank_strs     ry   _get_data_parallel_rankz%OpenAIServing._get_data_parallel_rank  sd     4&**+ABB4	x==  	 	 	44	s   3 
A Ar  r  c                    t          t                               }| j        rPt          | j        t                    r6|J |                    t          | j        j        |                     d }n| j        rUt          | j        t                    r;|J |                    t          | j        j        j        |                     d }n5| j        dk    rX|J t          t           t                                                 |          }|                    d |D                        d }n|r|r| j        dk    s| j        |t          d          	  ||          }n.# t          $ r!}t                              d           |d }~ww xY w|                    ||nd|           }	|	O|	j        rH|                    d	 |	j        D                        |	j        }|r|                                dk    rd }nd |fS ||fS )
Nname	argumentsrequiredc           	      l    g | ]1}t          |j        t          j        |j        d                     2S )F)ensure_asciir  )r#   r  r  r  
parametersr   	tool_calls     ry   r   z@OpenAIServing._parse_tool_calls_from_content.<locals>.<listcomp>  sT       
 "	 !&^"&*Y-APU"V"V"V    rz   autoz7Tokenizer not available when `skip_tokenizer_init=True`zError in tool parser creation.r   r  c              3   `   K   | ])}t          |j        j        |j        j                   V  *dS )r  N)r#   functionr  r  r#  s     ry   	<genexpr>z?OpenAIServing._parse_tool_calls_from_content.<locals>.<genexpr>  sY       & &
 "	 !&/4"+"4">  & & & & & &rz   )r   r#   r  r  r   r  r  r   r'  r   r$   validate_jsonr  ry  rz  r   	exceptionextract_tool_callstools_called
tool_callsr  strip)
r   r  r   r  r  function_callsr-  r  r   tool_call_infos
             ry   _parse_tool_calls_from_contentz,OpenAIServing._parse_tool_calls_from_content  s    l+-- ?	%:g.ACU#V#V ?	%&&&!!'"5":gNNN   GG  8	%Z!C&
 &
 8	% &&&!!'"5">"CwWWW   GG J..&&&$T*<%=>>LLWUUJ!! 
 &0     GG"	%!"	% $..'2E2M  M  
-oi88     !ABBB );;".B <  N )n.I)%% & &
 &4%>& & &    )0 #w}}"44"G W}$w&&s   E$ $
F.F

Fr   r7  return_as_token_idc                 x    |rd| S | j         | j         S |t          d          |                    |          S )Nz	token_id:z:Unable to get tokenizer because `skip_tokenizer_init=True`)decoded_tokenry  r  )r   r7  r  r2  s       ry   _get_decoded_tokenz OpenAIServing._get_decoded_token  s]      	*)x))) ,((L   )))rz   r   c                 >    |sdS | j                             |          S )NT)r   is_base_model)rw   r   s     ry   r  z!OpenAIServing._is_model_supported  s%     	4{((444rz   )NF)r   N)NN)F)T)TFNNNNNF)Nr   rr   )gr{   r|   r}   r   r   r   r   r   r%   r   boolrt   r   rb   rc   r   r]   r   r   rO   r`   rW   r   r   r[   r:  rB   r=  r
   rD   r?  ri   rB  r   r"   rF  r   rI  rN  rK  rQ  r\   r]  rR  rS  r	   rx  r   rH  r  r  rn   r  r  r   rP  r  r  r  r  rP   r  r   r   r  r  r  r   r  r  r  staticmethodr  r   r+   r_   r   r   r   r   r  ra   rl   r  r*   r  r&   r  rQ   r  re  r   rc  r   r  r  r   r#   r1  rU   r5  r  r   r   s   @ry   r   r      s         (x}    ,1 %= = =#= $=
 &,= %)= = = = = = =8 NS  #d
FJ	=/:-	.	5   2" 
=/?2	3d	:   2 2 2 2 ,026m
 m
m
 m
 !	m

 "D(m
 sCx(4/m
 
t+	,m
 m
 m
 m
^	
, 	
 	
 	
 	
"" 
" " " "	1I 	 	 	 	 
	   DD 
}	$D D D D
O
O 
}	$
O 
O 
O 
O(( 
m3T9	:( ( ( (0\ md6J    	/	/ 
	&	/ 	/ 	/ 	/1111 
	11 11 11 11f11 
	1 1 1 1D *","8 5
 5
y5
 5
  	5

 Tz5
 
5
 5
 5
 5
t *","8  y   	
 Tz 
   ";S4Z ;S ;T ; ; ; ;
 
	
 
 
 

 
	
 
 
 


 
	
 
 
 
<J ;QUCU    4 +0I II $(I 
t		I I I I** S    2(D(D (D !	(D
 !(D 
(D (D (D (DTDD ID !4'	D
 
D D D D.RKRK 9RK 	RK
 
RK RK RK RKr $(A AA !A DIo	A
 !A 
A A A A0 $(  !  d3i0	
 ! 
d*	+   2"Tz #38nt3 &*	
 
	   ( >B>B	K 	K&*38nt&;	K&*38nt&;	K 
c3h	K 	K 	K \	K$ '+',26156:>BDH#(W- W- #33W- W- 12	W-
 TzW- 'FW-  $W- !%W- c3h(4/W- S#X'$.W- #38nt3W- '+38nt&;W- }oz9:TAW- !W- 
t'($|*<<	=W- W- W- W-D *.3 3 33 "3 .	3 "D(3 sCx(4/3 3  $J3 
 $sCx.0	13 3 3 3:!  ./	
 c3h(4/ Tz 'F   > ,0T TT $T (	T
 %T "D(T T T T Tl-Z -<L - - - -

 
 .1AADH	

 "D(
 

 
 
 
* 
c	T	!    ;?	= 	=t^	=.1Dj	=	t	= 	= 	= \	= Wt^ d
    \  #I' I'!$99I' 4'I'  I' "=/:"=>E	I'
 tI' 
tL!D(#*4	5I' I' I' \I'V 
 $)	* *** !4'* !	*
 
* * * \*&5cDj 5T 5 5 5 5 5 5 5 5rz   r   r   r   c                     | | S | D ];}||                                 D ]!}|j        t          d          k    rd|_        "<| S )Nz-infg    )r  r   r  )r   logprob_dictlogprob_valuess      ry   clamp_prompt_logprobsr=     sm     ' 1 1*1133 	1 	1N%v66)0&	1 rz   )r  r  r}  r   r  collections.abcr   r   r   r   dataclassesr   r   httpr	   typingr
   r   r   r   r   r   numpyr  fastapir   openai.types.responsesr   pydanticr   r   starlette.datastructuresr   	vllm.envsr  vllm.beam_searchr   r   vllm.engine.protocolr   vllm.entrypoints.chat_utilsr   r   r   vllm.entrypoints.loggerr   0vllm.entrypoints.openai.chat_completion.protocolr   r   r   +vllm.entrypoints.openai.completion.protocolr   r    'vllm.entrypoints.openai.engine.protocolr!   r"   r#   r$   &vllm.entrypoints.openai.models.servingr%   )vllm.entrypoints.openai.responses.contextr&   r'   r(   r)   *vllm.entrypoints.openai.responses.protocolr*   r+   'vllm.entrypoints.openai.responses.utilsr,   -vllm.entrypoints.openai.translations.protocolr-   r.   r/   *vllm.entrypoints.pooling.classify.protocolr0   r1   r2   r3   'vllm.entrypoints.pooling.embed.protocolr4   r5   r6   r7   )vllm.entrypoints.pooling.pooling.protocolr8   r9   r:   r;   'vllm.entrypoints.pooling.score.protocolr<   r=   r>   r?   r@   rA   vllm.entrypoints.rendererrB   rC   rD   &vllm.entrypoints.serve.disagg.protocolrE   rF   (vllm.entrypoints.serve.tokenize.protocolrG   rH   rI   rJ   vllm.entrypoints.utilsrK   rL   rw  rN   vllm.inputs.datarO   rP   vllm.inputs.parserQ   rR   rS   vllm.loggerrT   vllm.logprobsrU   rV   vllm.lora.requestrW   vllm.multimodalrX   vllm.outputsrY   rZ   r[   vllm.pooling_paramsr\   vllm.reasoningr]   r^   vllm.renderersr_   vllm.sampling_paramsr`   ra   vllm.tokenizersrb   vllm.tool_parsersrc   rd   vllm.tracingre   rf   rg   
vllm.utilsrh   vllm.utils.async_utilsri   rj   rk   vllm.v1.enginerl   r   rn   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r=  r   rz   ry   <module>rm     s	     



      G G G G G G G G G G G G ( ( ( ( ( ( ( (       C C C C C C C C C C C C C C C C                - , , , , , , , , , , , , ,       O O O O O O O O - - - - - -         
 2 1 1 1 1 1         
                   G F F F F F                                
                                                 U T T T T T T T T T T T T T T T T T            O N N N N N N N / / / / / / 5 5 5 5 5 5 5 5         
 $ # # # # # 1 1 1 1 1 1 1 1 ) ) ) ) ) ) . . . . . . N N N N N N N N N N - - - - - - B B B B B B B B ' ' ' ' ' ' A A A A A A A A ) ) ) ) ) ) ; ; ; ; ; ; ; ;         
 # " " " " "         
 - , , , , ,< < < < <i < < < 
X		   !! &	&
    y 	 	 	     	     "68J!J Y J J J  	
  I      	
     Y 
 
 
 7:Z000 4L L L L L L L L 4< < < < < < < <  4, , , , ,)+BGHDU , , , 4	 	 	 	 	.C!D 	 	 	 4B B B B BL)9: B B B
t5 t5 t5 t5 t5 t5 t5 t5n)#d*d     rz   