
     `i                    `   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z% d dl&Z&d dl'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9 ddl1m:Z:  e8            rd dl;Z;d dl&m<Z<m=Z=m>Z>m?Z? ddl@mAZAmBZB  e,            rd dlCZC e0            rd dlDmEZE  e.            o e+            o e/            o	 e-            ZFeFrd dlGZGd dlHmIZImJZJ d dlKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[ d dl\m]Z] d dl^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZm d dlnmoZo d d lpmqZqmrZrmsZs  G d! d"eod#$          Zt G d% d&e]d#$          Zu G d' d(eSd#$          Zv eret          Zw ereu          Zx erev          Zyh d)Zzh d*Z{h d+Z| e9j}        e~          Zd,d-d.d/iZ ee                                          Zd0Z G d1 d2ej                  Zd3efd4Zd5ed6d7d8d7fd9Z G d: d;          Z G d< d=          Ze G d> d?                      Z G d@ dAe:          Ze~dBk    r  e            Ze                                 dS dS )C    N)ArgumentParser	Namespace)AsyncGenerator	GeneratorIterable)asynccontextmanager)	dataclassfield)BytesIO)Thread)Optional	TypedDictUnion
model_infoHF_HUB_OFFLINE)DecodeStream)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )
AutoConfigLogitsProcessorListPreTrainedTokenizerFastProcessorMixinTextIteratorStreamer)is_torch_availablelogging   )BaseTransformersCLICommand)AutoProcessorBitsAndBytesConfigGenerationConfigPreTrainedModel)ContinuousBatchingManagerRequestStatus)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionMessageParam)ChatCompletionChunkChoiceChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                       e Zd ZU dZeed<   dS ))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__     q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/commands/serving.pyrP   rP   {   *         	 	 rZ   rP   F)totalc                       e Zd ZU dZeed<   dS )+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with additional fields for the generation config (as a json string) and passing the request_id
        rQ   NrR   rY   rZ   r[   r_   r_      r\   rZ   r_   c                   6    e Zd ZU dZeed<   eed<   dZeed<   dS )%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerQ   FstreamN)	rS   rT   rU   rV   bytesrX   rW   rc   boolrY   rZ   r[   ra   ra      sC         	 	 rZ   ra   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstoprg   audiorh   logprobsmetadata	functions
modalities
predictionrn   ro   rp   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   ri   rj   languager}   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendzx-request-idc                       e Zd ZdZdZdZdZdS )ModalityLLMVLMSTTTTSN)rS   rT   rU   r   r   r   r   rY   rZ   r[   r   r      s"        
C
C
C
CCCrZ   r   argsc                      t          |           S )z~
    Factory function used to instantiate serving server from provided command line arguments.

    Returns: ServeCommand
    )ServeCommand)r   s    r[   serve_command_factoryr      s     rZ   reqmodel_generation_configr)   returnc                    |                      d          %t          di t          j        | d                   }nt	          j        |          } |j        di |}|                                D ]\  }}|t          |||           |                      d          t          | d                   |_
        |                      d          t          | d                   |_
        |                      d          t          | d                   |_        |                      d          | d         |_        |                      d          | d         |_        |                      d          :t          | d                   |_        t          | d                   d	k    rd
|_        |                      d          t          | d                   |_        |                      d          t%          j        | d                    |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rQ   Nmax_output_tokens
max_tokensfrequency_penalty
logit_biasrt   temperatureg        Ftop_pseedrY   )getr)   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   torchmanual_seed)r   r   kwargsrQ   non_standard_kwargskvs          r[   !create_generation_config_from_reqr      s   . ww"##/,TTtz#>Q:R/S/STT M*ABB2+2<<V<<#))++ - -1=%q!,,, ww"##/+.s3F/G+H+H( ww|(+.s</@+A+A(
ww"##//4S9L5M/N/N,
ww|(*-l*;'
wwv"),V&
ww})(-c-.@(A(A%]#$$++*/'
www#"'G"5"5
wwv"#f+&&&rZ   c                       e Zd ZdZd Zd ZdS )	ToolStatez7Lightweight class to keep track of the tool call state.c                 .    |                                   d S N)resetselfs    r[   __init__zToolState.__init__'  s    

rZ   c                 >    d| _         d| _        d| _        d| _        dS )z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   s    r[   r   zToolState.reset*  s%     %%*"!"rZ   N)rS   rT   rU   rV   r   r   rY   rZ   r[   r   r   $  s8        AA      rZ   r   c            	       Z    e Zd ZdZ	 ddddedeed                  fdZd	 Zd
 Z	d Z
d ZdS )
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nmodelr*   timeout_seconds	processor)r!   r    c                     || _         t          |j                  | _        || _        || _        t          j        | j        | j                  | _	        | j	        
                                 d S r   )r   rW   name_or_path_name_or_pathr   r   	threadingTimertimeout_reached_timerr   )r   r   r   r   s       r[   r   zTimedModel.__init__8  s`     
 !344".od&:D<PQQrZ   c                     | j                                          t          j        | j        | j                  | _         | j                                          dS )z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r   r   s    r[   reset_timerzTimedModel.reset_timerE  sI    od&:D<PQQrZ   c                 0   t          | d          r| j        ~| `| `d| _        d| _        t          j                     t
          j                                        rt
          j                                         | j	        
                                 dS dS dS )z>Delete the wrapped model and processor and clean up resources.r   N)hasattrr   r   gccollectr   cudais_availableempty_cacher   r   r   s    r[   delete_modelzTimedModel.delete_modelK  s    4!! 	!dj&<
DJ!DNJLLL z&&(( )
&&((( K     	! 	!&<&<rZ   c                     |                                   t                              | j         d| j         d           d S )Nz was removed from memory after z seconds of inactivity)r   loggerinfor   r   r   s    r[   r   zTimedModel.timeout_reached[  sC    t)vv$J^vvvwwwwwrZ   c                 6    t          | d           p| j        du S )z)Check if the instances have been deleted.r   N)r   r   r   s    r[   
is_deletedzTimedModel.is_deleted_  s!    4)))?TZ4-??rZ   r   )rS   rT   rU   rV   r   r   r   r   r   r   r   r   rY   rZ   r[   r   r   2  s          SW	    E"MNO	     ! ! ! x x x@ @ @ @ @rZ   r   c                   8   e Zd ZU dZ edddi          Zeed<    edddi          Ze	ed	<    ed
dg dd          Z
ee	         ed<    eddg dd          Zee	         ed<    edddi          Zeed<    ed
ddi          Zee	         ed<    edddi          Zeed<    edddi          Zeed<    eddddgd          Ze	ed<    edddi          Zeed<    eddd i          Ze	ed!<    ed"dd#i          Zeed$<    ed%dd&i          Zeed'<    ed(dd)i          Ze	ed*<    ed
dd+i          Zee         ed,<    eddd-i          Zeed.<    eddd/i          Zeed0<    ed
dd1i          Zee	         ed2<   d3 Zd
S )4ServeArgumentsz
    Arguments for the serve CLI.

    See the metadata arg for each argument's description -- the metadata will be printed with
    `transformers serve --help`
    Fhelpz8Whether to use continuous batching for chat completions.)defaultrw   continuous_batchingautozfDevice to use for inference; will default to `auto` andplace the model on an accelerator if available.deviceNzA`torch_dtype` is deprecated! Please use `dtype` argument instead.)r   bfloat16float16float32)r   choicestorch_dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.dtypez2Whether to trust remote code when loading a model.trust_remote_codezWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.attn_implementationzIWhether to use 8 bit precision for the base model - works only with LoRA.load_in_8bitzIWhether to use 4 bit precision for the base model - works only with LoRA.load_in_4bitnf4zQuantization type.fp4bnb_4bit_quant_typez#Whether to use nested quantization.use_bnb_nested_quant	localhostz$Interface the server will listen to.hosti@  zPort the server will listen to.porti,  z@Time in seconds after which a model will be removed from memory.model_timeoutr   z8Logging level as a string. Example: 'info' or 'warning'.	log_levelz1The default seed for torch, should be an integer.default_seedztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.enable_corsz+Whether to turn on strict input validation.input_validationzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.force_modelc                     | j         E| j        | j         | _        dS | j         | j        k    r"t          d| j          d| j         d          dS dS )z(Only used for BC `torch_dtype` argument.Nz`torch_dtype` z and `dtype` zn have different values. `torch_dtype` is deprecated and will be removed in 4.59.0, please set `dtype` instead.)r   r   
ValueErrorr   s    r[   __post_init__zServeArguments.__post_init__  s     'z!!-


!TZ// MT%5 M MDJ M M M  	 (' 0/rZ   )rS   rT   rU   rV   r
   r   re   rX   r   rW   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rY   rZ   r[   r   r   d  s          !&TU! ! !    % >
  FC    "'WAAA
 
" " "K#    !5PAAA
 
  E8C=    $e)] ^  t    */ r
* * *#    ef  L$    ef  L$     %uUFZhmotgu=v=vwwwwww!&uHm?n!o!o!o$ooo kV=c4deeeD#eeedf6W-XYYYD#YYY\]  M3    U&*d!e  Is    #(%([\# # #L(3-    &
  K    #UB
  d    "'2
" " "K#   
 
 
 
 
rZ   r   c                      e Zd Zedefd            ZdefdZdede	ddd	e
fd
ZdefdZdefdZdefdZ	 	 	 	 	 	 	 	 d2dedee         dee         dee         dee         deed                  dee         dee         defdZdddefdZd Zej        deeeef                  fd            Zd ededeedf         fd!Zedd"defd#            Zed$efd%            Z d ede!eddf         fd&Z"d ede!eddf         fd'Z#d ede!eddf         fd(Z$d ede%fd)Z&ededed*         fd+            Z'd,edefd-Z(d.efd/Z)d.ede*d"ef         fd0Z+d.ede*d"e,f         fd1Z-dS )3r   parserc                 z    t           f}|                     d|          }|                    t                     dS )z
        Register this command to argparse so it's available for the transformer-cli

        Args:
            parser: Root parser to register command-specific arguments
        serve)dataclass_types)funcN)r   
add_parserset_defaultsr   )r   r  serve_parsers      r[   register_subcommandz ServeCommand.register_subcommand  sB     *+((/(RR!!'<!=====rZ   r   c           	         t           st          d          || _        | j        j        | _        | j        rt          j                    }| j        j        )|| j        _        t          	                    d|            t          j
                    }| j        j        |vr#t          d| d| j        j         d| d          | j        j        | _        | j        j        t          j        | j        j                   t!          j        d          }|                    t           j        | j        j                                                            t!          j        d          }|                    t           j        | j        j                                                            i | _        d | _        d | _        d | _        d | _        d S )	NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`z-No attn_implementation passed, defaulting to z"Continuous batching only supports z as attn_implementation, got z#Try setting `--attn_implementation=`transformersz+transformers.generation.continuous_batching)serve_dependencies_availableImportErrorr   r   use_continuous_batchingr+    default_attention_implementationr   r   r   #supported_attention_implementationsr   r   r   r   r   r$   
get_loggersetLevel
log_levelsr   lowerloaded_models#running_continuous_batching_managerlast_messageslast_kv_cache
last_model)r   r   default_attn_implsupported_attn_impltransformers_logger	cb_loggers         r[   r   zServeCommand.__init__  s   + 	s  
 	'+y'D$' 	 9 Z \ \y,40A	-_L]__```";"_"a"ay,4GGG O9L O Oy4O O:KO O O  
  909!-di4555 &0@@$$W%7	8K8Q8Q8S8S%TUUU&'TUU	7-di.A.G.G.I.IJKKK 57X\0 "!rZ   requestschema	validatorrM   unused_fieldsc                    t                               d|            t          |                                          }|j        }||z
  }|r1t                               d|            t          dd|           | j        j        r	 |	                    |           nd# t          $ rW}t                               d|                                            t          d|                                          d}~ww xY w||z  }	|	r3t                               d|	            t          dd|	           dS dS )a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`TypedDict`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   debugsetkeys__mutable_keys__errorr/   r   r   validate_pythonrN   errors)
r   r  r  r   r!  
input_keyspossible_keysunexpected_keyseunused_fields_in_requests
             r[   _validate_requestzServeCommand._validate_request  s   . 	5G55666 ((
/$}4 	nLLMOMMNNNC8l[j8l8lmmmm9% 	H))'2222" H H H>!((**>>???#AHHJJGGGGH
 (2M'A$' X>VXXYYY# #,gMe,g,g   	 	 s   B! !
D+AC==Dc                 V    |                      |t          t          t                     d S N)r  r  r   r!  )r1  rP   response_validatorUNUSED_RESPONSE_FIELDSr   r  s     r[   validate_response_requestz&ServeCommand.validate_response_requestA  s5    <(0	 	 	
 	
 	
 	
 	
rZ   c                 V    |                      |t          t          t                     d S r3  )r1  r_   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr6  s     r[    validate_chat_completion_requestz-ServeCommand.validate_chat_completion_requestI  s5    >*7	 	 	
 	
 	
 	
 	
rZ   c                 V    |                      |t          t          t                     d S r3  )r1  ra   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr6  s     r[   validate_transcription_requestz+ServeCommand.validate_transcription_requestQ  s5    8-5	 	 	
 	
 	
 	
 	
rZ   r   N
request_idcontentr   rolefinish_reason
tool_callsr9   decode_stream	tokenizerr   c	                     ||||                     |j        |          }t          |t          t	          j                              |t          t          |||          d|          gdd          }	d|	                    d	
           dS )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        N)rA  rB  rD  r   )deltaindexrC  r   zchat.completion.chunk)idcreatedr   r   system_fingerprintobjectdata: Texclude_none

)step
_tokenizerr6   r   timer7   r8   model_dump_json)
r   r@  rA  r   rB  rC  rD  rE  rF  chunks
             r[   build_chat_completion_chunkz(ServeCommand.build_chat_completion_chunkY  s    D $)<AV#(()=wGGG#	$$% '!#-  
 "/  
  "*!
 
 
$ G--4-@@FFFFrZ   responserL   c                 6    d|                     d           dS )a  
        Builds a event of a streaming OpenAI Response response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            response (`BaseModel`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        rN  TrO  rQ  )rU  )r   rX  s     r[   build_response_eventz!ServeCommand.build_response_event  s'     J00d0CCIIIIrZ   c                 8    t           dt          f fd            }t          |          } j        r<|                    t          dgddgdg           t
                              d           dd	lm} |	                    d
          d|dt          f fd            }|	                    d          dt          f fd            }|	                    d          d|f fd            }|                    d          |                    d           fd                        }|                    d          d             }|                    d          d|fd            }	t          j        | j        j         j        j         j        j                   dS )a  
        Setup and run the FastAPI server for transformers serve.

        Models will be loaded and unloaded automatically based on usage and a timeout.

        The server will expose the following endpoints:
        - POST /v1/chat/completions: Generates chat completions.
        - POST /v1/responses: Generates responses.
        - POST /v1/audio/transcriptions: Generates transcriptions from audio.
        - GET /v1/models: Lists available models for 3rd party tools.

        Requires FastAPI and Uvicorn to be installed.
        appc                   K   d W V  j                                         D ]}|                                 j        j                            dd           d S d S )NT   blocktimeout)r  valuesr   r  rt   )r\  r   r   s     r[   lifespanz"ServeCommand.run.<locals>.lifespan  sz      EEEE+2244 % %""$$$$7C8==DRS=TTTTT DCrZ   )rc  *T)allow_originsallow_credentialsallow_methodsallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.r   )Requestz/v1/chat/completionsr  bodyc                                          |           j        r!                    || j        j                  }n                    |          }t          |d          S Nr  text/event-stream
media_type)r;  r  #continuous_batching_chat_completionstater@  generate_chat_completionr2   )r  rj  outputr   s      r[   chat_completionz)ServeCommand.run.<locals>.chat_completion  sh    11$1???+ =AA$H`aa66t<<$V8KLLLLrZ   z/v1/responsesc                 |                         |                                |           }t          |d          S rl  )r7  generate_responser2   )r  rt  r   s     r[   	responsesz#ServeCommand.run.<locals>.responses  sA    **7*;;;++G44F$V8KLLLLrZ   z/v1/audio/transcriptionsc           
        K   |                                  4 d {V }t          |d                                          d {V |d                   }t                              d|d         j         d|d         j         d|d         j        dz  dd	           d d d           d {V  n# 1 d {V swxY w Y                       |
           	                    |          }t          |d          S )Nrb   r   )rb   r   zReceived file: z; MIME type: z; size:    z.2fz KiBrm  rn  ro  )formra   readr   r%  filenamecontent_typesizer?  generate_transcriptionr2   )r  r{  parsed_requestrt  r   s       r[   audio_transcriptionsz.ServeCommand.run.<locals>.audio_transcriptions  s      ||~~ 	 	 	 	 	 	 	!F#F|0022222222w-" " "
 @d6l&; @ @$v,Jc @ @!&\.5?@ @ @  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ///GGG00@@F$V8KLLLLs   A?B//
B9<B9z
/v1/modelsc                  L    t          d                                 d          S )Nlist)rM  data)r1   get_gen_modelsr   s   r[   get_all_modelsz(ServeCommand.run.<locals>.get_all_models  s'      64;N;N;P;P Q QRRRrZ   z/healthc                  $    t          ddi          S )Nstatusok)r1   rY   rZ   r[   healthcheckz%ServeCommand.run.<locals>.healthcheck  s    4 0111rZ   httpc                    K   | j                             t                    pt          t	          j                              }|| j        _         ||            d {V }||j         t          <   |S r   )headersr   X_REQUEST_IDrW   uuiduuid4rr  r@  )r  	call_nextr@  rX  s       r[   get_or_set_request_idz/ServeCommand.run.<locals>.get_or_set_request_id  sk       ,,\::Oc$*,,>O>OJ'1GM$&Yw////////H-7H\*OrZ   )r   r   r   N)r   r.   r   add_middlewarer0   r   warning_oncefastapiri  postdictoptionsr   
middlewareuvicornrunr   r   r   r   )
r   rc  r\  ri  ru  rx  r  r  r  r  s
   `         r[   r  zServeCommand.run  sm    
	U 	U 	U 	U 	U 	U 
		U x(((  
	"e"&"e"e     g   	$#####	(	)	)	MW 	MD 	M 	M 	M 	M 	M 
*	)	M 
/	"	"	Mt 	M 	M 	M 	M 	M 
#	"	M 
,	-	-	M 	M 	M 	M 	M 	M 
.	-	M" 
\	"	"				S 	S 	S 	S 
	 
#	"	S 
			2 	2 
		2 
			 	 	 	 
 		 	Cdin49>TYM`aaaaaarZ   c                 `    g d}t           rd |D             S d |D             }d |D             S )a.  
        This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based
        model working with generate can work.

        This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party
        integrations.
        )zMenlo/Jan-nanozMenlo/Jan-nano-128kzQwen/Qwen2.5-0.5B-InstructzQwen/Qwen2.5-3B-InstructzQwen/Qwen2.5-7B-InstructzQwen/Qwen2.5-14B-Instructz meta-llama/Llama-3.1-8B-Instructz meta-llama/Llama-3.2-1B-Instructz!meta-llama/Llama-3.3-70B-InstructzHuggingFaceTB/SmolVLM-Instructz!ibm-granite/granite-vision-3.2-2bzQwen/Qwen2.5-VL-7B-Instructc                     g | ]P}|d t           j                                                                         |                    d          d         dQS )r   /r   rJ  rM  rK  owned_by)datetimenow	timestampsplit.0r   s     r[   
<listcomp>z/ServeCommand.get_gen_models.<locals>.<listcomp>  si          %'04466@@BB %C 0 0 3	   rZ   c                 ,    g | ]}t          |          S rY   r   r  s     r[   r  z/ServeCommand.get_gen_models.<locals>.<listcomp>#  s     AAA:e,,AAArZ   c                 ^    g | ]*}|j         d |j                                        |j        d+S )r   r  )rJ  
created_atr  authorr  s     r[   r  z/ServeCommand.get_gen_models.<locals>.<listcomp>$  sP          (%$/99;; %	   rZ   r   )r   modelsmodel_infoss      r[   r  zServeCommand.get_gen_models   sy    
 
 
  	  $    BA&AAAK  )   rZ   r   c           	         	
                       |d                   		 j        k    }	 _        |r* j        # j                            dd           d _                             	          \  }}t          |d          r|j        n|t          ||j        j	        j
        ddd	           j        M|                    d
           _        t                       j        _         j                                         |                    |d         dd                              |j                  }	 fd
 
fd} ||d         |          S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   r_  rF  Ffifo)r   eos_token_idpad_token_id	use_cacher   	scheduler)rQ   	streamingmessagespt)return_tensorsadd_generation_promptc              3   
  K   	                      | d          V  j                            |           D ][}|j        t          j        k    r                     | d          V   d S                      | |j        d         |          V  \d S # t          $ ra}t          	                    t          |                     j                            |            dt          |           dV  Y d }~d S d }~ww xY w)	N	assistantrB  r   rt   rC  r   )r@  rA  r   rE  rF  data: {"error": ""})rW  r  request_id_iterr  r,   FINISHEDgenerated_tokens	Exceptionr   r)  rW   cancel_request)r@  rE  resultr/  model_id_and_revisionr   rF  s       r[   stream_chat_completionzPServeCommand.continuous_batching_chat_completion.<locals>.stream_chat_completion^  sf     7 66z[p6qqqqq"FVVWabb  F}(>>>">>&*0"7 ?     
 ">>'1$*$;B$?"7*7&/ ?       "  7 7 7SVV$$$8GG
SSS63q666666666666667s   A&B -(B 
D!AC==Dc                  K   	 t          |                                 d          }j                            | |j                  } ||          D ]!}|W V  t          j        d           d {V  "d S # t
          j        $ r< j                            |           t          
                    d| d           Y d S w xY w)NF)r@  r   r   zRequest z was cancelled.)r   tolistr  add_requestr   asynciosleepCancelledErrorr  r   warning)_inputsr@  rE  rV  rQ   r   r  s       r[   cancellation_wrapperzNServeCommand.continuous_batching_chat_completion.<locals>.cancellation_wrapperz  s	     G ,W^^-=-=u E E!EQQ
CTCc R  
 43JNN + +EKKKK!-**********+ + ) G G G8GG
SSSE*EEEFFFFFFGs   A2A9 9ACCr   )process_model_namer  r  rt   load_model_and_processorr   rF  r   rQ   r  r  init_continuous_batchingr   logit_processorr   apply_chat_templatetor   )r   r   r@  must_discard_cacher   r   inputsr  rQ   r  r  rF  s   `       @@@@r[   rq  z0ServeCommand.continuous_batching_chat_completion.  s    !% 7 7G E E2doE/ 	@7C8==DRS=TTT;?8889NOOy+29k+J+JYI''PY	=$)$;"/"/
 
 
 3;7<7U7U"3t 8V 8 8D4 H[G\G\D4D4::<<< ..s:tko.ppssL
 
	7 	7 	7 	7 	7 	7 	78	G 	G 	G 	G 	G 	G 	G $#F1Iz:::rZ   r*   c                     | j         j        }|t          j                    v rt          j        }n4|t          j                    v rt          j        }nt          d|           |S )NzUnknown modality: )		__class__rS   r   rb  r   r   r   r   r   )r   model_classnamemodalitys      r[   get_model_modalityzServeCommand.get_model_modality  sd    /2HOQQQQ|HH A H J JJJ|HHC/CCDDDrZ   r  c           	      N   g }| D ]}|d         g d}|t           j        k    rt          |d         t                    r	|d         }ndt          |d         t                    rIg }|d         D ])}|d         dk    r|                    |d                    *d                    |          }||d<   n\|t           j        k    rKt          |d         t                    r&|d                             d|d         d           n
|d         D ] }|d         dk    r|d                             |           +|d         dk    rd	|d         d
         v rt          j	        dd|d         d
                   }t          j        t          t          j        |                              }t          j        dd          }	|	j        }
|                    |	j                   n|d         d
         }
|d                             d|
d           |                    |            |S )NrB  rB  rA  rA  typerf    )r  rf   	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)r  r  )r   r   
isinstancerW   r  appendjoinr   resubr-   openr   r  	b64decodetempfileNamedTemporaryFilenamesave)r  r  processor_inputsmessageparsed_messageparsed_contentrA  
image_datar  rb   r  s              r[   *get_processor_inputs_from_inbound_messagesz7ServeCommand.get_processor_inputs_from_inbound_messages  sN    '	4 '	4G&-fo"EEN8<'' gi0#66 >%,Y%7NN	 2D99 >%'N#*9#5 C C"6?f44*11'&/BBB%(XXn%=%=N,:y))X\)) gi0#66 \"9-44fgV_N`5a5abbbb#*9#5 \ \"6?f44*95<<WEEEE$V_;;'7;+?+FFF-/V4LbRYZeRfglRm-n-n
(-
76;KJ;W;W3X3X(Y(Y'/'B&Y^'_'_'_&*i %

49 5 5 5 5&-k&:5&A*95<<gVY=Z=Z[[[##N3333rZ   c                 
     j         j         j         j        |d<   |d         }|d         d         dk    rdS                      |d                    j        k    } _                                       \  }                               }                     ||          }dt          D ],}|j        j	        d         
                                v r| n-|                    |d|                    d	          d
dd          }|                    j                  }|                    dd          d}	dj        j	        d         
                                v rd}	t          ||	d          }
t!          |j                  }d}                     |          r9|s7 j                                        }|d         j        d         |k    r j        }i ||
|d|d fd} ||
          S )a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   r  r  rB  r  r   Ttoolsr  )r  r  r  return_dicttokenizer@  req_0gptossFskip_special_tokensskip_promptr   	input_ids)streamerrQ   return_dict_in_generatepast_key_valuesc              3     K   d}d }dj         j        d                                         v rd}d}fd}t          |          }d}	 |                                 t                      }                    d	
          V  | D ]M}dj         j        d                                         v r|                    d          }||z  }|r||v rd}LM|                                t                   d         k    rd|_
        |                                t                   d         k    r0|                                                     |d d          V  |j
        rK|xj        |z  c_        |j        s_t          j        d|j                  }	|	|	                    d          }	d|_        t#          t%          |	          dd|dz             }
n|dk    r`d|j        vrk|xj        |                    d          z  c_        |xj        |                    d          z  c_        |j        dk     r3d                    |                    d          d d                   dz   }t#          t%          |          dd          }
                    |d |
g          V  -|dk    r                    ||          V  O                    |d          V  |                                 nS# t.          $ rF}t0                              t5          |                     d t5          |           d!V  Y d }~nd }~ww xY w|                                 d S # |                                 w xY w)"NFr  r   T<|channel|>final<|message|>c                  :     j         di | }|j        _        d S NrY   generater  r  r   generate_outputr   r   s     r[   generate_with_cachezbServeCommand.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cache  +    "0%.":":6":":%4%D"""rZ   targetr   r   r  r  
<|return|>r   r   rD  )r@  rB  rC  r   z\"name\": \"(.*?)\"r%   )r  function
_tool_call)r  rI  r  rJ  z"arguments": {{})	arguments)r  rI  r  )r@  rB  rD  r   )rA  r   rt   r  r  r  )configarchitecturesr  r   r   r   rW  removesuffixstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr9   r:   r   countr  r  r  r   r)  rW   )r  _request_id
filter_cotcot_trace_endr  threadresults
tool_stater  	tool_nametoolr/  generation_kwargsr   r  r@  r   tool_model_familys               r[   r  zEServeCommand.generate_chat_completion.<locals>.stream_chat_completion  sX      J M5<5a8>>@@@@!
 =E E E E E E #6?PQQQFGg&[[
 66z[p6qqqqq& V VF5<#=a#@#F#F#H#HHH!'!4!4\!B!Bv%G " %(G33).J$$ )4!<<>>->?P-QRY-ZZZ:>J7$ "<<>>->?P-QRW-XXX&,,..."&"B"B+6%).:&;	 #C # #    %%6 .%&--7-- $.#C $",.I6LjN_,`,`	#,#4$,090B0BICG
 @':-Hi-X-X-X*+)3'2\'A	(" (" (" $*R<<$, $4:;L#L#L$, !+ < <S@Q@Q Q < < * < <S@Q@Q Q < <#-#?!#C#C-/WWV\\#5F5Fss5K-L-Ls-RF':-HSY-Z-Z-Z*+)3(" (" (" #'"B"B+6Ttf\q #C # #    % ||">>'?T ?      66{RX`u6vvvvv 7 7 7SVV$$$63q66666666666667
 s1   J;L M1 
M<MM1 MM1 1N)r   r   r  r  r  r  r  _MODELS_WITH_TOOL_SUPPORTr  r  r  r  r   r  r   r"   r   rQ   is_continuationr  get_seq_lengthshape)r   r   r  r  r   r  r  supported_model_familiesr  r  generation_streamerrQ   r  seq_lenr  r+  r   r  r@  r,  s   `              @@@@@r[   rs  z%ServeCommand.generate_chat_completion  s    9 ,90CL9<Z B<;..F $ 7 7G E E2doE/889NOOy**511JJ8U]^^ !(A 	 	$'5<+Ea+H+N+N+P+PPP$<! Q .."&'''"" / 
 
 5<((WW\733
 #u|1!4::<<<<"'2 3
 
 

 >c[`[rsss$$ 	3-? 	3(7799Gk"(,w66 $ 2

+!2'+,
 
 
x	 x	 x	 x	 x	 x	 x	 x	 x	 x	t &%&9:FFFrZ   c                                           d                    j        k    } _                                       \  }t          d         t                    r1dv rdd         dgng }|                    dd         d           nt          d         t                    rTdv rGd         d         d         dk    rdd         dgd         }n{d         }d         |d         d	<   nad         }nXt          d         t                    r.dv rdd         dgng }|                    d                    nt          d
          |	                    |dd          }|
                    j                  }                    dd          d}dj        j        d                                         v rd}t!          ||d          }t#          j                  }d}                               r9|s7 j                                        }	|d         j        d         |	k    r j        }|t/          j        |          ||d|d fd}
 |
|          S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemr  rg   r   rB  rA  z%inputs should be a list, dict, or strTr  )r  r  rr   r  r  Fr   r  Nr  r  )r  attention_maskr  rQ   r  r  c              3   
  K   d}d }dj         j        d                                         v rd}d}fd}t          |          }d}d}d}	 |                                 t          j                    }	t          d|t          d	 |	d
                    d          dddiidg g                     dd          d                    d                              }
|dz  }	                    |
          V  t          d|t          d	 |	d                    d          dddiidg g                     dd          d                    d                              }|dz  }	                    |          V  t          d||t          d dddg                     }|dz  }	                    |          V  t          dd |||t          dd g !          "          }|dz  }	                    |          V  d }| D ]}dj         j        d                                         v r|                    d#          }||z  }|r
||v rd}d }MNt!          d$d ||||d d%d&g'          }|dz  }	                    |          V  t#          d(d ||d|d d%d&g)          }|dz  }	                    |          V  t%          d*d |||t          d|j        g !          "          }|dz  }|dz  }	                    |          V  t)          d+||t          d dd,d|j        gg -                    }|dz  }|dz  }	                    |          V  t-          d.|t          d	 |	d,                    d          dddii|j        gdg                     dd          d                    d          /                    }|dz  }	                    |          V  |                                 n# t2          $ r }t4                              d0t9          |                      t;          d1|t9          |          2          }|dz  }	                    |          V  t=          d3|t          d	 |	d4                    d          dddiig dg dd                    d          t?          d5t9          |          6          7                    }|dz  }	                    |          V  Y d }~nd }~ww xY w|                                 d S # |                                 w xY w)8NFr  r   Tr	  c                  :     j         di | }|j        _        d S r  r  r  s     r[   r  zTServeCommand.generate_response.<locals>.stream_response.<locals>.generate_with_cache  r  rZ   r  zresponse.createdresp_queuedr6  formatr  rf   rX  r   r   rw   )rJ  r  r  r   r6  rf   rM  r  rt  r   rn   rw   )r  sequence_numberrX  r%   zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r  )rJ  r  r  rB  rA  )r  r>  output_indexitemzresponse.content_part.addedoutput_textr   )r  rf   annotations)r  item_idr>  rA  content_indexpartr  zresponse.output_text.deltagX@)tokenlogprob)r  rE  r>  rA  rF  rH  rv   zresponse.output_text.done)r  rE  r>  rA  rF  rf   rv   zresponse.content_part.donezresponse.output_item.done	completed)rJ  r  r  rB  rA  rD  zresponse.completed)rJ  r  r  r   r6  rf   rt  rM  r  r   rn   rw   z"Exception in response generation: r)  )r  r>  r  zresponse.failedfailedserver_error)coder  )rJ  r  r  r   r6  rf   rt  rM  r  r   rn   rw   r)  ) r  r  r  r   r   rT  r@   r<   r   rZ  rD   rE   rG   r>   rH   r  rI   rJ   r?   rf   rF   rG  r=   rB  r  r  r   r)  rW   rB   rC   rA   )r  r#  r$  r%  r  r&  r>  rA  rF  r  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedr'  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedr/  error_eventresponse_failedr+  r   r  r   r@  r   s                           r[   stream_responsez7ServeCommand.generate_response.<locals>.stream_response  sn      J M5<5a8>>@@@@!
 =E E E E E E #6?PQQQFOLMM!Y[[
 $8+$3%/://#-'3%(WW^%<%<&(89) !,/GG4I5,Q,Q$*!$!4!4  $ $ $ $  1$//0@AAAAA'>/$3%/://#-,3%(WW^%<%<&(89) !,/GG4I5,Q,Q$*!$!4!4  ( ( ($$  1$//0DEEEEE .J5$3!-..*..Y}[fpr  	. . .*  1$//0JKKKKK /L6/://$3!-"/+RUWXXX/ / /+  1$//0KLLLLL & P PF5<#=a#@#F#F#H#HHH!'!4!4\!B!Bv%G " %(G33).J&(G$$1G9 3z 3 3(7%1&3$,.4"@"@!A2 2 2. $q(O334NOOOOOO -B4/://$3!-"# (*t<<=- - -)  1$//0IJJJJJ .J5/://$3!-"/+E^Ecqsttt. . .*  1$"//0JKKKKK -H4$3!-..*..&*(!;!@ A$&  	- - -)  1$!//0IJJJJJ &<-$3%/://#-*3%(WW^%<%<&(89 9 >?) ,/GG4I5,Q,Q$*!$!4!4  & & &"$  1$//0BCCCCC !A !A !AJ#a&&JJKKK0 $3FF  
  1$//<<<<<"5*$3%/://#-'3%(WW^%<%<&(89!) ,1$*!$!4!4+!/$'FF    # # #,  1$//@@@@@@@@@@C!AH s2   OP T: T!!C6TT: T!!T: :U)r  r  r  r  rW   r  r  r  r   r  r  r   r   r  r  r  r"   r   rQ   r.  r  r/  r0  r   	ones_like)r   r   r  r   r  r  r2  rQ   r  r3  rY  r+  r   r  r@  s   ``         @@@@r[   rw  zServeCommand.generate_response  s    !% 7 7G E E2doE/889NOOyc'lC(( 	FM[_bMbMbxC4GHHIIhjFMM6c'lCCDDDDGd++ 	F$$w<?6*h66'/C<OPP`SVW^S_`FF \F+.~+>F1Ii((WGd++ 	FM[_bMbMbxC4GHHIIhjFMM#g,''''DEEE..vTbf.gg5<((WW3W==
 #u|1!4::<<<<"'2 3
 
 

 >c[`[rsss$$ 	3-? 	3(7799Gk"(,w66 $ 2 #of55+!2'+,
 
`	 `	 `	 `	 `	 `	 `	 `	 `	 `	D 2J???rZ   c                 F  
 t                      st          d          |                     |d                   }|                     |          \  t	          j        dd          }t          |j                  }j        j	        }t          j        |d                   }t          j        ||d          \  }} ||d	                              j                  

d
                             j                  
d
<   ||dd
fd}	 |	            S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr   r  rb   )srmonor  )sampling_rater  input_features)r  rQ   r  c               3      K    j         di }                     | j        d          d         }t          |          }|                    d           V  d S )NT)r  r   )rf   rO  rY   )r  batch_decode	sequencesr3   rU  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorr+  s      r[   _generate_transcriptionzDServeCommand.generate_transcription.<locals>._generate_transcription  s      0K0UU<UCTUUM!0!=!=m>Uko!=!p!pqr!s)/ABBBM"222EEGGGGGGrZ   )r   r  r  load_audio_model_and_processorr"   rF  r   rQ   feature_extractorr^  ior   librosaloadr  r   r   )r   r   r  r2  rQ   model_sampling_rateaudio_bytesaudio_array_ri  rf  rg  rh  r+  s             @@@@r[   r  z#ServeCommand.generate_transcription  s    $%% 	o   !% 7 7G E E'+'J'JK`'a'a$_2%4T
 
 
 >)F
 
 

 .?MjV-- k6IPTUUUQ&{BUfjkkknn
 
 *66F)G)J)J;K\)])]%& ,!2'+
 
	H 	H 	H 	H 	H 	H 	H 	H '&(((rZ   c                 N   |                     d          p|                     d          }d}| j        d}ngt          | j                  t          |          k    rd}n?t          t          | j                            D ]}| j        |         ||         k    rd} n|| _        |S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        r  r5  TNF)r   r  lenrange)r   r   r  req_continues_last_messagesis        r[   r.  zServeCommand.is_continuation  s     77:&&:#'''*:*:&*# %*/''#$$H55*/'' 3t12233  %a(HQK7727/E 8 &**rZ   r(   c                     | j         r)t          d| j        | j        | j        | j                  }n| j        rt          d          }nd}|S )a  
        Returns the quantization config for the given CLI arguments.

        Args:
            args (`ServeArguments`): The serve arguments. May contain quantization settings, device, etc.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        T)r   bnb_4bit_compute_dtyper   bnb_4bit_use_double_quantbnb_4bit_quant_storage)r   N)r   r(   r   r   r   r   )r   quantization_configs     r[   get_quantization_configz$ServeCommand.get_quantization_config   sv      	'"4!'+z$($<*.*C'+z# # #  	'"4!# # # #'""rZ   model_idc                 H    | j         j        | j         j        }d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        N@z@main)r   r   )r   r~  s     r[   r  zServeCommand.process_model_name  s6     9 ,y,H(??O!!!!rZ   r  c                 &   | j         }t                              d|            d|v r|                    dd          \  }}n|d}}t	          j        |||j                  }|j        dv r|j        nt          t          |j                  }| 
                    |          }||j        |d|j        d}|||d
<   t          j        |fi |}	t          t          |	j        d                   }
 |
j        |fi |}t          |dd	          |                    |j                  }|j        j        d	u o|j        j        dk    }|j        j        d	uo|j        j        dk     }|s|rd|j        _        t                              d|            ||fS )a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        zLoading r  r%   main)revisionr   )r   Nr   )r  r   r   
device_mapr   Nr|  r   hf_device_map   rz  zLoaded model )r   r   r   r  r'   from_pretrainedr   r   getattrr   r}  r   r   r  r  r  r   rQ   r   
max_length)r   r  r   r~  r  data_processorr   r|  model_kwargsr  architecturer   has_default_max_lengthhas_short_max_new_tokenss                 r[   _load_model_and_data_processorz+ServeCommand._load_model_and_data_processor.  s    y6466777'''!6!<!<S!!D!DHhh!6hH&6"4
 
 
 #jN::

tz@Z@Z"::4@@ !#'#; !%!7
 
 *2EL./+HEEEE|V-A!-DEE,,XFFFF5/4008HHT[))E #2d:gu?V?aeg?g 	 #2$>p5CZCilpCp 	! " 	:%= 	:59E#2;$9;;<<<n$$rZ   c                 Z   || j         vs| j         |                                         r=|                     |          \  }}t          || j        j        |          | j         |<   nC| j         |                                          | j         |         j        }| j         |         j        }||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   	r  r   r  r   r   r   r   r   r   )r   r  r   r   s       r[   r  z%ServeCommand.load_model_and_processork  s     !(:::d>PQf>g>r>r>t>t:#BBCXYYE98B $	 7#9 9 9D455 45AACCC&'<=CE*+@AKIirZ   c                 Z   || j         vs| j         |                                         r=|                     |          \  }}t          || j        j        |          | j         |<   nC| j         |                                          | j         |         j        }| j         |         j        }||fS )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        r  r  )r   r  rg  rh  s       r[   rj  z+ServeCommand.load_audio_model_and_processor  s     !(:::d>PQf>g>r>r>t>t:+/+N+NOd+e+e(K8B $	 7)9 9 9D455 45AACCC,-BCIK"01FGQOO++rZ   )r   NNNNNNN).rS   rT   rU   staticmethodr   r  r   r   r  r   r&  r1  r7  r;  r?  rW   r   r   r  r   r    rW  rZ  r  	functoolscacheanyr  r   rq  r   r  r  r   rs  rw  r  re   r.  r}  r  r  tupler  r!   rj  rY   rZ   r[   r   r     s?       	>N 	> 	> 	> \	>+^ + + + +Z// / !	/
 / / / /b
 
 
 
 

 
 
 
 

d 
 
 
 
 !%#"'+<@047;6G 6G6G #6G }	6G
 sm6G  }6G T"7896G  -6G 346G 
6G 6G 6G 6GpJ[ JS J J J J ]b ]b ]b~ _+T#s(^ 4 + + + _+ZZ;t Z; Z;Q_`cei`iQj Z; Z; Z; Z;x 	"3 	 	 	 	 \	 + x +  +  +  \+ ZGGD GGYsD$5O GG GG GG GGRc@T c@iT4.H c@ c@ c@ c@J	.)$ .)9S$_3M .) .) .) .)`+4 +D + + + +< #n #BV9W # # # \#8"3 "3 " " " "";%C ;% ;% ;% ;%z %( 	 "99	:       6,C ,ERcesRsLt , , , , , ,rZ   r   __main__)r  r  r   r  enumr  r   rl  r   r  r  r   rT  r  argparser   r   collections.abcr   r   r   
contextlibr   dataclassesr	   r
   r   r   typingr   r   r   huggingface_hubr   huggingface_hub.constantsr   tokenizers.decodersr   r  &transformers.models.auto.modeling_autor   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   r    r!   r"   utilsr#   r$   r&   r   r'   r(   r)   r*   generation.continuous_batchingr+   r,   rm  PILr-   r  r  r  r.   r/   fastapi.middleware.corsr0   fastapi.responsesr1   r2    openai.types.audio.transcriptionr3   .openai.types.audio.transcription_create_paramsr4   openai.types.chatr5   'openai.types.chat.chat_completion_chunkr6   r7   r8   r9   r:   *openai.types.chat.completion_create_paramsr;   openai.types.responsesr<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   -openai.types.responses.response_create_paramsrK   pydanticrL   rM   rN   rP   r_   ra   r4  r9  r=  r5  r:  r>  r  rS   r   r  r  r'  r-  r  Enumr   r   r  r   r   r   r   r   r  r  rY   rZ   r[   <module>r     s            				 				  				        . . . . . . . . ? ? ? ? ? ? ? ? ? ? * * * * * * ( ( ( ( ( ( ( (             - - - - - - - - - - & & & & & & 4 4 4 4 4 4 , , , , , ,                                        0 / / / / / / / ( ( ( ( ( (  
ZLLL            ZYYYYYYY NNN  k 4 4 6 6k;O;O;Q;QkViViVkVk    qNNN........666666AAAAAAAA>>>>>>\\\\\\<<<<<<              [ZZZZZ                                 " \[[[[[@@@@@@@@@@    4QY^        6U]b        0MUZ     %%NOO&;'RSS)k*OPP   % % %!.# # # 
	H	%	%
    !D!2!7!7!9!9::     ty   	    8	8/8 	8 8 8 8v       /@ /@ /@ /@ /@ /@ /@ /@d n n n n n n n nbG, G, G, G, G,- G, G, G,T& zLNNE	IIKKKKK rZ   