
    .`i2Q                        U d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
mZmZ d dlmZmZmZ d dlZd dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ erd dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' n
e(Z e(Z"e(Z$e(Z&e(Z' ee)          Z* e j+        dd          Z,e j+        e-dz           e.d<   de-dz  fdZ/e	de-ded         fd            Z0e G d d                      Z1dede2e-e2e-e3f         f         fdZ4e	ddde-fd            Z5 ed           Z6 ed!e e "          Z7 ed#e$e$"          Z8 ed$%           G d& d                      Z9 G d' d(          Z:dS ))    N)abstractmethod)	GeneratorMapping)contextmanager)	dataclassfield)TYPE_CHECKINGAnyoverload)TypeVar)init_logger)TokenizerLike)cached_processor_from_config) get_allowed_kwarg_only_overrides)JSONTreejson_map_leavesPretrainedConfigBatchFeatureProcessorMixin)ModelConfigObservabilityConfig_request_id_context)defaultreturnc                  4    t                                           S )z:Get the current request_id from the context, if available.)r   get     v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/multimodal/processing/context.pyget_current_request_idr#   /   s    ""$$$r!   
request_id)NNNc              #      K   t                               |           }	 dV  t                               |           dS # t                               |           w xY w)z>Context manager to set the request_id for the current context.N)r   setreset)r$   tokens     r"   set_request_idr)   4   s^        ##J//E)!!%(((((!!%((((s	   > Ac                       e Zd ZU dZdZeed<   	 dZeed<   	 dZeed<   	 dZ	eed<   	 dZ
eed<   	 deeef         fd	Zd
S )MultiModalProcessorTimingStatsz>Per-request timing statistics for multimodal processor stages.        hf_processor_timehashing_timecache_lookup_timeprompt_update_timepreprocessor_total_timer   c                 D    | j         | j        | j        | j        | j        dS )z5Convert stats to a dictionary for JSON serialization.r-   r.   r/   r0   r1   r3   selfs    r"   to_dictz&MultiModalProcessorTimingStats.to_dictQ   s1     "&!7 -!%!7"&"9'+'C
 
 	
r!   N)__name__
__module____qualname____doc__r-   float__annotations__r.   r/   r0   r1   dictstrr6   r    r!   r"   r+   r+   >   s         HH"u""">L%@"u"""; ####P%(U(((-
c5j) 
 
 
 
 
 
r!   r+   engine_clientc                    	 | j         j        j        si S n# t          t          f$ r i cY S w xY wi }	 | j        }|j        }t          |d          rF|                                }|0t          |d          r |j	        j
        }|                                }n# t          t          f$ r Y nw xY wi }	 t          | d          r|                     d          }|rt          |          dk    r|D ]}|s|                                D ]\  }	}
|	|vrt          |
          ||	<   ||	                             dd          }|
                    dd          }t#          ||          ||	         d<   ||	                             d	d          }|
                    d	d          }t#          ||          ||	         d	<   n# t          t          f$ r Y nw xY wi }|                                D ]\  }	}t          |          ||	<   |                                D ]t\  }	}|	|v r||	                             |           %|	                    d
          d         }|r ||v r||                             |           bt          |          ||	<   u|S )a5  
    Get all multimodal timing stats from the engine client.

    Collects both preprocessing stats (HF processor, hashing, cache lookup,
    prompt update) and encoder forward pass timing, merged by request_id.

    Args:
        engine_client: The engine client (has input_processor and workers).

    Returns:
        Dictionary mapping request_id to merged stats dict containing
        both preprocessing and encoder timing metrics.

    Example:
        {
            'request-123': {
                'hf_processor_time': 0.45,
                'hashing_time': 0.02,
                'cache_lookup_time': 0.01,
                'prompt_update_time': 0.03,
                'preprocessor_total_time': 0.51,
                'encoder_forward_time': 0.23,
                'num_encoder_calls': 1
            }
        }
    _get_mm_processorNinfocollective_rpcget_encoder_timing_statsr   encoder_forward_timer,   num_encoder_calls-)vllm_configobservability_configenable_mm_processor_statsAttributeErrorRuntimeErrorinput_processorinput_preprocessorhasattrrA   rB   ctxget_all_timing_statsrC   lenitemsr=   r   maxupdate
rpartition)r?   preprocessing_statsrM   rN   mm_processorrP   encoder_statsencoder_stats_resultsworker_statsr$   
stats_dictcurrent_timenew_timecurrent_calls	new_callsmerged_stats	prep_dictenc_dictpossible_original_ids                      r"   #get_timing_stats_from_engine_clientre   \   s[   :(=W 	I	L)   			 
'7,?%':;; 	A-??AAL'GL&,I,I'"'+&)&>&>&@&@#L)    M="233 	$1$@$@*% %! % -B)C)Ca)G)G$9  L' ! 2>2D2D2F2F  .
J%]::8<Z8H8HM*55 ,9+D+H+H 6, ,L (2~~6Lc'R'RHPS ,hQ QM*56LM -:*,E,I,I 3Q- -M )37JA(N(NIMP -yN NM*56IJJ#( L)    L!4!:!:!<!< 3 3
I#'	??Z   - 3 3 5 5 6 6
H%%$++H555  *44S99!< 	6$8L$H$H-.55h????'+H~~L$$s1    ,,A$B B+*B+1D	F; ;GGrP   InputProcessingContext
stage_namec              #     K   t                      }| |dV  dS |                     |          }|dV  dS t          j                    }	 dV  t          j                    |z
  }|dk    r|xj        |z  c_        nD|dk    r|xj        |z  c_        n-|dk    r|xj        |z  c_        n|dk    r|xj        |z  c_        |xj        |z  c_        dS # t          j                    |z
  }|dk    r|xj        |z  c_        nD|dk    r|xj        |z  c_        n-|dk    r|xj        |z  c_        n|dk    r|xj        |z  c_        |xj        |z  c_        w xY w)aZ  
    Context manager to time an operation using the context's timing stats.

    The request_id is automatically retrieved from the context variable,
    so it doesn't need to be passed as a parameter.

    Args:
        ctx: The InputProcessingContext containing the timing stats registry.
        stage_name: Name of the stage being timed.
    Nhf_processorhashingcache_lookupprompt_update)	r#   get_timing_statstimeperf_counterr-   r.   r/   r0   r1   )rP   rg   r$   stats
start_timeelapseds         r"   timed_preprocessor_operationrs      s      ())J
{j(  ,,E}"$$J1#%%
2''##w.###9$$')>))##w.###?**$$/$$%%0%%%% #%%
2''##w.###9$$')>))##w.###?**$$/$$%%0%%%%%%s   C BE_T_C)boundr   _PT)frozenc                   L   e Zd ZU dZeed<   	 edz  ed<   	  eddd          Zded<   	  ee	dd	          Z
e	eef         ed
<   	  eej        dd	          Zej        ed<   	 defdZedefd            Zedee         eee         df         z  defd            Z	 d)dee         eee         df         z  dz  defdZde	eef         fdZd Zededefd            Zedee         eee         df         z  dedefd            Z	 d)dee         eee         df         z  dz  dedefdZdee         dedefdZde de fdZ!i fdddded e"eef         de"eef         d!e#d"e#de$e z  fd#Z%d$ededz  fd%Z&d$edefd&Z'de#fd'Z(de	ee	ee)f         f         fd(Z*dS )*rf   zZ
    Contains information about the model which may be used to
    modify the inputs.
    model_configN	tokenizerF)r   comparereprzObservabilityConfig | NonerI   )default_factoryr|   r}   timing_stats_registry_timing_stats_registry_lockr   c                 <    | j         t          d          | j         S )Nz<You cannot pass text prompts when `skip_tokenizer_init=True`)r{   
ValueErrorr4   s    r"   get_tokenizerz$InputProcessingContext.get_tokenizer  s)    >!N   ~r!   c                    d S Nr    r4   s    r"   get_hf_configz$InputProcessingContext.get_hf_config  s    47Cr!   typ.c                    d S r   r    )r5   r   s     r"   r   z$InputProcessingContext.get_hf_config  s	    
 Sr!   c                    |ddl m} |}| j        j        }t	          ||          s"t          d| dt          |                     |S )z
        Get the HuggingFace configuration
        (`transformers.PretrainedConfig`) of the model,
        additionally checking its type.

        Raises:
            TypeError: If the configuration is not of the specified type.
        Nr   r   z3Invalid type of HuggingFace config. Expected type: z, but found type: ) transformers.configuration_utilsr   rz   	hf_config
isinstance	TypeErrortype)r5   r   r   r   s       r"   r   z$InputProcessingContext.get_hf_config  s|     ;IIIIII"C%/	)S)) 	1"%1 1#I1 1   r!   c                     | j         j        S )zQ
        Get the HuggingFace image processor configuration of the model.
        )rz   hf_image_processor_configr4   s    r"   get_hf_image_processor_configz4InputProcessingContext.get_hf_image_processor_config9  s      ::r!   c                 @    | j         j        }|t          d          |S )z
        Get the multimodal config of the model.

        Raises:
            RuntimeError: If the model is not a multimodal model.
        NzNot a multimodal model)rz   multimodal_configrL   )r5   	mm_configs     r"   get_mm_configz$InputProcessingContext.get_mm_config?  s*     %7	7888r!   kwargsc                    d S r   r    r5   r   s     r"   get_hf_processorz'InputProcessingContext.get_hf_processorL  s    GJsr!   c                    d S r   r    )r5   r   r   s      r"   r   z'InputProcessingContext.get_hf_processorO  s	     Sr!   c                    |ddl m} |}ddlm} | j        }t          ||          r|j        }t          | j        f||d|S )z
        Get the HuggingFace processor
        (`transformers.ProcessorMixin`) of the model,
        additionally checking its type.

        Raises:
            TypeError: If the processor is not of the specified type.
        Nr   r   )MistralTokenizer)processor_clsr{   )	transformers.processing_utilsr   vllm.tokenizers.mistralr   r{   r   transformers_tokenizerr   rz   )r5   r   r   r   r   r{   s         r"   r   z'InputProcessingContext.get_hf_processorW  s     ;DDDDDD C<<<<<<N	i!122 	9!8I+

 
 	
 
 	
r!   c                f    | j                                         }|j        }|i }i ||} |di |S )z
        Initialize a HuggingFace-like processor class, merging the
        keyword arguments with those in the model's configuration.
        Nr    )rz   get_multimodal_configmm_processor_kwargs)r5   r   r   r   base_kwargsmerged_kwargss         r"   init_processorz%InputProcessingContext.init_processorw  sP     %;;==	3K1;1&1s##]###r!   outputc                 >     dt           f fd}t          ||          S )Nxc                     t          | t          j                  r4|                                 r |                     j        j                  } | S )N)dtype)r   torchTensoris_floating_pointtorz   r   )r   r5   s    r"   _postprocess_onezDInputProcessingContext._postprocess_output.<locals>._postprocess_one  sI    !U\** <&&(( <4#4#:;;AHr!   )objectr   )r5   r   r   s   `  r"   _postprocess_outputz*InputProcessingContext._postprocess_output  s;    	 	 	 	 	 	 	 /888r!         	num_tries	max_triesri   datar   r   c                >   t          |          sJ | j                                        }|                    |          }t	          ||dd          }	  |di ||ddi}	n# t
          $ r}
t          |
t                    rk|
ri|
j        d         dk    rX||k     rRt          
                    d||           t          j        d	           |                     ||||d
z   |          cY d}
~
S dt          |          j         d| d| }t!          |          |
d}
~
ww xY wddlm} t          |	|          r%|                     |	j                  } ||          S t                              dt          |          j                   |                     |	          S )z~
        Call `hf_processor` on the prompt `data`
        (text, image, audio...) with configurable options `kwargs`.
        FT)requires_kw_onlyallow_var_kwargsreturn_tensorsptr   zAlready borrowedzBFailed to acquire tokenizer in current thread. Retrying (%d/%d)...g      ?r   r   NzFailed to apply z	 on data=z with kwargs=r   z{%s did not return `BatchFeature`. Make sure to match the behaviour of `ProcessorMixin` when implementing custom processors.r    )callablerz   r   merge_mm_processor_kwargsr   	Exceptionr   rL   argsloggerwarningrn   sleepcall_hf_processorr   r7   r   %transformers.feature_extraction_utilsr   r   r   warning_once)r5   ri   r   r   r   r   r   r   allowed_kwargsr   excmsgr   output_s                 r"   r   z(InputProcessingContext.call_hf_processor  s8    %%%%%%;;==	!;;FCC9"!	
 
 
	+!\PPDPNPP4PPPFF 	+ 	+ 	+ 3-- HQK#555	))*	   
3-- '!m' .        ?4#5#5#> ? ?? ?.<? ? 
 S//s*9	+> 	GFFFFFfl++ 	)..v{;;G<(((. '		
 	
 	
 ''///s%   A# #
D-A:D'D--DDr$   c                     | j         | j         j        sdS | j        5  | j                            |          cddd           S # 1 swxY w Y   dS )z1
        Get timing stats for a request.
        N)rI   rJ   r   r   r   )r5   r$   s     r"   rm   z'InputProcessingContext.get_timing_stats  s     %-,F . 4- 	> 	>-11*==	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   AAAc                     | j         | j         j        st                      S | j        5  || j        v rt          d|           t                      }|| j        |<   |cddd           S # 1 swxY w Y   dS )z
        Create and store timing stats in the registry for a request.

        This should be called at the start of processing for a request.
        The stats object is created immediately and stored in the registry.
        Nz+Timing stats already exist for request_id: )rI   rJ   r+   r   r   r   )r5   r$   rp   s      r"   create_timing_statsz*InputProcessingContext.create_timing_stats  s     %-,F . 2333- 	 	T777 N*NN   344E5:D&z2	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   5A++A/2A/c                     | j         | j         j        sdS | j        5  t          | j                  }| j                                         |cddd           S # 1 swxY w Y   dS )zY
        Clear all stats from the registry. Returns the number of stats cleared.
        Nr   )rI   rJ   r   rR   r   clear)r5   counts     r"   clear_timing_stats_registryz2InputProcessingContext.clear_timing_stats_registry  s    
 %-,F . 1- 	 	233E&,,...	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   /AA Ac                     | j         | j         j        si S | j        5  d | j                                        D             cddd           S # 1 swxY w Y   dS )zI
        Get all timing stats as a dictionary for API endpoints.
        Nc                 >    i | ]\  }}||                                 S r    )r6   ).0ridrp   s      r"   
<dictcomp>z?InputProcessingContext.get_all_timing_stats.<locals>.<dictcomp>  s6       C U]]__  r!   )rI   rJ   r   r   rS   r4   s    r"   rQ   z+InputProcessingContext.get_all_timing_stats  s    
 %-,F . I- 	 	 "&"<"B"B"D"D  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   #AAAr   )+r7   r8   r9   r:   r   r<   r   r   rI   r=   r   r>   r+   	threadingLockr   r   r   r   r   r   ru   tupler
   r   r   r   r   r   rw   rt   r   r   r   r   intr   r   rm   r   r   r;   rQ   r    r!   r"   rf   rf      s8         
 )t####49>e%: : :6    4GLue%H H H4%C CD    A27%!E3 3 3    @}     7"2777 X7"Xd2hm,, 
	   X 9= #YtCy#~..5 
	   8;tCH~ ; ; ; ;   JFJ~JJJ XJ"Xd2hm,, 	
 
   X 9=
 
#YtCy#~..5
 	

 

 
 
 
@$"X$ 	$
 
$ $ $ $&99 
9 9 9 9$ (*	G0 G0 G0 G0$G0 c6k"G0 V$	G0 G0 G0 
	 G0 G0 G0 G0R>>	'$	.> > > >c 6T    ,S    d3S%Z0@+@&A      r!   c                       e Zd ZdZdeddf fdZedefd            Zde	fdZ
defdZd	edefd
Zedefd            Zedeeedz  f         fd            Zdeeef         fdZdedeeef         deeef         dz  fdZ xZS )BaseProcessingInfozDBase class to provide the information necessary for data processing.rP   r   Nc                 V    t                                                       || _        d S r   )super__init__rP   )r5   rP   	__class__s     r"   r   zBaseProcessingInfo.__init__&  s$    r!   c                 $    | j         j        j        S r   )rP   rz   modelr4   s    r"   model_idzBaseProcessingInfo.model_id+  s    x$**r!   c                 4    | j                                         S r   )rP   r   r4   s    r"   r   z BaseProcessingInfo.get_tokenizer/      x%%'''r!   c                 4    | j                                         S r   )rP   r   r4   s    r"   r   z BaseProcessingInfo.get_hf_config2  r   r!   r   c                 &     | j         j        di |S )zy
        Subclasses can override this method to handle
        specific kwargs from model config or user inputs.
        r    )rP   r   r   s     r"   r   z#BaseProcessingInfo.get_hf_processor5  s    
 )tx(226222r!   c                     dS )NFr    r4   s    r"   skip_prompt_length_checkz+BaseProcessingInfo.skip_prompt_length_check<  s    ur!   c                     t           )z
        Return the maximum supported number of items for each modality.

        A value of `None` means unlimited number of items.

        Omitting a modality from the returned dictionary means that
        it is not supported at all.
        )NotImplementedErrorr4   s    r"   get_supported_mm_limitsz*BaseProcessingInfo.get_supported_mm_limits@  s
     "!r!   c                 ,   |                                  }| j                                        }t          t          t
          f                     }|                                D ]1\  }}|                    |          }||nt          ||          ||<   2|S )z=Return the maximum allowed number of items for each modality.)	r   rP   r   r=   r>   r   rS   get_limit_per_promptmin)r5   supported_mm_limitsr   allowed_limitsmodalitysupported_limit
user_limits          r"   get_allowed_mm_limitsz(BaseProcessingInfo.get_allowed_mm_limitsL  s    "::<<H**,,	c3h)))<)B)B)D)D 	 	%Ho"77AAJ #* 
_55 8$$ r!   seq_len	mm_countsc                     dS )a  
        Return the maximum number of tokens per item of for each modality.

        When `None` (the default) is returned, vLLM will generate dummy inputs
        (images/videos) at maximum possible sizes and process them to determine
        the maximum token count per modality.

        This approach works but can be very slow for certain models (e.g.,
        Qwen2.5-VL), leading to very long startup time. For better performance,
        each model can override this method to return pre-computed maximum token
        counts, avoiding the need for dummy input generation and processing.

        Note:
            The maximum number of tokens per item of each modality returned
            from this function should respect the model's maximum sequence
            length and the maximum number of items of each modality allowed,
            and agree with dummy inputs (images/videos) at maximum possible
            sizes.
        Nr    )r5   r   r   s      r"   get_mm_max_tokens_per_itemz-BaseProcessingInfo.get_mm_max_tokens_per_item]  s	    0 tr!   )r7   r8   r9   r:   rf   r   propertyr>   r   r   r   r   r   r   r   r   boolr   r   r   r   r   r   r   __classcell__)r   s   @r"   r   r   #  s       NN2 t      
 +# + + + X+(} ( ( ( ((/ ( ( ( (3 3N 3 3 3 3 $    X 	"cDj)A 	" 	" 	" ^	"wsCx'8    " 38$ 
c	T	!	       r!   r   );contextvarsr   rn   abcr   collections.abcr   r   
contextlibr   dataclassesr   r   typingr	   r
   r   r   typing_extensionsr   vllm.loggerr   vllm.tokenizersr   !vllm.transformers_utils.processorr   vllm.utils.func_utilsr   vllm.utils.jsontreer   r   r   r   r   r   r   r   vllm.configr   r   r   r7   r   
ContextVarr   r>   r<   r#   r)   r+   r=   r;   re   rs   rt   ru   rw   rf   r   r    r!   r"   <module>r     s                   . . . . . . . . % % % % % % ( ( ( ( ( ( ( (           % % % % % % # # # # # # ) ) ) ) ) ) J J J J J J B B B B B B 9 9 9 9 9 9 9 9 !AAAAAABBBBBB<<<<<<<<<<<<<<<LNK 	X		 ;Q+:P4; ; ; [+C$J7   
%d
 % % % %
 )s )y1A'B ) ) ) ) 
 
 
 
 
 
 
 
:ee	#tCJ
 e e e eP "1&> "1C "1 "1 "1 "1J WT]]WT)3CDDDWT@@@ $p p p p p p p pf	R R R R R R R R R Rr!   