
    .`i                     <   U d dl mZmZmZmZ d dlmZmZmZ d dl	m
Z
mZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, ddl-m.Z.m/Z/ e
rd dlm0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 n
e9Z0e9Z2e9Z4e9Z6e9Z8 e$e:          Z;e<e         ez  e=edf         z  Z>ee?d<   	 dedz  defdZ@ eAejB        e.f                     ZCe G d de                      ZDe G d de                      ZEed eFe9         deeFeD                  fd!            ZGed e9deeD         fd"            ZGd eFe9         e9z  deeFeD                  eeD         z  fd#ZGd eFe9         e9z  deHfd$ZId eFe9         e9z  deHfd%ZJd eFe9         e9z  deHfd&ZKed eFe9         deeFeE                  fd'            ZLed e9deeE         fd(            ZLd eFe9         e9z  deeFeE                  eeE         z  fd)ZLe G d* d+e                      ZMed eFe9         deeFeM                  fd,            ZNed e9deeM         fd-            ZNd eFe9         e9z  deeFeM                  eeM         z  fd.ZNe G d/ d0e                      ZOe G d1 d2e                      ZPed eFe9         deeFeO                  fd3            ZQed e9deeO         fd4            ZQd eFe9         e9z  deeFeO                  eeO         z  fd5ZQd eFe9         e9z  deHfd6ZRe G d7 d8e                      ZSe G d9 d:e                      ZTed eFe9         deeFeS                  fd;            ZUed e9deeS         fd<            ZUd eFe9         e9z  deHeeFeS                  z  eeS         z  fd=ZUd eFe9         e9z  deHfd>ZVd eFe9         e9z  deHfd?ZWe G d@ dAe                      ZXed e9deeX         fdB            ZYed eFe9         deeFeX                  fdC            ZYd eFe9         e9z  deeFeX                  eeX         z  fdDZYe G dE dFe                      ZZed e9deeZ         fdG            Z[ed eFe9         deeFeZ                  fdH            Z[d eFe9         e9z  deeFeZ                  eeZ         z  fdIZ[e G dJ dKe                      Z\ed e9dee\         fdL            Z]ed eFe9         deeFe\                  fdM            Z]d eFe9         e9z  deeFe\                  ee\         z  fdNZ]e G dO dPe                      Z^d e9dee^         fdQZ_e G dR dSe                      Z`ed e9dee`         fdT            Zaed eFe9         deeFe`                  fdU            Zad eFe9         e9z  deeFe`                  ee`         z  fdVZae G dW dXe                      Zbed e9deeb         fdY            Zced eFe9         deeFeb                  fdZ            Zcd eFe9         e9z  deeFeb                  eeb         z  fd[Zce G d\ d]e                      Zded eFe9         deeFed                  fd^            Zeed e9deed         fd_            Zed eFe9         e9z  deeFed                  eed         z  fd`Zfd eFe9         e9z  deeFed                  eed         z  fdaZe G db dc          Zge G dd dee                      Zhed eFe9         deeFeh                  fdf            Zied e9deeh         fdg            Zid eFe9         e9z  deeFeh                  eeh         z  fdhZie G di dje                      Zjed eFe9         deeFej                  fdk            Zked e9deej         fdl            Zkd eFe9         e9z  deeFej                  eej         z  fdmZke G dn doeje                      Zled eFe9         deeFel                  fdp            Zmed e9deel         fdq            Zmd eFe9         e9z  deeFel                  eel         z  fdrZme G ds dteje                      Zned eFe9         deeFen                  fdu            Zoed e9deen         fdv            Zod eFe9         e9z  deeFen                  een         z  fdwZoe G dx dye                      Zped eFe9         deeFep                  fdz            Zqed e9deep         fd{            Zqd eFe9         e9z  deeFep                  eep         z  fd|Zqe G d} d~e                      Zred eFe9         deeFer                  fd            Zsed e9deer         fd            Zsd eFe9         e9z  deeFer                  eer         z  fdZsdS )    )CallableIterableMappingMutableSequence)	ExitStackcontextmanagernullcontext)TYPE_CHECKINGClassVarLiteralProtocol	TypeAliasoverloadruntime_checkableN)Tensor)	LANGUAGES)SelfTypeIs)ModelConfigSpeechToTextConfig)TokensPrompt)
PromptType)init_logger)MambaStateCopyFunc)QuantizationConfig)common_prefix)supports_kw   )	VllmModelis_pooling_model
VllmConfig)WeightsMapper)MultiModalFeatureSpec)_ProcessorFactories)IntermediateTensors.MultiModalEmbeddingsis_multimodalreturnc                 (    | t          d          | S )z
    A helper function to be used in the context of
    [vllm.model_executor.models.interfaces.SupportsMultiModal.embed_input_ids][]
    to provide a better error message.
    Nz`embed_input_ids` now requires `is_multimodal` arg, please update your model runner according to https://github.com/vllm-project/vllm/pull/16229.)
ValueError)r(   s    y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/interfaces.py_require_is_multimodalr-   ;   s)     ?
 
 	
     c                      e Zd ZU dZdZeed                  ed<   	 dZee	         ed<   	 dZ
ee	         ed<   	 dZee	         ed<   	 ee         ed<   	 g Zee         ed	<   	 g Zee         ed
<   	 ededededz  fd            ZdedefdZdefdZedddedeej                 eeej                 df         z  dz  fd            Zedddede e         ez  deej                 eeej                 df         z  dz  fd            Z!ededeej                 eeej                 df         z  de"eeej                 eeej                 df         z  f         fd            Z#dedefdZ$dedefd Z%e&d!e'de'fd"            Z(e&dd#d!e'd$ed%e)j'        d&e	de'f
d'            Z(d!e'd(e*e'ge'f         d%e'dz  d&e	de'f
d)Z+	 d,ddd*d!e'd$edz  d%e'dz  d&e	de'f
d+Z(dS )-SupportsMultiModalz2The interface required for all multi-modal models.Tsupports_multimodalF"supports_multimodal_raw_input_onlysupports_encoder_tp_datarequires_raw_input_tokens_processor_factory_language_model_names_tower_model_namesmodalityir)   Nc                     dS )zW
        Get the placeholder text for the `i`th `modality` item in the prompt.
        N )clsr8   r9   s      r,   get_placeholder_strz&SupportsMultiModal.get_placeholder_str}   	    
 	r.   kwargsc                     dS )a?  
        Returns multimodal embeddings generated from multimodal kwargs
        to be merged with text embeddings.

        Note:
            The returned multimodal embeddings must be in the same order as
            the appearances of their corresponding multimodal data item in the
            input prompt.
        Nr;   )selfr?   s     r,   embed_multimodalz#SupportsMultiModal.embed_multimodal   s	     	r.   c                    | t           v rt           |          S | j        rU| }t          d | j        D                       D ]}|rt          ||          }|| urt	          |d          r|t           | <   |S |                                 D ] }t	          |d          r|t           | <   |c S !t          dt          |           j         d          )a=  
        Returns the underlying language model used for text generation.

        This is typically the `torch.nn.Module` instance responsible for
        processing the merged multimodal embeddings and producing hidden states

        Returns:
            torch.nn.Module: The core language model component.
        c                 8    g | ]}|                     d           S ).)split).0names     r,   
<listcomp>z9SupportsMultiModal.get_language_model.<locals>.<listcomp>   s"    HHHTCHHHr.   embed_input_idszNo language model found in z6! You should initialize it via `_mark_language_model`.)	_language_model_by_moduler6   r   getattrhasattrchildrenNotImplementedErrortype__name__)rA   modattrs      r,   get_language_modelz%SupportsMultiModal.get_language_model   s    ,,,,T22% 
	C%HHT-GHHH  - -  -!#t,,C$730A#B#B25)$/
 ==?? 	 	Cs-.. 25)$/


 "C$t***= C C C
 
 	
r.   targetsvllm_configrV   .c             #     K   ddl mm}m} |j        j        } || |          5 }|j        r || fd|          nt                      5  dV  ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   || _        dS )ao  
        Mark each child module that was assigned to this model during this context
        as a language model component.

        Language model components are automatically skipped in `--mm-encoder-only`
        mode.

        If `targets` is set, instead include descendants that are an instance
        of `targets`, even if they aren't direct children.
        r   StageMissingLayercollect_childrenno_init_weightsrU   c                      d|           S )Nlanguage_modelr;   )rR   rZ   s    r,   <lambda>z9SupportsMultiModal._mark_language_model.<locals>.<lambda>   s     1 12BC H H r.   N)	utilsrZ   r[   r\   model_configmultimodal_configmm_encoder_onlyr	   r6   )rA   rW   rV   r[   r\   	mm_configchildren_namesrZ   s          @r,   _mark_language_modelz'SupportsMultiModal._mark_language_model   sN     " 	POOOOOOOOO,>	dG444 
	 ,#HHHH#    !]]	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 &4"""s5   'A6AA6A#	#A6&A#	'A66A:=A:
modalitiesc             #     	K   ddl mm}m} t	          |t
                    r|h}|ddhk    rd	nd                    g |d          	|j        j         || |          5 }t          fd	|D                       r || 	fd
|          nt                      5  dV  ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   || _        dS )a  
        Mark each child module that was assigned to this model during this context
        as a tower model component.

        Tower model components are automatically skipped when `--limit-mm-per-prompt`
        is set to zero for all of their modalities.

        If `targets` is set, instead include descendants that are an instance
        of `targets`, even if they aren't direct children.
        r   rY   imagevideovision_tower_towerrU   c              3   J   K   | ]}                     |          d k    V  dS )r   N)get_limit_per_prompt)rG   mrd   s     r,   	<genexpr>z7SupportsMultiModal._mark_tower_model.<locals>.<genexpr>   s6      RR!y55a88A=RRRRRRr.   c                      |           S Nr;   )rR   rZ   
stage_names    r,   r_   z6SupportsMultiModal._mark_tower_model.<locals>.<lambda>   s     1 1*c B B r.   N)r`   rZ   r[   r\   
isinstancestrjoinra   rb   allr	   r7   )
rA   rW   rg   rV   r[   r\   re   rZ   rd   rt   s
          @@@r,   _mark_tower_modelz$SupportsMultiModal._mark_tower_model   s     $ 	POOOOOOOOOj#&& 	&$J'7+++'JJ"8J"8"899J,>	dG444 
	 RRRRzRRRRR#BBBBB#    !]]	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 #1s6   %<C	!B2&C	2B6	6C	9B6	:C		CClanguage_targetstower_targetsc          	   #   >  K   t                      5 }|                    |                     ||                     |                                D ]0\  }}|                    |                     |||                     1dV  ddd           dS # 1 swxY w Y   dS )zl
        Composite wrapper over `_mark_language_model` and
        `_mark_tower_model` by modality.
        rU   N)r   enter_contextrf   itemsry   )rA   rW   rz   r{   stackr8   modality_targetss          r,   _mark_composite_modelz(SupportsMultiModal._mark_composite_model  s      [[ 	E)), *     /<.A.A.C.C  **##**#  0 +      EEE#	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   A4BBBnum_image_tokensc                     dS )z
        Implement this function to enable LoRA support
        for the tower module of the multi-modal model.
        Given the number of image tokens, output the number of
        multi-modal encoder tokens.
        Nr;   )rA   r   s     r,   get_num_mm_encoder_tokensz,SupportsMultiModal.get_num_mm_encoder_tokens#  	     	r.   num_vision_tokensc                     dS )z
        Implement this function to enable LoRA support
        for the connector module of the multi-modal model.
        Given the number of vision tokens, output the number of
        multi-modal connector tokens.
        Nr;   )rA   r   s     r,   get_num_mm_connector_tokensz.SupportsMultiModal.get_num_mm_connector_tokens,  r   r.   	input_idsc                     d S rs   r;   )rA   r   s     r,   rJ   z"SupportsMultiModal.embed_input_ids5      <?Cr.   )handle_oov_mm_tokenmultimodal_embeddingsr(   r   c                    d S rs   r;   )rA   r   r   r(   r   s        r,   rJ   z"SupportsMultiModal.embed_input_ids8  s	     r.   rJ   c                   |rv|t| } |||                   }t          j        |j        d         |j        d         f|j        |j                                      |                    d          |          S  ||          S )Nr   r   )dtypedevice)torchemptyshaper   r   masked_scatter_
unsqueeze_)rA   r   rJ   r(   r   is_texttext_embedss          r,   _embed_text_input_idsz(SupportsMultiModal._embed_text_input_idsB  s      	C=#<$nG)/)G*<==K;#[%6q%9:!'")   og0044kBB	C y)))r.   r(   r   c                    ddl m} |                     ||                                 j        ||          }|t          |          dk    r|S  |||t          |                    S )a  
        Apply token embeddings to `input_ids`.

        If `multimodal_embeddings` is passed, scatter them into
        `input_ids` according to the mask `is_multimodal`.

        In case the multi-modal token IDs exceed the vocabulary size of
        the language model, you can set `handle_oov_mm_token=False`
        to avoid calling the language model's `embed_input_ids` method
        on those tokens. Note however that doing so increases memory usage
        as an additional buffer is needed to hold the input embeddings.
        r   )_merge_multimodal_embeddingsr   Nr   )inputs_embedsr   r(   )r`   r   r   rT   rJ   lenr-   )rA   r   r   r(   r   r   r   s          r,   rJ   z"SupportsMultiModal.embed_input_idsV  s    ( 	87777722##%%5' 3	 3 
 
 !(C0E,F,F!,K,K  ++'"70??
 
 
 	
r.   rs   ),rQ   
__module____qualname____doc__r1   r   r   __annotations__r2   boolr3   r4   r%   r6   listrv   r7   classmethodintr=   objectr'   rB   r   rT   r   r"   rP   nnModuletuplerf   setry   dictr   r   r   r   r   rJ   r   r   r   r;   r.   r,   r0   r0   O   s1        <<37'$-0777 :?&>>>
 05htn444
 16x~555
 !!45555 (*49))) %'S	&&& 3 3 3:    [
 
4H 
 
 
 
#
I #
 #
 #
 #
J 
 IM	 4  4  4 4 bi5bi#)=#>>E	 4  4  4 ^ 4D  IM)1 )1 )1)1 HsN)1
 bi5bi#)=#>>E)1 )1 )1 ^)1V  ry/E$ry/32F,GG	
 Cbi5bi#9M3N!NNO   ^<# #    S S     ??F??? X? %*    4
 | " 
   X** "6(F"23*
 }* "* 
* * * *. >B$

 (,$)$
 $
 $
$
  4d:$

 }$
 "$
 
$
 $
 $
 $
 $
 $
r.   r0   c                   ~    e Zd ZU dZdZeed                  ed<   dee	         de
dej        de	dee
ee	f         f
d	Zd
S )SupportsMultiModalPruningzThe interface required for models that support returning both input
    embeddings and positions. Model may require custom positions for dynamic
    pruning of multimodal embeddings.
    Tsupports_multimodal_pruningr   r   mrope_positionsnum_computed_tokensr)   c                     dS )am  
        Update part of input mrope positions (starting with
        num_computed_tokens index). Original mrope_positions are computed
        for unpruned sequence and becomes incorrect once pruning occurs,
        so once we prune media tokens we should reflect this in the
        mrope_positions before we feed it to LLM.

        Args:
            input_ids: (N,) All input tokens of the prompt containing
                entire sequence.
            multimodal_embeddings: Tuple of multimodal embeddings that
                fits into the prefill chunk that is being processed.
            mrope_positions: Existing mrope positions (3, N) for entire
                sequence
            num_computed_tokens: A number of computed tokens so far.

        Returns:
            Tuple of (multimodal_embeddings, mrope_positions,
                mrope_position_delta).
        Nr;   )rA   r   r   r   r   s        r,   recompute_mrope_positionsz3SupportsMultiModalPruning.recompute_mrope_positions  s	    6 	r.   N)rQ   r   r   r   r   r   r   r   r   r   r'   r   
LongTensorr   r   r   r;   r.   r,   r   r   }  s          
 <@'$-!8???9  4 )	
 ! 
#VS0	1     r.   r   modelc                     d S rs   r;   r   s    r,   r1   r1     s    RURUr.   c                     d S rs   r;   r   s    r,   r1   r1         FIcr.   c                 $    t          | dd          S )Nr1   FrL   r   s    r,   r1   r1     s     5/777r.   c                 $    t          | dd          S )Nr2   Fr   r   s    r,   r2   r2     s    5>FFFr.   c                 $    t          | dd          S )Nr4   Fr   r   s    r,   r4   r4     s    55u===r.   c                 $    t          | dd          S )Nr3   Fr   r   s    r,   #supports_multimodal_encoder_tp_datar     s    54e<<<r.   c                     d S rs   r;   r   s    r,   r   r     s	     /2cr.   c                     d S rs   r;   r   s    r,   r   r     s    UXUXr.   c                 $    t          | dd          S )Nr   Fr   r   s    r,   r   r     s     57???r.   c                       e Zd ZU dZdZeed                  ed<   	 ede	de	de	dz  fd            Z
ed	eddfd
            ZdS )SupportsScoreTemplatezBThe interface required for all models that support score template.Tsupports_score_templatequerydocumentr)   Nc                     dS )zj
        Generate a full prompt by populating the score template with query and document content.
        Nr;   )r<   r   r   s      r,   get_score_templatez(SupportsScoreTemplate.get_score_template  r>   r.   promptc                     dS )zR
        Perform architecture-specific manipulations on the input tokens.
        Nr;   )r<   r   s     r,   post_process_tokensz)SupportsScoreTemplate.post_process_tokens  r>   r.   )rQ   r   r   r   r   r   r   r   r   rv   r   r   r   r;   r.   r,   r   r     s         LL7;Xgdm4;;; s c cDj    [  $    [  r.   r   c                     d S rs   r;   r   s    r,   r   r     	     +.#r.   c                     d S rs   r;   r   s    r,   r   r         MPSr.   c                 $    t          | dd          S )Nr   Fr   r   s    r,   r   r          53U;;;r.   c                       e Zd ZU dZdZeed                  ed<   	 dZee	         ed<   dZ
ee	         ed<   i Zeeeef                  ed<   i Zeeee         f         ed<   d	S )
SupportsLoRAz8The interface required for all models that support LoRA.Tsupports_loraFis_3d_moe_weightis_non_gated_moeembedding_modulespacked_modules_mappingN)rQ   r   r   r   r   r   r   r   r   r   r   r   r   rv   r   r   r;   r.   r,   r   r     s         BB-1M8GDM*111 (-htn,,,',htn,,, 35xS#X/44435Dd3i055555r.   r   c                   f    e Zd ZU ed         ed<   eeee         f         ed<   eeef         ed<   dS )_SupportsLoRATypeTr   r   r   N)rQ   r   r   r   r   r   rv   r   r;   r.   r,   r   r     sO         4=    d3i0000CH~%%%%%r.   r   c                     d S rs   r;   r   s    r,   r   r     r   r.   c                     d S rs   r;   r   s    r,   r   r     s    :=#r.   c                      t                     }|sjd}t           fd|D                       }t           dd          r|rt                              d |           n|st                              d            |S )N)r   r   c              3   <   K   | ]}t          |          |V  d S rs   rM   rG   rS   r   s     r,   rq   z supports_lora.<locals>.<genexpr>*  s3      VVtPTAUAUVdVVVVVVr.   r   FzUThe model (%s) sets `supports_lora=True`, but is missing LoRA-specific attributes: %sz\The model (%s) contains all LoRA-specific attributes, but does not set `supports_lora=True`.)_supports_lorar   rL   loggerwarning)r   result
lora_attrsmissing_attrss   `   r,   r   r      s     E""F 

 VVVVzVVVVV5/511 	 B!	   ! =   Mr.   c                     t          | t                    rt          | t                    S t          | t                    S rs   )ru   rP   r   r   r   s    r,   r   r   ?  s5    % 4%!2333e\***r.   c                       e Zd ZU dZdZeed                  ed<   	 dede	j
        de	j        defdZd	ed
z  ded
z  fdZd
S )
SupportsPPzEThe interface required for all models that support pipeline parallel.Tsupports_pp
batch_sizer   r   r)   c                     dS )z/Called when PP rank > 0 for profiling purposes.Nr;   rA   r   r   r   s       r,   make_empty_intermediate_tensorsz*SupportsPP.make_empty_intermediate_tensorsS  r   r.   intermediate_tensorsNc                    dS )z
        Accept [`IntermediateTensors`][vllm.sequence.IntermediateTensors] when
        PP rank > 0.

        Return [`IntermediateTensors`][vllm.sequence.IntermediateTensors] only
        for the last PP rank.
        Nr;   rA   r   s     r,   forwardzSupportsPP.forward\  s	     	r.   )rQ   r   r   r   r   r   r   r   r   r   r   r   r&   r   r   r;   r.   r,   r   r   F  s         OO+/K'$-(/// { 	
 
    2D8 
t	#	     r.   r   c                   n    e Zd ZU ed         ed<   dedej        dej        de	fdZ
de	d	z  dee	z  fd
Zd	S )_SupportsPPTypeTr   r   r   r   r)   c                     d S rs   r;   r   s       r,   r   z/_SupportsPPType.make_empty_intermediate_tensorsq  s	    
 "cr.   r   Nc                    d S rs   r;   r   s     r,   r   z_SupportsPPType.forwardx  s	     (+sr.   )rQ   r   r   r   r   r   r   r   r   r&   r   r   r   r;   r.   r,   r   r   m  s         "" {" 	"
 
" " " "+ 2D8+ 
%	%	+ + + + + +r.   r   c                     d S rs   r;   r   s    r,   r   r     s    BE#r.   c                     d S rs   r;   r   s    r,   r   r     s    69cr.   c                 ^    t                     }t                     }|r|st                              d            |sjd}t	           fd|D                       }t           dd          r|rt                              d |           n|st                              d            |o|S )NzjThe model (%s) sets `supports_pp=True`, but does not accept `intermediate_tensors` in its `forward` method)r   c              3   <   K   | ]}t          |          |V  d S rs   r   r   s     r,   rq   zsupports_pp.<locals>.<genexpr>  s2      TTtwud?S?STdTTTTTTr.   r   FzQThe model (%s) sets `supports_pp=True`, but is missing PP-specific attributes: %szXThe model (%s) contains all PP-specific attributes, but does not set `supports_pp=True`.)_supports_pp_attributes_supports_pp_inspectr   r   r   rL   )r   supports_attributessupports_inspectpp_attrsr   s   `    r,   r   r     s     2%88+E22 
#3 
=	
 	
 	
  7TTTTxTTTTT5-// 	 @!	   ! ;   3#33r.   c                     t          | t                    rt          | t                    S t          | t                    S rs   )ru   rP   r   r   r   s    r,   r   r     s4    % 2%111eZ(((r.   c                 f    t          | dd           }t          |          sdS t          |d          S )Nr   Fr   )rL   callabler   )r   model_forwards     r,   r   r     s9    E9d33MM"" u}&<===r.   c                   :    e Zd ZU dZdZeed                  ed<   dS )HasInnerStatez;The interface required for all models that has inner state.Thas_inner_stateN)rQ   r   r   r   r  r   r   r   r;   r.   r,   r  r    s5         EE/3OXgdm,333 r.   r  c                     d S rs   r;   r   s    r,   r  r    s    =@Sr.   c                     d S rs   r;   r   s    r,   r  r    s    ILr.   c                 $    t          | dd          S )Nr  Fr   r   s    r,   r  r    s     5+U333r.   c                   :    e Zd ZU dZdZeed                  ed<   dS )IsAttentionFreezThe interface required for all models like Mamba that lack attention,
    but do have state whose size is constant wrt the number of tokens.Tis_attention_freeN)rQ   r   r   r   r  r   r   r   r;   r.   r,   r  r    s>         J J 26x.555 r.   r  c                     d S rs   r;   r   s    r,   r  r    s    ADr.   c                     d S rs   r;   r   s    r,   r  r    r   r.   c                 $    t          | dd          S )Nr  Fr   r   s    r,   r  r    s     5-u555r.   c            
           e Zd ZU dZdZeed                  ed<   	 ede	de
e
eef         e
eeef         f         fd            Zede
edf         fd            Zd	S )
IsHybridzThe interface required for all models like Jamba that have both
    attention and mamba blocks, indicates that
    hf_config has 'layers_block_type'T	is_hybridrW   r)   c                     dS )a3  Calculate shapes for Mamba's convolutional and state caches.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
            - temporal_state_shape: Shape for state space model cache
        Nr;   )r<   rW   s     r,   !get_mamba_state_shape_from_configz*IsHybrid.get_mamba_state_shape_from_config  s	     	r.   .c                     dS )a  Calculate copy-function callables for each Mamba state.

        Returns:
            A tuple of MambaStateCopyFunc callables that correspond, in order,
            to the Mamba states produced by the model. Each callable accepts
            (state, block_ids, cur_block_idx, num_accepted_tokens) and returns
            a MambaCopySpec describing the memory-copy parameters for prefix
            caching in align mode.
        Nr;   r<   s    r,   get_mamba_state_copy_funcz"IsHybrid.get_mamba_state_copy_func  s	     	r.   N)rQ   r   r   r   r  r   r   r   r   r"   r   r   r  r   r  r;   r.   r,   r  r    s         ) ) *.Ix&--- 
  
uS#Xc3m 44	5   [  
%0BC0G*H 
 
 
 [
 
 
r.   r  c                     d S rs   r;   r   s    r,   r  r        25#r.   c                     d S rs   r;   r   s    r,   r  r        >Acr.   c                 $    t          | dd          S )Nr  Fr   r   s    r,   r  r  "       5+u---r.   c                       e Zd ZU dZeee                  ed<   	 eed<   	 eed<   	 eed<   	 eed<   	 eed<   	 eed<   	 eed	<   	 eed
<   	 ee	j
                 ed<   	 dedededdfdZdededdfdZdS )MixtureOfExpertszA
    Check if the model is a mixture of experts (MoE) model.
    expert_weightsnum_moe_layersnum_expert_groupsnum_logical_expertsnum_physical_expertsnum_local_physical_expertsnum_routed_expertsnum_shared_expertsnum_redundant_experts
moe_layersexpert_load_viewlogical_to_physical_maplogical_replica_countr)   Nc                     t          | j                  D ]J\  }}| j                            |                                           |                    ||||           KdS )a  
        Register the EPLB state in the MoE model.

        Since these are views of the actual EPLB state, any changes made by
        the EPLB algorithm are automatically reflected in the model's behavior
        without requiring additional method calls to set new states.

        You should also collect model's `expert_weights` here instead of in
        the weight loader, since after initial weight loading, further
        processing like quantization may be applied to the weights.

        Args:
            expert_load_view: A view of the expert load metrics tensor.
            logical_to_physical_map: Mapping from logical to physical experts.
            logical_replica_count: Count of replicas for each logical expert.
        )moe_layer_idxr,  r-  r.  N)	enumerater+  r"  appendget_expert_weightsset_eplb_state)rA   r,  r-  r.  	layer_idxlayers         r,   r4  zMixtureOfExperts.set_eplb_stateQ  s}    , !*$/ : : 	 	Iu&&u'?'?'A'ABBB  '!1(?&;	 !    	 	r.   c                     d S rs   r;   )rA   r&  r'  s      r,    update_physical_experts_metadataz1MixtureOfExperts.update_physical_experts_metadataq  s	     sr.   )rQ   r   r   r   r   r   r   r   r   r   r   r4  r8  r;   r.   r,   r!  r!  (  s8          $HV$45555 -023 ####9114####+  "(  &	
 
   @! %( 
	     r.   r!  c                 V    t          | t                    ot          | dd          dk    S )Nr#  r   )ru   r!  rL   r   s    r,   is_mixture_of_expertsr:  x  s.    5*++W?OQR0S0SVW0Wr.   c                   6    e Zd ZU dZeed                  ed<   dS )HasNoOpsT	has_noopsN)rQ   r   r   r=  r   r   r   r;   r.   r,   r<  r<  ~  s+         )-Ix&-----r.   r<  c                     d S rs   r;   r   s    r,   r=  r=    r  r.   c                     d S rs   r;   r   s    r,   r=  r=    r  r.   c                 $    t          | dd          S )Nr=  Fr   r   s    r,   r=  r=    r  r.   c                   :    e Zd ZU dZdZeed                  ed<   dS )SupportsMambaPrefixCachingzmThe interface for models whose mamba layers support prefix caching.

    This is currently experimental.
    Tsupports_mamba_prefix_cachingN)rQ   r   r   r   rC  r   r   r   r;   r.   r,   rB  rB    s9          
 >B!8GDM#:AAAAAr.   rB  c                     d S rs   r;   r   s    r,   rC  rC    s	     *-r.   c                     d S rs   r;   r   s    r,   rC  rC    s	     03sr.   c                 $    t          | dd          S )NrC  Fr   r   s    r,   rC  rC    s     595AAAr.   c                   :    e Zd ZU dZdZeed                  ed<   dS )SupportsCrossEncodingzBThe interface required for all models that support cross encoding.Tsupports_cross_encodingN)rQ   r   r   r   rI  r   r   r   r;   r.   r,   rH  rH    s2         LL7;Xgdm4;;;;;r.   rH  c                     d S rs   r;   r   s    r,   rI  rI    r   r.   c                     d S rs   r;   r   s    r,   rI  rI    r   r.   c                 $    t          | dd          S )NrI  Fr   r   s    r,   _supports_cross_encodingrM    r   r.   c                 >    t          |           ot          |           S rs   )r    rM  r   s    r,   rI  rI    s      E""F'?'F'FFr.   c                        e Zd ZU dZdZeedz           ed<   dZee	e
ee
         f         dz           ed<   dZedz  ed<   def fdZededz  fd            Z xZS )	SupportsQuantz@The interface required for all models that support quantization.Nhf_to_vllm_mapperr   quant_configr)   c                    t                                          |           } | j        |i |}|U||_        |j        x}|j                            |           |j        $|j        j                            |j                   |S rs   )super__new___find_quant_configrR  rQ  apply_vllm_mapperr   update)r<   argsr?   instancerR  rQ  	__class__s         r,   rU  zSupportsQuant.__new__  s    77??3'' .s-t>v>>#$0H! &.%??!L%778IJJJ.:%<CC3   r.   c                      ddl m} t          |           t          |                                          z   }|D ]4}t	          ||          r	|j        c S t	          |t                    r|c S 5dS )z7Find quant config passed through model constructor argsr   r!   N)vllm.configr"   r   valuesru   rR  r   )rY  r?   r"   args_valuesargs        r,   rV  z SupportsQuant._find_quant_config  s     	+*****4jj4#8#88 	 	C#z** (''''#122 


 tr.   )rQ   r   r   r   rQ  r   r#   r   r   r   rv   r   rR  r   r   rU  staticmethodrV  __classcell__r[  s   @r,   rP  rP    s         JJ8<x 45<<<DHHT#tCy.%9D%@AHHH.2L$t+222      & /AD/H    \    r.   rP  c                       e Zd ZU dZeeeef                  ed<   dZee	d                  ed<   dZ
ee         ed<   	 dZee         ed<   	  fdZed	ej        d
edededz  de	d         dededz  defd            Zedeeef         fd            Zededz  dedz  fd            Zedede	d         defd            Zeded
edededz  fd            Z xZS )SupportsTranscriptionzAThe interface required for all models that support transcription.supported_languagesTsupports_transcriptionFsupports_transcription_onlysupports_segment_timestampc           
      6    t                      j        di | t          | j                  t          t	          j                              z
  }|rGt          | j         dt          |           dt          t	          j                                         d S )Nz6.supported_languages contains invalid language codes: z
. Valid choices are: r;   )	rT  __init_subclass__r   rf  r   keysr+   rQ   sorted)r<   r?   invalidr[  s      r,   rk  z'SupportsTranscription.__init_subclass__  s    !!++F+++ c-..Y^5E5E1F1FF 	< A A#)'??A A&,Y^-=-=&>&>A A  	 	r.   audio
stt_configra   languageN	task_type)
transcribe	translaterequest_promptto_languager)   c                     dS )zGet the prompt for the ASR model.
        The model has control over the construction, as long as it
        returns a valid PromptType.Nr;   )r<   ro  rp  ra   rq  rr  ru  rv  s           r,   get_generation_promptz+SupportsTranscription.get_generation_prompt  s	     	r.   c                 B      fdt          j                    D             S )Nc                 .    i | ]\  }}|j         v||S r;   )rf  )rG   kvr<   s      r,   
<dictcomp>z=SupportsTranscription.get_other_languages.<locals>.<dictcomp>&  s,    WWWAas?V6V6V16V6V6Vr.   )r   r~   r  s   `r,   get_other_languagesz)SupportsTranscription.get_other_languages#  s(     XWWW!2!2WWWWr.   c           	      L   |	|| j         v r|S ||                                 v rHt                              d|| j        t          | j                                                              |S t          d|dt          | j                                                    d          )z
        Ensure the language specified in the transcription request
        is a valid ISO 639-1 language code. If the request language is
        valid, but not natively supported by the model, trigger a
        warning (but not an exception).
        NzbLanguage %r is not natively supported by %s; results may be less accurate. Supported languages: %rzUnsupported language: z.  Must be one of rE   )rf  r~  r   r   rQ   r   rl  r+   )r<   rq  s     r,   validate_languagez'SupportsTranscription.validate_language(  s     x3+BBBO002222NNHS,113344   O; ; ;/446677; ; ;  r.   c                     dS )z0Get the speech to text config for the ASR model.Nr;   )r<   ra   rr  s      r,   get_speech_to_text_configz/SupportsTranscription.get_speech_to_text_configA  r>   r.   audio_duration_sc                     dS )z
        Map from audio duration to number of audio tokens produced by the ASR
        model, without running a forward pass.
        This is used for estimating the amount of processing for this audio.
        Nr;   )r<   r  rp  ra   s       r,   get_num_audio_tokensz*SupportsTranscription.get_num_audio_tokensH  s	     tr.   )rQ   r   r   r   r   r   rv   r   rg  r   rh  r   ri  rk  r   npndarrayr   r   r   rx  r~  r  r  floatr   r  rb  rc  s   @r,   re  re    s        KK "'#s("344446:HWT]3:::27$777 27666
 
 
 
 
 z ' "	
 * 45  4Z 
   [ XGCH$5 X X X [X t d
    [0 &3:;T3U	   [  ' "	
 
t   [    r.   re  c                     d S rs   r;   r   s    r,   rg  rg  W  r   r.   c                     d S rs   r;   r   s    r,   rg  rg  ]  s    LOCr.   c                 $    t          | dd          S )Nrg  Fr   r   s    r,   rg  rg  a  s     52E:::r.   c                   2    e Zd ZU dZdZeed<   	 dZeed<   dS )SupportsEagleBasezHBase interface for models that support EAGLE-based speculative decoding.Fhas_own_lm_headhas_own_embed_tokensN)rQ   r   r   r   r  r   r   r  r;   r.   r,   r  r  g  sD         RR!OT!!! "'$&&& r.   r  c                     d S rs   r;   r   s    r,   supports_any_eagler  v  s    PSPSr.   c                     d S rs   r;   r   s    r,   r  r  z  s    DGCr.   c                 >    t          |           pt          |           S )z7Check if model supports any EAGLE variant (1, 2, or 3).)supports_eaglesupports_eagle3r   s    r,   r  r  ~  s     %  :OE$:$::r.   c                   :    e Zd ZU dZdZeed                  ed<   dS )SupportsEaglez\The interface required for models that support
    EAGLE-1 and EAGLE-2 speculative decoding.Tr  N)rQ   r   r   r   r  r   r   r   r;   r.   r,   r  r    s;         1 1 /3NHWT]+222 r.   r  c                     d S rs   r;   r   s    r,   r  r        HKr.   c                     d S rs   r;   r   s    r,   r  r    r   r.   c                 ,    t          | t                    S rs   )ru   r  r   s    r,   r  r         e]+++r.   c                   x    e Zd ZU dZdZeed                  ed<   	 dee	df         ddfdZ
dee	df         fd	ZdS )
SupportsEagle3zPThe interface required for models that support
    EAGLE-3 speculative decoding.Tr  layers.r)   Nc                     dS )z
        Set which layers should output auxiliary
        hidden states for EAGLE-3.

        Args:
            layers: Tuple of layer indices that should output auxiliary
                hidden states.
        Nr;   )rA   r  s     r,   set_aux_hidden_state_layersz*SupportsEagle3.set_aux_hidden_state_layers  s	     	r.   c                     dS )z
        Get the layer indices that should output auxiliary hidden states
        for EAGLE-3.

        Returns:
            Tuple of layer indices for auxiliary hidden state outputs.
        Nr;   )rA   s    r,   "get_eagle3_aux_hidden_state_layersz1SupportsEagle3.get_eagle3_aux_hidden_state_layers  s	     	r.   )rQ   r   r   r   r  r   r   r   r   r   r  r  r;   r.   r,   r  r    s         % % 04OXgdm,333	%S/ 	d 	 	 	 	E#s(O      r.   r  c                     d S rs   r;   r   s    r,   r  r        JM#r.   c                     d S rs   r;   r   s    r,   r  r    r  r.   c                 ,    t          | t                    S rs   )ru   r  r   s    r,   r  r         e^,,,r.   c                       e Zd ZU dZdZeed                  ed<   	 dee	         ded         de
ej        e	f         fdZd	S )
SupportsMRoPEz:The interface required for all models that support M-RoPE.Tsupports_mropeinput_tokensmm_featuresr$   r)   c                     dS )aX  
        Get M-RoPE input positions and delta value for this specific model.

        This method should be implemented by each model that supports M-RoPE
        to provide model-specific logic for computing input positions.

        Args:
            input_tokens: List of input token IDs
            mm_features: Information about each multi-modal data item

        Returns:
            Tuple of `(llm_positions, mrope_position_delta)`
            - llm_positions: Tensor of shape `[3, num_tokens]` with T/H/W positions
            - mrope_position_delta: Delta for position calculations
        Nr;   rA   r  r  s      r,   get_mrope_input_positionsz'SupportsMRoPE.get_mrope_input_positions  s	    ( 	r.   N)rQ   r   r   r   r  r   r   r   r   r   r   r   r   r  r;   r.   r,   r  r    s         DD.2NHWT]+2223i 12 
u|S 	!	     r.   r  c                     d S rs   r;   r   s    r,   r  r    r  r.   c                     d S rs   r;   r   s    r,   r  r    r   r.   c                 ,    t          | t                    S rs   )ru   r  r   s    r,   r  r    r  r.   c                   r    e Zd ZU dZdZeed                  ed<   	 dee	         ded         de
j        fdZd	S )
SupportsXDRoPEz;The interface required for all models that support XD-RoPE.Tsupports_xdroper  r  r$   r)   c                     dS )a   
        Get XD-RoPE input positions and delta value for this specific model.

        This method should be implemented by each model that supports XD-RoPE
        to provide model-specific logic for computing input positions.

        Args:
            input_tokens: List of input token IDs
            mm_features: Information about each multi-modal data item

        Returns:
            llm_positions: Tensor of shape `[xdrope_dim, num_tokens]` with
            4D(P/W/H/T) or 3D(W/H/T) positions.
        Nr;   r  s      r,   get_xdrope_input_positionsz)SupportsXDRoPE.get_xdrope_input_positions  s	    & 	r.   N)rQ   r   r   r   r  r   r   r   r   r   r   r   r  r;   r.   r,   r  r    sv         EE/3OXgdm,3333i 12 
	     r.   r  c                     d S rs   r;   r   s    r,   r  r  +  r  r.   c                     d S rs   r;   r   s    r,   r  r  /  r  r.   c                 ,    t          | t                    S rs   )ru   r  r   s    r,   r  r  3  r  r.   )tcollections.abcr   r   r   r   
contextlibr   r   r	   typingr
   r   r   r   r   r   r   numpyr  r   torch.nnr   r   0transformers.models.whisper.tokenization_whisperr   typing_extensionsr   r   r]  r   r   vllm.inputsr   vllm.inputs.datar   vllm.loggerr   ,vllm.model_executor.layers.mamba.mamba_utilsr   'vllm.model_executor.layers.quantizationr   vllm.utils.collection_utilsr   vllm.utils.func_utilsr   interfaces_baser   r    r"    vllm.model_executor.models.utilsr#   vllm.multimodal.inputsr$   vllm.multimodal.registryr%   vllm.sequencer&   r   rQ   r   r   r   r'   r   r-   r   r   rK   r0   r   rP   r1   r   r2   r4   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r!  r:  r<  r=  rB  rC  rH  rI  rM  rP  re  rg  r  r  r  r  r  r  r  r  r  r  r;   r.   r,   <module>r     sv   I H H H H H H H H H H H H = = = = = = = = = =                                   F F F F F F * * * * * * * * 7 7 7 7 7 7 7 7 $ $ $ $ $ $ ' ' ' ' ' ' # # # # # # K K K K K K F F F F F F 5 5 5 5 5 5 - - - - - - 8 8 8 8 8 8 8 8 !&&&&&&>>>>>><<<<<<<<<<<<1111111JM"  	X		"&v,"7%:L"L i L L L&4- F    " 7DI!5688  j
 j
 j
 j
 j
 j
 j
 j
Z	 # # # # # # # #L 
 UtF| Ut<N7O0P U U U 
 U 
 Iv I&1C*D I I I 
 I8<& 8D#$%/A(BB8 8 8 8Gd6lV.C G G G G G>T&\F%: >t > > > >=tF|f/D = = = = = 
2<2D*+,2 2 2 
2
 
 Xv X&9R2S X X X 
 X@<& @D*+,v6O/PP@ @ @ @     H   6 
.<.D&'(. . . 
.
 
 P6 Pf5J.K P P P 
 P<<& <D&'(62G+HH< < < < 6 6 6 6 68 6 6 6* & & & & & & & & 
 If I&l1C*D I I I 
 I 
 = =F<$8 = = = 
 =<& D&"66   >+$v,/ +D + + + + ! ! ! ! ! ! ! !L + + + + +h + + +" 
 EtF| EtJ/?(@ E E E 
 E 
 9v 9&"4 9 9 9 
 9!4<& !4	F4
#$$vj'99!4 !4 !4 !4H)4<&#8 )T ) ) ) )>Vv 5 >$ > > > >     H    
 @6 @f]&; @ @ @ 
 @ 
 L4< LF43F,G L L L 
 L4<& 4D 6-#884 4 4 4 	 	 	 	 	h 	 	 	 
 DV D(? D D D 
 D 
 PT&\ PfT/5J.K P P P 
 P6<& 6D!"VO%<<6 6 6 6 ' ' ' ' 'x ' ' 'T 
 5V 5x 0 5 5 5 
 5 
 AT&\ AfT(^&< A A A 
 A.<& .DNfX... . . . L L L L Lx L L L^ F3C,D     . . . . .x . . . 
 5V 5x 0 5 5 5 
 5 
 AT&\ AfT(^&< A A A 
 A.<& .DNfX... . . . B B B B B B B B 
--&'- - - 
-
 
3<3D+,-3 3 3 
3
B<& BD+,-7Q0RRB B B B < < < < <H < < < 
.<.D&'(. . . 
.
 
 P6 Pf5J.K P P P 
 P<<& <D&'(62G+HH< < < <G<& GD&'(62G+HHG G G G' ' ' ' ' ' ' 'T _ _ _ _ _H _ _ _D 
.<.D&'(. . . 
.
 
 O& OV4I-J O O O 
 O;<& ;D&'(62G+HH; ; ; ;         
 Sd6l Svd;L6M/N S S S 
 S 
 Gf G0A)B G G G 
 G;<& ;D"#$v.?'@@; ; ; ;     %x    
 K$v, K6$}2E+F K K K 
 K 
 ?& ?VM%: ? ? ? 
 ?,<& ,D 6-#88, , , , ! ! ! ! !& ! ! !H 
 M4< MF43G,H M M M 
 M 
 A6 Af^&< A A A 
 A-<& -D !F>$::- - - -          H      F 
 K$v, K6$}2E+F K K K 
 K 
 ?& ?VM%: ? ? ? 
 ?,<& ,D 6-#88, , , ,     X   D 
 M4< MF43G,H M M M 
 M 
 A6 Af^&< A A A 
 A-<& -D !F>$::- - - - - -r.   