
    .`iY              	       p   U d dl mZmZ d dlmZ d dlmZmZmZm	Z	 d dl
mZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d	d
lmZmZmZ d	dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d	dl+m,Z,  ed          Z- ed          Z.erd dl/m0Z1 n ed e2            d          Z1 G d deee-e.f                   Z3 G d de3e	e-         e-f                   Z4	 d5dej5        de6de7dz  ddfdZ8 G d de3ej5        e9ej5                 z  ej5        f                   Z: G d de3ee6ej5        f         ee6ej5        f         f                   Z; G d de4e"                   Z< G d  d!e:          Z= G d" d#e          Z> G d$ d%e4e#                   Z? G d& d'e:          Z@ G d( d)e4e$                   ZA G d* d+e:          ZB G d, d-e4e                   ZC ed.e3eef         /          ZD G d0 d1ee6e3eef         f                   ZEee&e         ge3eef         dz  f         ZFeeGd2<    G d3 d4          ZHdS )6    )ABCabstractmethod)UserDict)CallableIteratorMappingSequence)TYPE_CHECKINGAnyGenericLiteral
NamedTuple	TypeAlias	TypeGuardTypeVarN)assert_never)
is_list_of)
LazyLoader   )AudioResampler	AudioSpecnormalize_audio)
	AudioItemHfAudioItemHfImageItemHfVideoItem	ImageItemModalityDataMultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems	VideoItem)MediaWithBytes_T_IPILImagez	PIL.Imagec                   H    e Zd ZdZdededdf fdZdefdZdefdZ	d	ede
fd
Zerdee
         fdZedefd            Zed	ede
fd            Zdee
         fdZd	edefdZdee         fdZedeeef         fd            Zedeeef         fd            Z xZS )ModalityDataItemszy
    Represents data items for a modality in
    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
    datamodalityreturnNc                 d    t                                                       || _        || _        d S N)super__init__r)   r*   )selfr)   r*   	__class__s      i/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/multimodal/parse.pyr/   zModalityDataItems.__init__7   s+    	     c                 `    t          |           j         d| j        dt          |            dS )Nz
(modality=z, len=))type__name__r*   lenr0   s    r2   __repr__zModalityDataItems.__repr__=   s2    t**%TTTTD		TTTTr3   c                 *    |                                  S r-   	get_countr9   s    r2   __len__zModalityDataItems.__len__@   s    ~~r3   indexc                 ,    |                      |          S r-   getr0   r?   s     r2   __getitem__zModalityDataItems.__getitem__C       xxr3   c                     d S r-    r9   s    r2   __iter__zModalityDataItems.__iter__H   s      r3   c                     t           )zGet the number of data items.NotImplementedErrorr9   s    r2   r=   zModalityDataItems.get_countJ   
     "!r3   c                     t           )zGet a data item by its index.rJ   rC   s     r2   rB   zModalityDataItems.getO   rL   r3   c                 ^      fdt                                                     D             S )zGet all data items.c                 :    g | ]}                     |          S rG   rA   .0idxr0   s     r2   
<listcomp>z-ModalityDataItems.get_all.<locals>.<listcomp>V   s#    AAA#AAAr3   ranger=   r9   s   `r2   get_allzModalityDataItems.get_allT   s/    AAAAt~~/?/?)@)@AAAAr3   c                 ,    |                      |          S r-   rA   rC   s     r2   get_item_for_hashz#ModalityDataItems.get_item_for_hashX   rE   r3   c                 ^      fdt                                                     D             S )Nc                 :    g | ]}                     |          S rG   )rX   rP   s     r2   rS   z<ModalityDataItems.get_all_items_for_hash.<locals>.<listcomp>\   s'    OOO&&s++OOOr3   rT   r9   s   `r2   get_all_items_for_hashz(ModalityDataItems.get_all_items_for_hash[   s/    OOOOuT^^=M=M7N7NOOOOr3   c                     t           )z)Get the data to pass to the HF processor.rJ   r9   s    r2   get_processor_dataz$ModalityDataItems.get_processor_data^   rL   r3   c                     t           )z+Get the data to pass directly to the model.rJ   r9   s    r2   get_passthrough_dataz&ModalityDataItems.get_passthrough_datac   rL   r3   )r7   
__module____qualname____doc__r$   strr/   r:   intr>   r%   rD   r
   r   rH   r   r=   rB   listrV   objectrX   r[   r   r]   r_   __classcell__r1   s   @r2   r(   r(   1   s        
!R !3 !4 ! ! ! ! ! !U# U U U U                0/hrl////"3 " " " ^" " " " " " ^"Bb B B B Bs v    PV P P P P "GCK$8 " " " ^" "gc6k&: " " " ^" " " " "r3   r(   c                       e Zd ZdZdeee         z  defdZdefdZdedefdZ	dedeee         z  fdZ
deeef         fd	Zdeeef         fd
ZdS )ProcessorBatchItemsz6Base class for data items that are arranged in a list.itemr+   c                 >    t          |t                    r|j        n|S z&Extract media from wrapper if present.
isinstancer#   mediar0   rk   s     r2   _unwrapzProcessorBatchItems._unwrapl   s    'n==Gtzz4Gr3   c                 *    t          | j                  S r-   r8   r)   r9   s    r2   r=   zProcessorBatchItems.get_countp       49~~r3   r?   c                 B    |                      | j        |                   S r-   rr   r)   rC   s     r2   rB   zProcessorBatchItems.gets       ||DIe,---r3   c                     | j         |         S r-   r)   rC   s     r2   rX   z%ProcessorBatchItems.get_item_for_hashv   s    yr3   c                 >    | j          d|                                 iS )Ns)r*   rV   r9   s    r2   r]   z&ProcessorBatchItems.get_processor_dataz   s     =###T\\^^44r3   c                     i S r-   rG   r9   s    r2   r_   z(ProcessorBatchItems.get_passthrough_data}       	r3   N)r7   r`   ra   rb   r$   r#   rr   rd   r=   rB   rX   r   rc   rf   r]   r_   rG   r3   r2   rj   rj   i   s        @@HB!33 H H H H H3    . . . . . . s  rN24F/F        5GCK$8 5 5 5 5gc6k&:      r3   rj   tensorr*   r?   r+   c                     | j         dk     s| j         dk    rM|d| dnd}t          |                                 d| d| j          d	t          | j                             dS )
a  Validate tensor ndim for multimodal embeddings.

    Single embeddings should be 2D (seq_len, hidden_size).
    Batched embeddings should be 3D (batch, seq_len, hidden_size).

    Args:
        tensor: The tensor to validate.
        modality: The modality name for error messages (e.g., "image", "audio").
        index: Optional index for list items, included in error messages.
          Nz [] z
 embeddingzL must be 2D (seq_len, hidden_size) or 3D (batch, seq_len, hidden_size), got D tensor with shape )ndim
ValueError
capitalizetupleshape)r   r*   r?   idx_strs       r2   validate_embedding_ndimr      s     {Q&+//#(#4-u----"""$$ J J J J;J J49&,4G4GJ J
 
 	
 */r3   c            	       *    e Zd ZdZ	 ddej        eej                 z  dededz  ddf fdZ	ddZ
deddfd	Zd
ej        eej                 z  dej        fdZdefdZdedej        fdZdeeef         fdZdeeef         fdZdedefdZ xZS )EmbeddingItemsz
    Base class for data items that are expressed as a batched embedding tensor,
    or a list of embedding tensors (one per item).
    Nr)   r*   expected_hidden_sizer+   c                     t                                          ||           |                                  ||                     |           d S d S r-   )r.   r/   _validate_ndim_validate_hidden_size)r0   r)   r*   r   r1   s       r2   r/   zEmbeddingItems.__init__   s^     	x((( 	  +&&';<<<<< ,+r3   c                 V   t          | j        t          j                  rt	          | j        | j                   dS t          | j                  D ]X\  }}|j        dk    rHt          | j        	                                 d| d|j         dt          |j                             YdS )z=Validate that embedding tensors have correct ndim (2D or 3D).r    embedding [z)] must be 2D (seq_len, hidden_size), got r   N)ro   r)   torchTensorr   r*   	enumerater   r   r   r   r   )r0   rR   r   s      r2   r   zEmbeddingItems._validate_ndim   s    di.. 
	#DIt}=====  )33  V;!##$=3355 < <3 < <:@+< <&+FL&9&9< <   $ r3   c                    t          | j        t          j                  rb| j        j        d         }||k    rHt          | j                                         d| d| dt          | j        j                             dS t          | j                  D ]^\  }}|j        d         }||k    rFt          | j                                         d| d| d| dt          |j                   	          _dS )a8  Validate that embedding hidden dimension matches expected size.

        This validates hidden dimensions to prevent vulnerabilities: Embeddings
        with correct ndim but wrong hidden dimension could bypass initial
        checks and cause crashes during model inference when dimensions don't match.
        z* embedding hidden dimension mismatch: got z, but model expects z. Embedding shape: r   z!] hidden dimension mismatch: got N)
ro   r)   r   r   r   r   r*   r   r   r   )r0   r   actual_hidden_sizerR   r   s        r2   r   z$EmbeddingItems._validate_hidden_size   sb    di.. 	!%!4!%999 }//11 Y Y%7Y Y+Y Y@Edio@V@VY Y   :9  )33  V%+\"%5"%)===$=3355 B B3 B B3EB B#7B B -2&,,?,?B B   > r3   rk   c                 >    t          |t                    r|j        n|S rm   rn   rq   s     r2   rr   zEmbeddingItems._unwrap   s     (n==Gtzz4Gr3   c                 *    t          | j                  S r-   rt   r9   s    r2   r=   zEmbeddingItems.get_count   ru   r3   r?   c                 B    |                      | j        |                   S r-   rw   rC   s     r2   rB   zEmbeddingItems.get   rx   r3   c                     i S r-   rG   r9   s    r2   r]   z!EmbeddingItems.get_processor_data   r~   r3   c                 $    | j          d| j        iS )N_embeds)r*   r)   r9   s    r2   r_   z#EmbeddingItems.get_passthrough_data   s    =)))4955r3   item_idxc                 F    t          |                     |                    S r-   r8   rB   r0   r   s     r2   get_feature_sizezEmbeddingItems.get_feature_size       488H%%&&&r3   r-   )r+   N)r7   r`   ra   rb   r   r   re   rc   rd   r/   r   r   r#   rr   r=   rB   r   rf   r]   r_   r   rg   rh   s   @r2   r   r      s         ,0	= =lT%,//= = "Dj	=
 
= = = = = =   # $    8HL>%,#??H	H H H H3    . . . . . .GCK$8    6gc6k&: 6 6 6 6' ' ' ' ' ' ' ' ' 'r3   r   c                       e Zd ZdZdeeej        f         dedee         de	eeej        f         geee
f         f         ddf
 fdZdefd	Zd
edeeej        f         fdZdeeef         fdZdeeef         fdZ xZS )DictEmbeddingItemsz
    Base class for data items that are expressed as a dictionary of tensors.

    Usually, the dictionary keys correspond to the outputs of HF processor.
    r)   r*   required_fieldsfields_factoryr+   Nc                 .   ddl m} t                                          ||           ||                                z
  }|r8t          |                                          }d| d| }t          |           ||          }	||	                                z
  }
|
r8t          |	                                          }d|d|}t          |          |	| _        || _        t          j
         |t          |                    |	          | _        d S )Nr   )BatchFeaturez$The data should contain the fields: z%, but only found the following keys: zrequired_fields=z should be a subset of fields=)%transformers.feature_extraction_utilsr   r.   r/   keyssetr   fields_configr   r!   from_hf_inputsdict_kwargs)r0   r)   r*   r   r   r   missing_required_data_keys	data_keysmsgr   missing_required_fieldsfieldsr1   s               r2   r/   zDictEmbeddingItems.__init__   s>    	GFFFFFx(((%4tyy{{%B"% 	"DIIKK((IB B B6?B B  S//!&t,,"1M4F4F4H4H"H" 	"++--..FG_GGfGGCS//!*.,;Ld$$
 
r3   c                 @    t          | j        | j                           S r-   )r8   r   r*   r9   s    r2   r=   zDictEmbeddingItems.get_count  s    4<.///r3   r?   c                 V    | j         | j                 |                                         S r-   )r   r*   get_datarC   s     r2   rB   zDictEmbeddingItems.get"  s"    |DM*51::<<<r3   c                     i S r-   rG   r9   s    r2   r]   z%DictEmbeddingItems.get_processor_data%  r~   r3   c                     | j         S r-   rz   r9   s    r2   r_   z'DictEmbeddingItems.get_passthrough_data(  s
    yr3   )r7   r`   ra   rb   r   rc   r   r   r   r   r    r/   rd   r=   rB   rf   r]   r_   rg   rh   s   @r2   r   r      sC        $
c5<'($
 $
 S	$

 !S%,&'(C../1
$
 
$
 $
 $
 $
 $
 $
L03 0 0 0 0= =el):!; = = = =GCK$8    gc6k&:        r3   r   c                   J     e Zd Zdee         dz  ddf fdZdedefdZ xZS )AudioProcessorItemsr)   Nr+   c                 V    |d g}t                                          |d           d S Naudior.   r/   r0   r)   r1   s     r2   r/   zAudioProcessorItems.__init__-  /    <6Dw'''''r3   r   c                 J    |                      |          }t          |          S r-   )rB   r8   )r0   r   r   s      r2   get_audio_lengthz$AudioProcessorItems.get_audio_length2  s    ""5zzr3   )	r7   r`   ra   r	   r   r/   rd   r   rg   rh   s   @r2   r   r   ,  sw        (Xk2T9 (d ( ( ( ( ( (
         r3   r   c                   \     e Zd Z	 ddej        eej                 z  dedz  ddf fdZ xZS )AudioEmbeddingItemsNr)   r   r+   c                 N    t                                          |d|           d S r   r   r0   r)   r   r1   s      r2   r/   zAudioEmbeddingItems.__init__8  (    
 	w(<=====r3   r-   	r7   r`   ra   r   r   re   rd   r/   rg   rh   s   @r2   r   r   7  u         ,0> >lT%,//> "Dj> 
	> > > > > > > > > >r3   r   c                   $    e Zd ZU eed<   eed<   dS )	ImageSizewidthheightN)r7   r`   ra   rd   __annotations__rG   r3   r2   r   r   @  s"         JJJKKKKKr3   r   c                   J     e Zd Zdee         dz  ddf fdZdedefdZ xZ	S )ImageProcessorItemsr)   Nr+   c                 V    |d g}t                                          |d           d S Nimager   r   s     r2   r/   zImageProcessorItems.__init__F  r   r3   r   c                     |                      |          }t          |t          j                  rt	          |j         S t          |t          j        t          j	        f          r|j
        \  }}}t	          ||          S t          |           d S r-   rB   ro   r&   Imager   sizenpndarrayr   r   r   r   r0   r   r   _hws         r2   get_image_sizez"ImageProcessorItems.get_image_sizeK  s~    ""eX^,, 	*ej))ebj%,788 	#kGAq!Q??"Ur3   )
r7   r`   ra   r	   r   r/   rd   r   r   rg   rh   s   @r2   r   r   E  sw        (Xk2T9 (d ( ( ( ( ( (
	s 	y 	 	 	 	 	 	 	 	r3   r   c                   \     e Zd Z	 ddej        eej                 z  dedz  ddf fdZ xZS )ImageEmbeddingItemsNr)   r   r+   c                 N    t                                          |d|           d S r   r   r   s      r2   r/   zImageEmbeddingItems.__init__X  r   r3   r-   r   rh   s   @r2   r   r   W  r   r3   r   c            	            e Zd Z	 d	dee         dz  deeef         eeeef         dz           z  dz  ddf fdZ	de
de
fdZde
defdZ xZS )
VideoProcessorItemsNr)   metadatar+   c                 d    |d g}t                                          |d           || _        d S Nvideo)r.   r/   r   )r0   r)   r   r1   s      r2   r/   zVideoProcessorItems.__init__a  s6    
 <6Dw''' r3   r   c                 F    t          |                     |                    S r-   r   r   s     r2   get_num_framesz"VideoProcessorItems.get_num_framesk  r   r3   c                 ,   |                      |          d         }t          |t          j                  rt	          |j         S t          |t          j        t          j	        f          r|j
        \  }}}t	          ||          S t          |           d S )Nr   r   r   s         r2   get_frame_sizez"VideoProcessorItems.get_frame_sizen  s    ""1%eX^,, 	*ej))ebj%,788 	#kGAq!Q??"Ur3   r-   )r7   r`   ra   r	   r   r   rc   r   re   r/   rd   r   r   r   rg   rh   s   @r2   r   r   `  s         IM! !{#d*! sCx.4S#X(=#>>E! 
	! ! ! ! ! !'s 's ' ' ' '	s 	y 	 	 	 	 	 	 	 	r3   r   c                   \     e Zd Z	 ddej        eej                 z  dedz  ddf fdZ xZS )VideoEmbeddingItemsNr)   r   r+   c                 N    t                                          |d|           d S r   r   r   s      r2   r/   zVideoEmbeddingItems.__init__{  r   r3   r-   r   rh   s   @r2   r   r   z  r   r3   r   c                   8     e Zd ZdZdee         ddf fdZ xZS )VisionChunkProcessorItemszCProcessor items for vision chunks (unified image and video chunks).r)   r+   Nc                 L    t                                          |d           d S )Nvision_chunkr   r   s     r2   r/   z"VisionChunkProcessorItems.__init__  s#    ~.....r3   )r7   r`   ra   rb   r	   r   r/   rg   rh   s   @r2   r   r     sU        MM/Xc] /t / / / / / / / / / /r3   r   _D)boundc                       e Zd ZdZdddededefdZdeeef         fdZ	ded	e
e         ee
e         d
f         z  defdZdS )MultiModalDataItemsz
    As [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict], but
    normalized such that each entry corresponds to a list.
    T)strictr*   r   r+   c                    || vr:|r6t          |                                           }t          d|d|           dS | |                                         S )z
        Get the number of data items belonging to a modality.

        If `strict=False`, return `0` instead of raising [`KeyError`][]
        even if the modality is not found.
        	Modality " not found. Available modalities: r   )r   r   KeyErrorr=   )r0   r*   r   available_modalitiess       r2   r=   zMultiModalDataItems.get_count  s     4 '*499;;'7'7$D D D-AD D  
 1H~'')))r3   c                 >    d |                                  D             S )z3Get the number of items belonging to each modality.c                 >    i | ]\  }}||                                 S rG   r<   )rQ   mitemss      r2   
<dictcomp>z6MultiModalDataItems.get_all_counts.<locals>.<dictcomp>  s(    BBBE5??$$BBBr3   )r  r9   s    r2   get_all_countsz"MultiModalDataItems.get_all_counts  s    BBTZZ\\BBBBr3   typ.c           
          || vr6t          |                                           }t          d|d|           | |         }t          ||          s%t	          d|d| dt          |                     |S )zs
        Get the data items belonging to a modality,
        requiring that they belong to a certain type.
        r   r   z(Invalid type of data items for modality=z. Expected type: z, but found type: )r   r   r   ro   	TypeErrorr6   )r0   r*   r  r  r  s        r2   	get_itemszMultiModalDataItems.get_items  s     4#&tyy{{#3#3 @H @ @)=@ @  
 X%%% 	-( - -"%- -#E{{- -   r3   N)r7   r`   ra   rb   rc   boolrd   r=   r   r  r6   r   r   r  rG   r3   r2   r   r     s         
 :> * * *# *$ *# * * * *&CS 1 C C C C "Xd2hm,, 
	     r3   r   ModalityDataParserc                   :    e Zd ZdZdddddddedz  dedz  ded	         d
ededz  ddf fdZe	de
deej        eej                 z           fd            Zde
ded         fdZdedeej        edz  f         fdZdedeej        eeef         dz  f         fdZdee         deeef         dz  fdZdee         deeef         dz  fdZdee         deeef         dz  fdZ dee         deeef         dz  fdZ!de"ee#f         fdZ$de%de&fdZ' xZ(S )MultiModalDataParsera  
    Parses [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
    into [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].

    Args:
        target_sr (float, optional): Enables automatic resampling of audio
            items to the model's expected sampling rate.
        target_channels (int, optional): Target number of audio channels.
            If provided, normalizes audio to this many channels (e.g., 1 for mono).
            If None, audio channels are passed through unchanged.
        expected_hidden_size (int, optional): Expected hidden dimension for
            embedding inputs. If provided, validates that user-supplied
            embeddings have the correct hidden size to prevent crashes
            during model inference.
    NlibrosaF)	target_srtarget_channelsaudio_resample_methodvideo_needs_metadatar   r  r  r  )r  scipyr  r   r+   c                    t                                                       t          ||          | _        || _        || _        || _        d S )N)r  method)r.   r/   r   audio_resamplerr  r  r   )r0   r  r  r  r  r   r1   s         r2   r/   zMultiModalDataParser.__init__  sX     	-( 
  
  
  /$8!$8!!!r3   r)   c                     t          |t          j                  r|j        dk    S t	          |t          j                  r|d         j        dk    S dS )Nr   r   r   F)ro   r   r   r   r   )clsr)   s     r2   is_embeddingsz"MultiModalDataParser.is_embeddings  sN     dEL)) 	"9>!dEL)) 	%7<1$$ur3   c                     t          |t                    rt          |          dk    S t          |t          j        t
          j        f          r|j        dk    S dS )Nr   F)ro   re   r8   r   r   r   r   r   r0   r)   s     r2   	_is_emptyzMultiModalDataParser._is_empty  sP    dD!! 	"t99>!dRZ677 	"9>!ur3   r   c                 D   t          |t                    r|S t          |t                    rt          j        |          d fS t          |t          j                  r|d fS t          |t          j                  r|                                d fS t          |           d S r-   
ro   r   re   r   arrayr   r   r   numpyr   )r0   r   s     r2   _get_audio_with_srz'MultiModalDataParser._get_audio_with_sr       eU## 	LeT"" 	)8E??D((eRZ(( 	$;eU\** 	';;==$&&Ur3   r   c                 D   t          |t                    r|S t          |t                    rt          j        |          d fS t          |t          j                  r|d fS t          |t          j                  r|                                d fS t          |           d S r-   r   )r0   r   s     r2   _get_video_with_metadataz-MultiModalDataParser._get_video_with_metadata  r$  r3   c                    |t          d           S |                     |          s0t          |t                    r|                     |d                   rd S |                     |          rt          || j                  S t          |t                    sFt          |t          j
        t          j        f          r|j        dk    st          |t                    r|g}n5t          |t          j
        t          j        f          rd |D             }n|}t          t          j
                             }|D ]|}|                     |          \  }}||}n| j                            ||          }| j        %t'          | j                  }t)          ||          }|                    |           }t          |          S )Nr   r   c                     g | ]}|S rG   rG   rQ   elems     r2   rS   z:MultiModalDataParser._parse_audio_data.<locals>.<listcomp>8      0004$000r3   )orig_sr)r  )r   r  ro   r   r  r   r   r   floatr   r   r   r   r   re   r#  r  resampler  r   r   append)	r0   r)   
data_items
new_audios	data_itemr   r,  	new_audiospecs	            r2   _parse_audio_dataz&MultiModalDataParser._parse_audio_data  s    <&t,,, >>$ 	tU##	(,tAw(?(?	 4d## 	H&tT-FGGG tU##
	$U\ :;;
	 	Q$&&  JJrz5<899 	004000JJJ"*%''
# 	) 	)I!44Y??NE7!		 099%9QQ	 #/ 1EFFF+It<<	i((((":...r3   c                    |t          d           S |                     |          rd S |                     |          rt          || j                  S t          |t          j        t          f          s1t          |t          j
        t          j        f          r|j        dk    r|g}n5t          |t          j
        t          j        f          rd |D             }n|}t          |          S )Nr   c                     g | ]}|S rG   rG   r)  s     r2   rS   z:MultiModalDataParser._parse_image_data.<locals>.<listcomp>a  r+  r3   )r   r  r  r   r   ro   r&   r   r#   r   r   r   r   r   )r0   r)   r0  s      r2   _parse_image_dataz&MultiModalDataParser._parse_image_dataM  s     <&t,,,>>$ 	4d## 	H&tT-FGGG thnn=>>		$U\ :;;		 	QJJrz5<899 	004000JJJ":...r3   c                    |t          d           S |                     |          rd S |                     |          rt          || j                  S t          |t          j                  s1t          |t          j
        t          j        f          r|j        dk    r|g}nat          |t          j
        t          j        f          rd |D             }n.t          |t                    rt          |          dk    r|g}n|}t!          t          t          j
        t"          t$          t&          f         d z  f                              }g }|D ]t}|                     |          \  }}| j        r>|t-          d          |                    ||f           |                    |           _|                    |           u| j        sd }t          ||          S )N   c                     g | ]}|S rG   rG   r)  s     r2   rS   z:MultiModalDataParser._parse_video_data.<locals>.<listcomp>|  r+  r3   r   ziVideo metadata is required but not found in mm input. Please check your video input in `multi_modal_data`)r   )r   r  r  r   r   r   r&   r   ro   r   r   r   r   r   r   r8   re   r   rc   r   r&  r  r   r/  )r0   r)   r0  
new_videosmetadata_lstr2  r   r   s           r2   _parse_video_dataz&MultiModalDataParser._parse_video_datag  s    <&t,,,>>$ 	4d## 	H&tT-FGGG tX^,,	$U\ :;;	 	QJJrz5<899 	004000JJe$$ 	TaJJJ%
DcNT,A ABCEE
46# 	) 	)I";;IFFOE8( 	)#$N   !!5("3444##H----!!%((((( 	H":EEEEr3   c                     ||                      |          rdS |                     |          rt          d          t          |          S )z9Parse vision chunk data (unified image and video chunks).Nz8Do not support embedding data for vision_chunk right now)r  r  r   r   r  s     r2   _parse_vision_chunk_dataz-MultiModalDataParser._parse_vision_chunk_data  sR    
 <4>>$//<4d## 	YWXXX(...r3   c                 8    | j         | j        | j        | j        dS )N)r   r   r   r   )r5  r8  r>  r@  r9   s    r2   _get_subparsersz$MultiModalDataParser._get_subparsers  s)    +++ 9	
 
 	
r3   mm_datac                     |                                  }t                      }|                                D ]3\  }}||vrt          d|            ||         |          x}|||<   4|S )NzUnsupported modality: )rB  r   r  r   )r0   rC  
subparsersmm_itemskvparsed_datas          r2   parse_mm_dataz"MultiModalDataParser.parse_mm_data  s    ))++
&((MMOO 	* 	*DAq
"" !=!!=!=>>>  -z!}Q///<)r3   ))r7   r`   ra   rb   r-  rd   r   r  r/   classmethodrf   r   r   r   re   r  r  r   r   r   r   r#  r"   r   rc   r   r&  r   r(   r5  r   r8  r>  r@  r   r  rB  r   r   rJ  rg   rh   s   @r2   r  r    s        & #'&*=F%*+/9 9 9 4<9 t	9
  ''9:9 #9 "Dj9 
9 9 9 9 9 9& 	5<$u|"44	5   [f 4     
rz54<'	(    
rz4S>D00	1   ,/9%,/ 
38	$t	+,/ ,/ ,/ ,/\/9%/ 
38	$t	+/ / / /4-F9%-F 
38	$t	+-F -F -F -F^	/3	/ 
38	$t	+	/ 	/ 	/ 	/
.@)@!A 
 
 
 
%7 <O        r3   r  r-   )Iabcr   r   collectionsr   collections.abcr   r   r   r	   typingr
   r   r   r   r   r   r   r   r"  r   r   typing_extensionsr   vllm.utils.collection_utilsr   vllm.utils.import_utilsr   r   r   r   r   inputsr   r   r   r   r   r   r   r    r!   r"   rp   r#   r$   r%   	PIL.Imager   r&   globalsr(   rj   r   rc   rd   r   re   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r  rG   r3   r2   <module>rV     s   $ # # # # # # # #             A A A A A A A A A A A A	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	      * * * * * * 2 2 2 2 2 2 . . . . . . = = = = = = = = = =                        " ! ! ! ! !WT]]WT]] >       z*ggii==H5" 5" 5" 5" 5"WRV_ 5" 5" 5"p    +HRL",<=   6 
 
L

 :
 
	
 
 
 
0T' T' T' T' T'elT%,%77EFT' T' T'n9 9 9 9 9gc5</0'#u|:K2LLM9 9 9x    -k:   > > > > >. > > >    
   
    -k:   $> > > > >. > > >    -k:   4> > > > >. > > >/ / / / / 3C 8 / / / WT*3845555 5 5 5 5(3(9#s((C#CD 5 5 5p !)#*384t;;! I   
k k k k k k k k k kr3   