
    .`i}                     ~   U d dl mZmZ d dlmZmZ d dlmZmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d dlZd d	lmZ d d
lmZm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' erd dl(Z(d dl)Z(d dl*m+Z+ ddl,m-Z- n e%d e.            d          Z( e d          Z/edej0        df         Z1ee2d<   	 ee3d         ej0        de3ej0                 e3d         f         Z4ee2d<   	 ee3e5         ej0        df         Z6ee2d<   	 ee1ddf         Z7ee2d<   	 ee4de8e4e9e:ef         f         f         Z;ee2d<   	 ee6e8ej0        e5f         df         Z<ee2d<   	 e/e3e/dz           z  dz  Z=ee2d<   	  G d de          Z> G d d e          Z?e>e?z  Z@	 e G d! d"ed#$                      ZAee:e=e         f         ZBee2d%<   	 ee:e3e:dz           e:z  f         ZCee2d&<   	  e
d'(           G d) d*                      ZDee3d+         e3d         de8d,         f         ZEee2d+<   	 d-eEd.eEd/eFfd0ZGd1eEd2e(jH        jI        d/eEfd3ZJe9e:eEf         ZKee2d4<   	 d-eKd.eKd/eFfd5ZLe
 G d6 d7                      ZMe
 G d8 d9                      ZN e
d'd':           G d; d<e                      ZO e
d'd':           G d= d>eO                      ZP e
d'd':           G d? d@eO                      ZQ e
d'd':           G dA dBeO                      ZR e
d'(           G dC dD                      ZS G dE dFee:eNf                   ZT e dGeTeTdz  eTH          ZU G dI dJee:eeU         f                   ZVeVeT         eVeTdz           z  ZWee2dK<   e9e:e3e:         f         ZX	 ee:eeD         f         ZYee2dL<   	  G dM dNe          ZZ G dO dPeZ          Z[dS )Q    )ABCabstractmethod)UserDictdefaultdict)MappingSequence)	dataclass)cached_propertypartial)
accumulate)	TYPE_CHECKINGAnyLiteralOptional	TypeAlias	TypedDictUnioncastfinalN)Image)NotRequiredTypeVar)full_groupby
is_list_of)
LazyLoaderjson_map_leaves)BatchFeature   )MediaWithBytestorch_Tr   torch.TensorHfImageItemHfVideoItemHfAudioItemzMediaWithBytes[HfImageItem]	ImageItem	VideoItem	AudioItemModalityDatac                   D    e Zd ZU dZed         ed<   eed<   edz  ed<   dS )VisionChunkImagez.Represents an image wrapped as a vision chunk.imagetypeNuuid)__name__
__module____qualname____doc__r   __annotations__r   str     j/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/multimodal/inputs.pyr,   r,   l   sB         88
'
LLL
*r7   r,   c                   d    e Zd ZU dZed         ed<   ee         ed<   edz  ed<   eed<   e	ed<   dS )VisionChunkVideoz'Represents a video chunk with metadata.video_chunkr.   Nr/   prompt	video_idx)
r0   r1   r2   r3   r   r4   listr   r5   intr6   r7   r8   r:   r:   t   sY         11
-
    e
*KKKNNNNNr7   r:   c                   r    e Zd ZU dZee         ed<   	 ee         ed<   	 ee         ed<   	 ee	         ed<   dS )MultiModalDataBuiltinsz7Type annotations for modality types predefined by vLLM.r-   videoaudiovision_chunkN)
r0   r1   r2   r3   r*   r'   r4   r(   r)   VisionChunkr6   r7   r8   rA   rA      se         AA	""""	""""	""""{++++RRr7   rA   F)totalMultiModalDataDictMultiModalUUIDDictT)frozenc                       e Zd ZU dZeed<   	 eed<   	 dZed         ed<   	 ede	j
        dz  fd            Zedefd	            Zd
ededeeef         fdZdeeeef                  fdZdedefdZdS )PlaceholderRangea  
    Placeholder location information for multi-modal data.

    Example:

    Prompt: `AAAA BBBB What is in these images?`

    Images A and B will have:

    ```
    A: PlaceholderRange(offset=0, length=4)
    B: PlaceholderRange(offset=5, length=4)
    ```
    offsetlengthNr#   is_embedreturnc                 J    | j         d n| j                             d          S )Nr   dim)rN   cumsumselfs    r8   embeds_cumsumzPlaceholderRange.embeds_cumsum   s&    },tt$-2F2F12F2M2MMr7   c                 R    | j         | j        S t          | j         d                   S )N)rV   rM   r?   rT   s    r8   get_num_embedszPlaceholderRange.get_num_embeds   s)    %;4%b)***r7   	start_idxend_idxc                     | j         ||fS |dk    rt          | j         |dz
                     nd}t          | j         |dz
                     }||fS )a  
        Returns the starting and ending indices of the embeddings of encoder outputs
        in the range of [start_idx, end_idx) in the placeholders.

        For example, given:
        PlaceholderRange(offset=2, length=5, is_embed=[False, True, False, True, True])

        If start_idx=3 and end_idx=5, the output is (1, 3) because we want to get
        the second and the third embeddings from the encoder output.
        Nr   r   )rV   r?   )rU   rZ   r[   embeds_start_idxembeds_end_idxs        r8   get_embeds_indices_in_rangez,PlaceholderRange.get_embeds_indices_in_range   sj     %g%% 7@!mmC"9q=1222 	 T/!<==//r7   c                 6   | j         | j        | j        | j        z   dz
  fgS | j                                         }t	          j        t	          j        ||                    d                    dk                                              }t	          j        t	          j        ||                    d                    dk                                              }t	          j	        ||fd          | j        z   }d |
                                D             S )a  Extract the start and end indices of the embedded region in prompt.

        For example, given `PlaceholderRange(offset=2, length=5)` and
        `is_embed = [False, True, False, True, True]`, the output is
        `[(1 + offset, 1 + offset), (3 + offset, 4 + offset)]`.

        Returns:
            A tuple `(start, end)` representing the start and end
            indices (inclusive) of the embedded region.
            Returns full placeholder range if `is_embed` is `None`.
        Nr   )prepend)appendrX   rQ   c                 ,    g | ]}t          |          S r6   )tuple).0xs     r8   
<listcomp>z9PlaceholderRange.extract_embeds_range.<locals>.<listcomp>   s    222Qa222r7   )rN   rL   rM   r?   r!   nonzerodiff	new_zerosflattenstacktolist)rU   mask_istartsendsrangess        r8   extract_embeds_rangez%PlaceholderRange.extract_embeds_range   s     = [$+";a"?@AA""$$Jvv'7'7':':;;;q@
 

')) 	 }Jvf&6&6q&9&9:::b@
 

')) 	 fd^333dkA22&--//2222r7   otherc                     t          || j                  sdS | j        | j        f|j        |j        fk    sdS | j        	|j        d u S |j        	| j        d u S t          | j        |j                  S NF)
isinstance	__class__rL   rM   rN   nested_tensors_equal)rU   rs   s     r8   __eq__zPlaceholderRange.__eq__   s{    %00 	5T[)elEL-III5= >T))>!=D((#DM5>BBBr7   )r0   r1   r2   r3   r?   r4   rN   r   r
   r!   TensorrV   rY   rd   r_   r>   rr   objectboolry   r6   r7   r8   rK   rK      s4          KKK;KKK()-Hh~&---
 Nu|d2 N N N _N + + + + _+00'*0	sCx0 0 0 0.3d5c?&; 3 3 3 32CF Ct C C C C C Cr7   rK   NestedTensors)r#   .abrO   c                 v   t          | t          j                  r/t          |t          j                  ot          j        | |          S t          |t          j                  r/t          | t          j                  ot          j        ||           S t          | t                    r<t          |t                    o&t          d t          | |          D                       S t          |t                    r<t          | t                    o&t          d t          ||           D                       S | |k    S )ze
    Equality check between
    [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects.
    c              3   <   K   | ]\  }}t          ||          V  d S Nrx   )re   a_b_s      r8   	<genexpr>z'nested_tensors_equal.<locals>.<genexpr>"  B       +
 +
-3R R((+
 +
 +
 +
 +
 +
r7   c              3   <   K   | ]\  }}t          ||          V  d S r   r   )re   r   r   s      r8   r   z'nested_tensors_equal.<locals>.<genexpr>&  r   r7   )rv   r!   rz   equalr>   allzipr~   r   s     r8   rx   rx     s3   
 !U\"" A!U\**@u{1a/@/@@	Au|	$	$ A!U\**@u{1a/@/@@!T 
!T"" 
s +
 +
7:1ayy+
 +
 +
 (
 (
 	
 !T 
!T"" 
s +
 +
7:1ayy+
 +
 +
 (
 (
 	

 6Mr7   tensorsdevicec                 2    | S t          fd|           S )Nc                 j    t          | t          j                  r|                     d          n| S )NT)r   non_blocking)rv   r!   rz   to)rf   r   s    r8   <lambda>z%_nested_tensors_h2d.<locals>.<lambda>7  s4    !U\**add&td<<< r7   r   )r   r   s    `r8   _nested_tensors_h2dr   .  s:     ~    	  r7   BatchedTensorInputsc                 >     t           fd D                       S )zq
    Equality check between
    [`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects.
    c              3   X   K   | ]$}|v ot          |         |                   V  %d S r   r   )re   kr~   r   s     r8   r   z(batched_tensors_equal.<locals>.<genexpr>K  s?      FFqqAv:.qtQqT::FFFFFFr7   )r   r   s   ``r8   batched_tensors_equalr   F  s,    
 FFFFFAFFFFFFr7   c                       e Zd ZU dZed         ed<   	 eed<   	 eed<   	 eed<   	 dZedz  ed<   	 e	d	e
d          d
ee         fd            ZdS )MultiModalFeatureSpeca  
    Represents a single multimodal input with its processed data and metadata.

    Used by the V1 engine to track multimodal data through processing and
    caching. A request containing multiple multimodal items will have one
    MultiModalFeatureSpec per item.
    MultiModalKwargsItemdatamodality
identifiermm_positionNmm_hashfeatureskeysc                     t          t          t          t                   f         t                    }| D ]:}|j        }|/|D ],}||v r&||                             ||         j                   -;t          |          S r   )r   r5   r>   r}   r   rb   dict)r   r   kwargsfitemr   s         r8   gather_kwargsz#MultiModalFeatureSpec.gather_kwargsg  s~    S$}"556t<< 	7 	7A6D 7 7ADyyq	((a666F||r7   )r0   r1   r2   r3   r   r4   r5   rK   r   staticmethodr>   setr   r6   r7   r8   r   r   N  s           )
*****MMM>OOO6!!!!6GS4ZA
%< = 
SX 
 
 
 \
 
 
r7   r   c                   T    e Zd ZU dZeed<   	 eed<   	 eed<   	 ded<   	 dedefd	Z	d
S )MultiModalFieldElemz{
    Represents a keyword argument inside a
    [`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem].
    r   keyr   BaseMultiModalFieldfieldrs   rO   c                 8   t          || j                  sdS | j        
|j        d u }n+|j        
| j        d u }nt          | j        |j                  }| j        | j        f|j        |j        fk    o)|o't          | j                  t          |j                  u S ru   )rv   rw   r   rx   r   r   r.   r   )rU   rs   
data_equals      r8   ry   zMultiModalFieldElem.__eq__  s    %00 	59t+JJZd*JJ-diDDJ ]DH%%.%))DD 66TZ  D$5$55	
r7   N)
r0   r1   r2   r3   r5   r4   r}   r{   r|   ry   r6   r7   r8   r   r   u  s          
 MMM
 
HHH  !   

F 
t 
 
 
 
 
 
r7   r   )rI   kw_onlyc            
           e Zd ZU dZdZeed<   	 dedefdZe	dedede
dee         fd	            Ze	d
ee
         dede
fd            Zddddee         dej        j        dede
fdZdS )r   z
    Defines how to interpret tensor data belonging to a keyword argument for
    [`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems],
    and vice versa.
    Fkeep_on_cpur   r   c                `    t          t          |||           dt          dt          ffd}|S )N)r   r   r   r   rO   c                      |           S )Nr   r6   )r   r   s    r8   factoryz3BaseMultiModalField._field_factory.<locals>.factory  s    1$<<<r7   )r   r   r}   )rU   r   r   r   r   s       @r8   _field_factoryz"BaseMultiModalField._field_factory  sW    	
 
 
	 - 	 ,? 	  	  	  	  	  	  r7   r   rO   c                     t           )a
  
        Construct
        [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
        instances to represent the provided data.

        This is the inverse of
        [`reduce_data`][vllm.multimodal.inputs.BaseMultiModalField.reduce_data].
        NotImplementedError)rU   r   r   r   s       r8   build_elemszBaseMultiModalField.build_elems  s
     "!r7   batch
pin_memoryc                    t           r   r   rU   r   r   s      r8   _reduce_dataz BaseMultiModalField._reduce_data  s
     "!r7   Nr   r   elemsr   c                   d |D             }t          t          |                    dk    rt          d|          |	| j        rd}|r	| j        rd}d |D             }|                     ||          }t          ||	          S )
z
        Merge the data from multiple instances of
        [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem].

        This is the inverse of
        [`build_elems`][vllm.multimodal.inputs.BaseMultiModalField.build_elems].
        c                 6    g | ]}t          |j                  S r6   )r.   r   )re   r   s     r8   rg   z3BaseMultiModalField.reduce_data.<locals>.<listcomp>  s"    :::DtDJ'':::r7   r   z#Cannot merge different field_types=NcpuFc                     g | ]	}|j         
S r6   r   re   elems     r8   rg   z3BaseMultiModalField.reduce_data.<locals>.<listcomp>  s    ---t---r7   )r   )r   )lenr   
ValueErrorr   r   r   )rU   r   r   r   field_typesr   outs          r8   reduce_datazBaseMultiModalField.reduce_data  s     ;:E:::s;  1$$E{EEFFF$"2F 	$* 	J--u---*=="3v6666r7   )r0   r1   r2   r3   r   r|   r4   r5   r   r   r}   r   r   r   r>   r   r!   typesDevicer   r6   r7   r8   r   r     s=          K
# C     "" " 	"
 
%	&" " " ^"  "M"" 	"
 
" " " ^" &* 7 7 7'(7 "	7
 7 
7 7 7 7 7 7r7   r   c            	       V    e Zd ZdZdedededee         fdZde	e         de
defd	Zd
S )MultiModalBatchedFieldzo
    Info:
        [`MultiModalFieldConfig.batched`][vllm.multimodal.inputs.MultiModalFieldConfig.batched]
    r   r   r   rO   c                 N    |                      ||          fd|D             S )Nr   r   c                 &    g | ]} |          S r6   r6   )re   r   field_factorys     r8   rg   z6MultiModalBatchedField.build_elems.<locals>.<listcomp>  s#    555d##555r7   )r   rU   r   r   r   r   s       @r8   r   z"MultiModalBatchedField.build_elems  s7     ++X3+GG55555555r7   r   r   c                B   t          |          dk    r	t          |t          j        d          rt	          t
          t          j                 |          }t          |          dk    r-|d                             d                                          S |d         j        t          fd|D                       r`t          j
        t          |          g|d         j        R |d         j        |d         j        |          }t          j        ||          S |S )Nr   r   checkr   c              3   .   K   | ]}|j         k    V  d S r   shape)re   r   first_shapes     r8   r   z6MultiModalBatchedField._reduce_data.<locals>.<genexpr>  s*      ??4:,??????r7   dtyper   r   )r   )r   r   r!   rz   r   r>   	unsqueeze
contiguousr   r   emptyr   r   rl   )rU   r   r   r   r   s       @r8   r   z#MultiModalBatchedField._reduce_data  s    u::>>jEJJJ>el+U33E5zzQ Qx))!,,77999(.K????????? 3kZZ1%(.11(. 8?)	   {5c2222r7   N)r0   r1   r2   r3   r5   r}   r   r   r   r>   r|   r   r6   r7   r8   r   r     s         
66 6 	6
 
%	&6 6 6 6M" 	
 
     r7   r   c            	           e Zd ZU dZee         eee                  z  ed<   dZeed<   de	de	de
dee         fd	Zd
ee
         dede
fdZdS )MultiModalFlatFieldz
    Info:
        [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
        [`MultiModalFieldConfig.flat_from_sizes`][vllm.multimodal.inputs.MultiModalFieldConfig.flat_from_sizes]
    slicesr   rR   r   r   r   rO   c                     |                      ||          t          | j        t          d          s$t	          t
          j                  s
J d            fd| j        D             S )Nr   r   r   z,torch.Tensor is required for multiple slicesc           	      X    g | ]&} t          t          |                             'S r6   )r   slice)re   sr   r   s     r8   rg   z3MultiModalFlatField.build_elems.<locals>.<listcomp>=  s0    IIId4q>>233IIIr7   )r   r   r   r   rv   r!   rz   r   s      `@r8   r   zMultiModalFlatField.build_elems2  s     ++X3+GG$+uE::: 	dEL11  > 1 JIIIIT[IIIIr7   r   r   c                  	 t          |          dk    rMt          |t          j        d          r0t	          t
          t          j                 |          }t          |          dk    r|d                                         S | j        | j        dk     t          |d         j                  z  z   dt          j        ffd |d                   	t          	fd|D                       rq	\  }}t          fd|D                       }t          j        g |||R |d         j        |d         j        |	          }t          j        || j        |
          S | j        dk    s
J d            d |D             S )Nr   r   r   r   tensorc                 F    | j         d          | j         dz   d          fS Nr   r   )r   rR   s    r8   _shape_before_afterz=MultiModalFlatField._reduce_data.<locals>._shape_before_afterO  s(    |DSD)6<a		+BBBr7   c              3   6   K   | ]} |          k    V  d S r   r6   )re   r   r   r   s     r8   r   z3MultiModalFlatField._reduce_data.<locals>.<genexpr>T  s4      NN&&t,,;NNNNNNr7   c              3   2   K   | ]}|j                  V  d S r   r   )re   r   rR   s     r8   r   z3MultiModalFlatField._reduce_data.<locals>.<genexpr>V  s)      "E"Et4:c?"E"E"E"E"E"Er7   r   )rR   r   z$dim == 0 is required for nested listc                     g | ]	}|D ]}|
S r6   r6   )re   r   es      r8   rg   z4MultiModalFlatField._reduce_data.<locals>.<listcomp>`  s%    222dT222222r7   )r   r   r!   rz   r   r>   r   rR   r   r   sumr   r   r   concat)
rU   r   r   shape_beforeshape_aftershape_concatr   r   rR   r   s
          @@@r8   r   z MultiModalFlatField._reduce_data?  s    u::>>jEJJJ>el+U33E5zzQ Qx**,,,(dhlc%(..A.AAACCEL C C C C C C .-eAh77KNNNNNNNNNN 	B,7)k""E"E"E"Eu"E"E"EEEk?l?L?;??(. 8?)	   |EtxSAAAAx1}}}D}}}22e2222r7   N)r0   r1   r2   r3   r   r   r4   rR   r?   r5   r}   r   r   r>   r|   r   r6   r7   r8   r   r   '  s           UOhx77777CLLLJJ J 	J
 
%	&J J J J!3M"!3 	!3
 
!3 !3 !3 !3 !3 !3r7   r   c            	       b    e Zd ZU dZeed<   dedededee	         fdZ
dee         d	edefd
ZdS )MultiModalSharedFieldzm
    Info:
        [`MultiModalFieldConfig.shared`][vllm.multimodal.inputs.MultiModalFieldConfig.shared]
    
batch_sizer   r   r   rO   c                 X    |                      ||          } ||          g| j        z  S )Nr   )r   r   r   s        r8   r   z!MultiModalSharedField.build_elemsl  s6     ++X3+GGd##$t66r7   r   r   c                    |d         S )Nr   r6   r   s      r8   r   z"MultiModalSharedField._reduce_datau  s     Qxr7   N)r0   r1   r2   r3   r?   r4   r5   r}   r   r   r   r>   r|   r   r6   r7   r8   r   r   c  s          
 OOO77 7 	7
 
%	&7 7 7 7M" 	
 
     r7   r   c                   "   e Zd ZU edddedefd            Ze	 ddddedee         eee                  z  de	defd	            Z
e	 dddded
dde	defd            Zedddede	defd            Zeed<   eed<   dededee         fdZdS )MultiModalFieldConfigFr   r   r   c                @    t          t          |          |           S )aN  
        Defines a field where an element in the batch is obtained by
        indexing into the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.

        Example:

        ```
        Input:
            Data: [[AAAA]
                [BBBB]
                [CCCC]]

        Output:
            Element 1: [AAAA]
            Element 2: [BBBB]
            Element 3: [CCCC]
        ```
        r  r   r   )r  r   )r   r   s     r8   batchedzMultiModalFieldConfig.batched  s,    2 %([AAA
 
 
 	
r7   r   r   rR   c                D    t          t          |||          |           S )a  
        Defines a field where an element in the batch is obtained by
        slicing along the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            slices: For each multi-modal item, a slice (dim=0) or a tuple of
                slices (dim>0) that is used to extract the data corresponding
                to it.
            dim: The dimension to extract data, default to 0.
            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.

        Example:

        ```
        Given:
            slices: [slice(0, 3), slice(3, 7), slice(7, 9)]

        Input:
            Data: [AAABBBBCC]

        Output:
            Element 1: [AAA]
            Element 2: [BBBB]
            Element 3: [CC]
        ```

        ```
        Given:
            slices: [
                (slice(None), slice(0, 3)),
                (slice(None), slice(3, 7)),
                (slice(None), slice(7, 9))]
            dim: 1

        Input:
            Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]

        Output:
            Element 1: [[A],[A],[A]]
            Element 2: [[B],[B],[B],[B]]
            Element 3: [[C],[C]]
        ```
        )r   rR   r   r  )r  r   )r   r   rR   r   s       r8   flatzMultiModalFieldConfig.flat  s<    j %%'  
 
 
 
 	
r7   size_per_itemr#   c                    |j         dk    rt          d|j                   dgt          |          fdt	          t          |                    D             }t                              | ||          S )a  
        Defines a field where an element in the batch is obtained by
        slicing along the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            size_per_item: For each multi-modal item, the size of the slice
                that is used to extract the data corresponding to it.
            dim: The dimension to slice, default to 0.
            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.

        Example:

        ```
        Given:
            size_per_item: [3, 4, 2]

        Input:
            Data: [AAABBBBCC]

        Output:
            Element 1: [AAA]
            Element 2: [BBBB]
            Element 3: [CC]
        ```

        ```
        Given:
            size_per_item: [3, 4, 2]
            dim: 1

        Input:
            Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]

        Output:
            Element 1: [[A],[A],[A]]
            Element 2: [[B],[B],[B],[B]]
            Element 3: [[C],[C]]
        ```

        Info:
            [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
        r   z7size_per_item should be a 1-D tensor, but found shape: r   c           	      |    g | ]8}t          d d d           fz  t          |         |dz                      fz   9S r   )r   )re   irR   
slice_idxss     r8   rg   z9MultiModalFieldConfig.flat_from_sizes.<locals>.<listcomp>  sa     
 
 
  4t$$&,Z]Jq1u$56689
 
 
r7   )rR   r   )ndimr   r   r   ranger   r  r  )r   r  rR   r   r   r  s     `  @r8   flat_from_sizesz%MultiModalFieldConfig.flat_from_sizes  s    j "":$1$7: :  
 4*]334

 
 
 
 
 3}--..
 
 
 %))#	 * 
 
 	
r7   r   c                B    t          t          ||          |           S )a  
        Defines a field where an element in the batch is obtained by
        taking the entirety of the underlying data.

        This means that the data is the same for each element in the batch.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            batch_size: The number of multi-modal items which share this data.
            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.

        Example:

        ```
        Given:
            batch_size: 4

        Input:
            Data: [XYZ]

        Output:
            Element 1: [XYZ]
            Element 2: [XYZ]
            Element 3: [XYZ]
            Element 4: [XYZ]
        ```
        )r   r   r  )r  r   )r   r   r   s      r8   sharedzMultiModalFieldConfig.shared%  s9    F %'%'   
 
 
 	
r7   r   r   r   rO   c                 D    | j                             | j        ||          S r   )r   r   r   )rU   r   r   s      r8   r   z!MultiModalFieldConfig.build_elemsS  s     
 z%%dmS%@@@r7   N)r   )r0   r1   r2   r   r5   r|   r  r   r   r?   r  r  r  r   r4   r}   r   r   r6   r7   r8   r  r  ~  s        6; 
 
 
# 
t 
 
 
 \
:  ;

 ";
 ;
 ;
;
(8E?";;;
 ;

 ;
 ;
 ;
 \;
z  F

 "F
 F
 F
F
%F
 F

 F
 F
 F
 \F
P 
 "	(
 (
 (
(
(
 	(
 (
 (
 \(
T MMMAA A 
%	&	A A A A A Ar7   r  c                        e Zd ZdZeddedefd            Zedee	         fd            Z
i fdeee	f         d	d
f fdZed	efd            Zd	eeef         fdZ xZS )r   z
    A collection of
    [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
    corresponding to a data item in
    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
    r   r   nbytesc                     t          | dt          j        |t          j                  t	          d                    }t
                              |g          S )zConvenience class for testing.dummy)r   r   )r   )r   r   r   r   )r   r!   r   uint8r   r   
from_elems)r   r  mm_elems      r8   r  zMultiModalKwargsItem.dummyc  sX     &V5;777'1555	
 
 
 $..y999r7   r   c                 4    t          d | D                       S )Nc                     i | ]
}|j         |S r6   r   r   s     r8   
<dictcomp>z3MultiModalKwargsItem.from_elems.<locals>.<dictcomp>p  s    $F$F$FTXt$F$F$Fr7   )r   )r   s    r8   r  zMultiModalKwargsItem.from_elemsn  s    #$F$F$F$F$FGGGr7   r   rO   Nc                    t                                          |           d |                                 D             }t          |          dk    sJ d|             t	          t          |                    | _        d S )Nc                     h | ]	}|j         
S r6   r   r   s     r8   	<setcomp>z0MultiModalKwargsItem.__init__.<locals>.<setcomp>u  s    >>>dm>>>r7   r   zFound different modalities=)super__init__valuesr   nextiter	_modality)rU   r   
modalitiesrw   s      r8   r#  zMultiModalKwargsItem.__init__r  sy    >>>>>
:!###%O:%O%O###d:..//r7   c                     | j         S r   )r'  rT   s    r8   r   zMultiModalKwargsItem.modalityy  s
    ~r7   c                 >    d |                                  D             S )Nc                 $    i | ]\  }}||j         S r6   r   )re   r   r   s      r8   r  z1MultiModalKwargsItem.get_data.<locals>.<dictcomp>~  s     ===93TY===r7   )itemsrT   s    r8   get_datazMultiModalKwargsItem.get_data}  s    ==

====r7   )r   )r0   r1   r2   r3   r   r5   r?   r  r   r   r  r   r#  propertyr   r   r}   r-  __classcell__rw   s   @r8   r   r   [  s         : : :S : : : \: H(#67 H H H \H BD 0 0WS*=%=> 0 0 0 0 0 0 0 #    X>$sM12 > > > > > > > >r7   r   _I)defaultc                        e Zd ZdZedddeeef         fd            Zede	e
         fd            Zded	e	e         f fd
ZddZddddej        j        ded	efdZ xZS )MultiModalKwargsItemszu
    A dictionary of
    [`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem]s
    by modality.
    	hf_inputsr   config_by_keyc                 Z   t          t          t          t                   f                     t	          t          t
          t                   f         t
                    }|                                D ]j\  }}|                     |          }|N|                    ||          }t          |          dk    r%||<   ||j
                                     |           kt          t                               }|                                D ]\  }}	fd|	D             }
d |
                                D             }t          t          |                                                    dk    rt          d|d|          t!          t#          |                                                    }t%          |          D ]Ofd|
                                D             }|                    t                              |                     Pt*                              |          S )Nr   c                 "    i | ]}||         S r6   r6   )re   r   elems_by_keys     r8   r  z8MultiModalKwargsItems.from_hf_inputs.<locals>.<dictcomp>  s     B B BLO B B Br7   c                 4    i | ]\  }}|t          |          S r6   )r   )re   r   vs      r8   r  z8MultiModalKwargsItems.from_hf_inputs.<locals>.<dictcomp>  s$    KKKA1c!ffKKKr7   r   z0Cannot merge different batch sizes for modality=z! Found: batch_sizes=c                      g | ]
}|         S r6   r6   )re   r;  item_idxs     r8   rg   z8MultiModalKwargsItems.from_hf_inputs.<locals>.<listcomp>  s    III8IIIr7   )r   r5   r   r   r   r   r,  getr   r   r   addr>   r   r$  r   r%  r&  r  rb   r  r4  from_seq)r5  r6  keys_by_modalityr   configr   r   r,  r   r   elems_in_modalitybatch_sizesr   r9  r=  s                @@r8   from_hf_inputsz$MultiModalKwargsItems.from_hf_inputs  s!    C*=!>>?AA&sCH}5c::(..00 	? 	?KCMM#&&E **366u::>>(-L%$V_599#>>>)*,,.4466 	E 	ENHd B B B BT B B BKK1B1H1H1J1JKKKK3{))++,,--11 -h - -)- -  
 d;#5#5#7#78899J!*-- E EIIII.?.F.F.H.HIII1<<UCCDDDDE %--e444r7   r,  c                 D    t          | d           }t          |          S )Nc                     | j         S r   r   )rf   s    r8   r   z0MultiModalKwargsItems.from_seq.<locals>.<lambda>  s    aj r7   r  )r   r4  )r,  items_by_modalitys     r8   r@  zMultiModalKwargsItems.from_seq  s(    (4H4HIII$%6777r7   r   rO   c           	          || vr4t          d|dt          |                                                      t                                          |          S )Nz	Modality z" not found. Available modalities: )KeyErrorr   r   r"  __getitem__)rU   r   rw   s     r8   rK  z!MultiModalKwargsItems.__getitem__  sh    4<H < <),TYY[[)9)9< <  
 ww""8,,,r7   +MultiModalKwargsItems[MultiModalKwargsItem]c           	          |                                  D ]2\  }}t          |          D ]\  }}|t          d| d| d          3| S )NzFound empty mm_items[][])r,  	enumerateRuntimeError)rU   r   r,  r  r   s        r8   require_dataz"MultiModalKwargsItems.require_data  su    #zz|| 	Q 	QOHe$U++ Q Q4<&'Ox'O'O1'O'O'OPPP  Q r7   NFr   r   r   c          	         t          t          t          t                   f         t                    }|                                 D ]g\  }}t          |          D ]R\  }}|t          d| d| d          |                                D ] \  }}	||                             |	           !Shfd|                                D             }
|
S )zAConstruct a dictionary of keyword arguments to pass to the model.Nz&Cannot build data from empty mm_items[rN  rO  c                 ^    i | ])\  }}||d          j                             |          *S )r   r   )r   r   )re   r   r   r   r   s      r8   r  z2MultiModalKwargsItems.get_data.<locals>.<dictcomp>  sU     
 
 
 U q++% ,  
 
 
r7   )r   r5   r>   r   r,  rP  rQ  rb   )rU   r   r   r9  r   r,  r  r   r   r   r   s    ``        r8   r-  zMultiModalKwargsItems.get_data  s    #3-@(A#AB4HH#zz|| 	3 	3OHe$U++ 3 34<&QQQQQQQ   "& 3 3IC %,,T222233
 
 
 
 
 +0022
 
 
 r7   )rO   rL  )r0   r1   r2   r3   r   r   r5   r  rE  r   r   r@  r1  rK  rR  r!   r   r   r|   r   r-  r/  r0  s   @r8   r4  r4    s          5! 5s$99: 5  5  5 \ 5D 8!56 8 8 8 \8-C -HRL - - - - - -    &* 	   " 	
 
       r7   r4  MultiModalKwargsOptionalItemsMultiModalPlaceholderDictc                   ~    e Zd ZU dZed         ed<   	 ee         ed<   	 eed<   	 e	ed<   	 e
ed<   	 ee         ed<   d	S )
MultiModalInputsz
    Represents the outputs of
    [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor],
    ready to be passed to vLLM internals.
    
multimodalr.   prompt_token_ids	mm_kwargs	mm_hashesmm_placeholders
cache_saltN)r0   r1   r2   r3   r   r4   r>   r?   rU  MultiModalHashesrV  r   r5   r6   r7   r8   rX  rX    s           ,
3iD,,,,N-....
 C     r7   rX  c                   *    e Zd ZU dZee         ed<   dS )MultiModalEncDecInputsz
    Represents the outputs of
    [`EncDecMultiModalProcessor`][vllm.multimodal.processing.EncDecMultiModalProcessor]
    ready to be passed to vLLM internals.
    encoder_prompt_token_idsN)r0   r1   r2   r3   r>   r?   r4   r6   r7   r8   ra  ra    s.           #3i'''88r7   ra  )\abcr   r   collectionsr   r   collections.abcr   r   dataclassesr	   	functoolsr
   r   	itertoolsr   typingr   r   r   r   r   r   r   r   r   numpynp	PIL.Imager   typing_extensionsr   r   vllm.utils.collection_utilsr   r   vllm.utils.import_utilsr   vllm.utils.jsontreer   r!   torch.types%transformers.feature_extraction_utilsr   mediar    globalsr"   ndarrayr$   r4   r>   r%   floatr&   r'   rd   r   r5   r(   r)   r*   r,   r:   rE   rA   rG   rH   rK   r}   r|   rx   r   r   r   r   r   r   r   r   r   r   r   r  r   r1  r4  rU  r_  rV  rX  ra  r6   r7   r8   <module>rw     s   $ # # # # # # # # - - - - - - - - - - - - - - - - ! ! ! ! ! ! . . . . . . . .            
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
           2 2 2 2 2 2 2 2 @ @ @ @ @ @ @ @ . . . . . . / / / / / / 4LLLBBBBBB%%%%%%%Jw		733EWT]]w
NBCY C C C
 M2:~tBJ/?nAUUY   
 tE{BJFGY G G G
 [.:WWX	9 X X X {DcN'B!CC	9    [%
E0A*BNRS	9 S S S tBI.5i 5 5 5    y       y    !11 9 S S S S SYe S S S S  !(\#->(> ? I ? ? ? !(T#*-=-C(C D I D D D $bC bC bC bC bC bC bC bCJ !	
 y   
M m     .K    " "&c=&8!9 Y 9 9 9G0 G5H GT G G G G # # # # # # # #L 2
 2
 2
 2
 2
 2
 2
 2
j $%%%N7 N7 N7 N7 N7# N7 N7 &%N7b $%%%& & & & &0 & & &%&R $%%%83 83 83 83 83- 83 83 &%83v $%%%    /   &%4 $YA YA YA YA YA YA YA YAx#> #> #> #> #>8C)<$<= #> #> #>L W4 	  [ [ [ [ [HS(2,%67 [ [ [~ ./04789 y    T#Y' 
 (/sH=M4N/N'O 9 O O O
    y   >9 9 9 9 9- 9 9 9 9 9r7   