
    .`i              
      X   d dl mZ d dlZd dlmZmZmZmZ d dlm	Z	m
Z
 d dlZd dlZd dlZd dlmZ d dlmc mZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl"m$Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z<m=Z=m>Z>m?Z? d dl@mAZA d dlBmCZCmDZDmEZEmFZF d dlGmHZHmIZI d dlJmKZKmLZLmMZMmNZNmOZOmPZP d dlQmRZR d dlSmTZT d dlUmVZV d dlWmXZX d dlYmZZZm[Z[ d dl\m]Z]m^Z^ dd l_m`Z` dqd'Za G d( d)ejb                  Zc	 	 drdsd.Zd	 dtdud0Zed1Zfd2Zgd2Zhd3Zidvd6Zjdwd:Zk	 	 	 dxdydFZl ejm        egejn        G          o                    ddddH          Zp ejm        ehejn        G          o                    ddddH          ZqdzdMZreifd{dOZsd|dPZt	 	 d}d~dSZu G dT dUedVW          Zv G dX dY          Zw G dZ d[          Zx G d\ d]eM          Zy G d^ d_eKey                   Zz G d` dae]          Z{ G db dceL          Z| G dd deejb                  Z} G df dgejb                  Z~ G dh diejb                  Z G dj dkejb                  Z G dl dmejb                  Z eAj        e|eyezn           G do dpejb        e5e3e6e4                      ZdS )    )annotationsN)IterableIteratorMappingSequence)	AnnotatedAny	rearrange)BatchFeature)
TensorType)	TypedDictUnpack)
VllmConfig)ModelConfig)parallel_state)utils)MMEncoderAttention)ColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)default_weight_loader)MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)MultiModelKeys)	SiglipMLP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)get_tokenizer)get_cached_tokenizer)patch_rope_parameters)IsaacConfigPixelShuffleSiglip2VisionConfig)TensorSchemaTensorShape   )is_vit_use_data_parallel	seq_sizestorch.Tensordevicetorch.devicereturn!tuple[torch.Tensor, torch.Tensor]c                0   t          j        t          |           dz   t           j        |          }|                     d          |dd<   t          |           dk    r|                                 n t          j        dt           j        |          }||fS )zACreate cumulative sequence lengths for variable-length attention.r;   )dtyper?   r   N)torchzeroslenint32cumsummaxtensor)r=   r?   
cu_seqlens
max_seqlens       t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/isaac.pycreate_cumulative_seq_lengthsrO   N   s     S^^a/u{6RRRJ%%a((JqrrN y>>A 	\!5;v>>> 
 z!!    c                  0     e Zd Zd
 fdZddZdd	Z xZS )!Siglip2VariableSequenceEmbeddingsconfigr8   c                   t                                                       || _        |j        | _        |j        | _        t          |j        | j        z  | j        z  | j        d          | _        |j	        | _	        t          | j	        dz            | _        t          j        | j	        | j                  | _        d S )NF)
input_sizeoutput_sizereturn_bias      ?)super__init__rS   hidden_size	embed_dim
patch_sizer   num_channelspatch_embeddingnum_patchesintposition_embedding_sizenn	Embeddingposition_embedding)selfrS   	__class__s     rN   rZ   z*Siglip2VariableSequenceEmbeddings.__init__]   s    + +/*T_<tN 
  
  
 "-'*4+;S+@'A'A$"$,t/?"P"PrP   packed_seq_patches/tuple[torch.Tensor, torch.Tensor, torch.Tensor]rA   r>   c                   | j         j                            | j        | j        d                              ddd                              d          }|\  }}}g }d}d}d}	|D ]}
t          |
d                   t          |
d                   }}|dk    rT|dk    rNt          j        |||f|||	          }|                    | j	        ||z            
                    dd          }nG|                    | j	        | j        | j        z            
                    dd          d ||z           }|                    |           t          j        |d	          }|S )
N   r   r;   bilinearFT)sizemodealign_corners	antialiasdim)re   weightreshaperb   permute	unsqueezera   Finterpolater\   	transposeappendrE   cat)rf   rh   positional_embeddings_seq_patches
_seq_sizesspatial_shapespos_embeds_listro   rp   rq   spatial_shapeheightwidthresized_pos_embed
pos_embedss                  rN   r}   z7Siglip2VariableSequenceEmbeddings.positional_embeddingsm   s   
 #*22,d.JB  WQ1Yq\\ 	 4F0j.	+ 	6 	6Ma 0113}Q7G3H3HEFzzeaii$%M) %"/'% % %! %6$=$=NFUN% %)Aq// "!
 %:$A$AN043OO% % )Aq//"2FUN"2%4! ""#45555 YA666
rP   c                L   |\  }}}| j         j        }|                    |j        |j                  }|                      |          }|                     |          }|                                dk    r)|                    d|                    d                    }||z   }|S )Nr?   rD      rk   )	r_   rt   tor?   rD   r}   rs   viewrn   )	rf   rh   seq_patchesr   _spatial_shapestarget_weightpatch_embedsr   
embeddingss	            rN   forwardz)Siglip2VariableSequenceEmbeddings.forward   s     4F0Z,3!nn '}/B % 
 
 ++K88//0BCC
 ""',,R1B1B21F1FGGL "J.
rP   )rS   r8   )rh   ri   rA   r>   )rh   ri   )__name__
__module____qualname__rZ   r}   r   __classcell__rg   s   @rN   rR   rR   \   sl        Q Q Q Q Q Q + + + +Z       rP   rR   token_gridsscale_factorra   torch.device | Nonec                   || j         }t          |          }|dk     rt          d          t          j                                        sq|dddf         |z  dk                                    r%|dddf         |z  dk                                    s't          d| d|                                           g }d}t          |                                 |                                d	          D ]\  }\  }}	t          j
        ||t          j        
          |z   }
|
                    ||	          }
|
                    ||	|z  |          }
|
                    ||z  ||	|z  |          }
|
                    dddd                                          }
|                    |
                    d||z                       ||z  }t          j        |d          }|S )u  
    Build a gather-index map that tells us, for every *output* token after
    pixel-shuffle, which `scale_factor**2` *input* tokens are being merged.

    Args
    ----
    seq_sizes     : (num_images,)  - #patches in each image (row-major order)
    token_grids   : (num_images,2) - (height, width) for every image
    scale_factor  : spatial down-scale factor (≥2)
    device        : (optional) overrides `seq_sizes.device`

    Returns
    -------
    gather_idx : (new_total_seq_len, scale_factor**2) int64 tensor.
                 gather_idx[i, j] is the *flat* index into the *original*
                 packed sequence for the j-th sub-patch that forms the
                 i-th output token.
    Nrl   u   `scale_factor` must be ≥ 2r   r;   z?Every (H,W) in `token_grids` must be divisible by scale_factor=z, got F)strictr   r   rk   rr   )r?   ra   
ValueErrorrE   compileris_compilingallAssertionErrortolistziparangeint64r   rv   
contiguousr{   ru   r|   )r=   r   r   r?   rgather_chunks
tok_offsetseq_lenhwgrid
gather_idxs               rN   create_pixel_shuffle_index_mapr      s   0 ~!LA1uu7888 >&&(( 
	QQQT	Q	!	#((**
0;AAAqD0AA0E0J/O/O/Q/Q
 << <%0%7%7%9%9< <
 
 	

 )+MJy//11;3E3E3G3GPUVVV  !Q|GF%+FFFSyyA yyAFA&&yyaAFA..||Aq!Q''2244T\\"a!e44555g

 =a000JrP   xc                   |                                  dk    }|r>|                     d          dk    rt          d          |                     d          }n| }|                    d          }t	          |          }t          j        |d          }t          ||||j                  }||         }	|		                    |	                    d          ||z  |z            }
|r|

                    d          }
|
S )ao  Apply pixel shuffle to a packed vision sequence without unpacking per image.

    Args:
        x (`torch.Tensor`):
            Concatenated vision embeddings. Accepts `(seq_len, hidden_size)` or
            `(1, seq_len, hidden_size)` shapes produced by stacking image
            patches.
        token_grids (`torch.Tensor`):
            Integer tensor of shape `(num_images, 2)` whose rows give the
            `(height, width)` patch grid sizes corresponding to each image
            segment inside `x`.
        scale_factor (`int`, *optional*, defaults to 1):
            Spatial down-sampling factor specific to pixel shuffle. Values
            greater than one merge `scale_factor**2` neighboring patches into a
            single embedding channel-group.

    Returns:
        `torch.Tensor`: Pixel-shuffled embeddings with shape matching the input
        convention: `(seq_len, hidden_size * scale_factor**2)` when the input
        was 2D, or `(1, seq_len, hidden_size * scale_factor**2)` if the
        singleton batch dimension was present.

    Raises:
        ValueError: If more than one batch item is provided.
    r   r   r;   z3Packed sequence is expected to have batch_size == 1rk   rr   )r=   r   r   r?   )rs   rn   r   squeezera   rE   prodr   r?   ru   rw   )r   r   r   keep_batch_dimx_r\   r   r=   r   gatheredouts              rN   pixel_shuffle_varlenr      s   < UUWW\N 66!99>> !VWWWYYq\\ILA 
;B///I 0y	  J *~H 

8==++Y]Q->
?
?C  mmAJrP   i )rX   rX   rX   gp?arr
np.ndarrayc                    | j         j        r| S 	 |                     d           | S # t          $ r |                                 cY S w xY w)a  Return *arr* itself if it is already writeable, otherwise try to flip the
    write flag in-place and finally fall back to `arr.copy()`.
    This guarantees the buffer handed to `torch.from_numpy()` is always
    writeable, silencing the PyTorch warning about undefined behaviour.
    T)write)flags	writeablesetflagsr   copy)r   s    rN   _make_writeabler   =  sc     y 
4   
   xxzzs   ( A	A	imagePIL.Image.Imagetorch.Tensor | Nonec           	     4   | j         | j        z  t          k    r(t          d| j          d| j         dt           d          | j        dk    r| n|                     d          }t          j        |          }t          |          }t          j
        |          S )Nz	Image (w=z, h=z	) > MAX=``RGB)r   r   
MAX_PIXELSr   ro   convertnpasarrayr   rE   
from_numpy)r   imgr   s      rN   extract_image_pilr   P  s    {U\!J..MMMMM
MMM
 
 	
 :&&%%EMM%,@,@C
*S//C
#

CC   rP   h㈵>image_heightimage_widthr]   max_num_patchesmin_num_patches
int | Noneepsfloatpixel_shuffle_scaletuple[int, int]c                   d }||z  }t          j        | |z            |z  }	t          ||	          }	t          j        ||z            |z  }
t          ||
          }
|	|z  |
|z  z  }|y||k     rsd\  }}||z
  |k    rC||z   dz  } ||| ||          } |||||          }||z  ||z  z  }||k    r|}n|}||z
  |k    C|} ||| ||          } |||||          }||fS ||k    r|	|
fS |dz  d}}||z
  |k    rC||z   dz  } ||| ||          } |||||          }||z  ||z  z  }||k    r|}n|}||z
  |k    C|} ||| ||          } |||||          }||fS )a  Compute a target resolution whose patch grid satisfies patching parametrization.

    Args:
        image_height (`int`):
            Height in pixels of the source image prior to any resizing.
        image_width (`int`):
            Width in pixels of the source image prior to any resizing.
        patch_size (`int`):
            Size of the square patch used by the vision encoder.
        max_num_patches (`int`):
            Upper bound on `(height / patch_size) * (width / patch_size)` after
            resizing.
        min_num_patches (`int`, *optional*):
            Lower bound on the number of patches. When provided the image will
            be scaled up if necessary.
        eps (`float`, *optional*, defaults to 1e-5):
            Convergence tolerance for the internal binary search to determine
            the target dimensions.
        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
            Additional stride multiplier applied when pixel shuffle later
            reduces spatial resolution.

    Returns:
        `tuple[int, int]`: Height and width (in pixels) that are multiples of
        `patch_size * pixel_shuffle_scale` and respect both the maximum and
        optional minimum patch-count constraints.
    c                    | |z  }||z  }t          j        ||z            |z  }t          ||          }t          |          S N)mathceilrJ   ra   )scaleoriginal_sizer]   r   scaled_sizedivisors         rN   get_scaled_image_sizezAget_image_size_for_max_num_patches.<locals>.get_scaled_image_size  sM    m+22ig 566@';//;rP   N)      ?g      Y@rl   
   r   )r   r   rJ   )r   r   r]   r   r   r   r   r   r   adjusted_heightadjusted_widthr`   	scale_min	scale_maxr   target_heighttarget_widths                    rN   "get_image_size_for_max_num_patchesr   [  s   J      ..Giw 677'AO'?33OY{W455?N.11N"Z/NZ4OPK"{_'D'D)	99$,,*a/E11|Z1D M 10{J0C L ):5,:STKo--!		!	 9$,, --<-@
 
 -,;
,?
 
 l**		'	'..  #Rx9	9$,,*a/E11|Z1D M 10{J0C L ):5,:STKo--!		!	 9$,, --<-@
 
 -,;
,?
 
 l**rP   rD   rk   model_configr   vision_tokenstrc                    | j         p| j        }t          t          || j        | j        | j        p| j                            }|                    |d          d         S )N)tokenizer_modetrust_remote_coderevisionF)add_special_tokensr   )		tokenizermodelr5   r4   r   r   tokenizer_revisionr   encode)r   r   tokenizer_namer   s       rN   _resolve_vision_token_idr     sp    !+A|/AN$'6*<!4M8M		
 	
 	
 I LUCCAFFrP   r   c                    t          j        |           s|                                 } | |z  }t                              | j                  }t                              | j                  }||z
  |z  }|S )a  Standardize RGB images prior to patch extraction via rescaling and whitening.

    Args:
        image (`torch.Tensor`):
            Tensor with shape `(..., height, width, 3)` containing RGB values.
            The tensor is converted to floating point if needed.
        scale (`float`, *optional*, defaults to `VISION_SCALE`):
            Scalar multiplier applied before normalization.
    Returns:
        `torch.Tensor`: Normalized tensor with the same shape as the input and
        dtype `torch.float32`.
    )rE   is_floating_pointr   _MEAN_TENSORr   r?   _STD_TENSOR)r   r   rescaledmean_tensor
std_tensor
normalizeds         rN   prepare_image_tensorr    sj      "5)) u}H //%,//K--J[(J6JrP   c                $   | j         \  }}}}||z  s||z  rt          d| j          d| d          |                     |||z  |||z  ||          }|                    dddddd	          }|                    |||z  ||z  ||z  |z            }|S )
a  Convert normalized images into flattened ViT-style patches.

    Args:
        image (`torch.Tensor`):
            Tensor of shape `(num_images, height, width, channels)`.
        patch_size (`int`):
            Edge length of the square patches

    Returns:
        `torch.Tensor`:
            Patch tensor where each position stores the flattened pixels
            belonging to that patch.

    Raises:
        ValueError: If `height` or `width` is not divisible by `patch_size`.
    zDimensions of images z! are not divisible by patch_size=.r   r;   r   rl         )shaper   ru   rv   )r   r]   
num_imagesr   r   channelspatchess          rN   patchify_visionr    s    " +0+'Jx
 
ej0 
K{K K=GK K K
 
 	
 mm* G ooaAq!Q//Goo*:
*	 G NrP   imagestuple[torch.Tensor, list[int]]c                   |                                  dk    r|                     d          } |                     dddd          } | j        \  }}}}t	          ||||||          \  }}	t          j        | ||	fdd          } |                     dddd          } t          |           } t          | |	          }
|
j        \  }}}}|dk    rd||gn
d||z  ||z  g}|
|fS )
a  Resize, normalize, and patchify RGB images for the vision encoder.

    Args:
        images (`torch.Tensor`):
            Either `(height, width, channels)` for a single image or
            `(num_images, height, width, channels)` for a batch. Channels are
            expected to be RGB.
        patch_size (`int`):
            Edge length of square patches; implictly controls resize grid granularity.
        max_num_patches (`int`):
            Maximum number of patches allowed after resizing.
        min_num_patches (`int`, *optional*):
            Minimum number of patches. If provided, the routine upsamples images
            as needed to satisfy the lower bound.
        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
            Pixel shuffle scale factor; influences the target grid that the
            function produces.

    Returns:
        `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)`
        where `patches` has shape `(num_images, target_h / patch_size, target_w
        / patch_size, channels * patch_size**2)` and `dims_virtual` encodes
        effective `(images, height, width)` dimensions after optional pixel
        shuffling.
    r   r   r;   rl   r   r   rm   F)rn   ro   rp   )r]   )	rs   rw   rv   r  r   rx   ry   r  r  )r  r]   r   r   r   _orig_height
orig_widthr   r   r
  n_images	h_patches	w_patchesdims_virtuals                  rN   process_vision_for_patchesr    s<   B zz||q!!!$$ ^^Aq!Q''F %+L!Aq+z"D'/# # #M< ]\*	  F ^^Aq!Q''F "&))F f<<<G )0%HiA !## 
Iy!!119@S3ST  L  rP   c                  8    e Zd ZU ded<   ded<   ded<   ded<   dS )IsaacImageProcessorKwargsra   r]   r   r   r   N)r   r   r   __annotations__ rP   rN   r  r  h  s?         OOOrP   r  F)totalc                  8    e Zd ZdZdZdZdZeZddgZ	d Z
ddZdS )IsaacImageProcessor   i      rl   pixel_valuesimage_grid_thwc                    |                     d| j                  | _        |                     d| j                  | _        |                     d| j                  | _        |                     dd          | _        d S )Nr]   vision_max_num_patchesvision_min_num_patchesr   rl   )popr]   r   r$  r   r%  r   rf   kwargss     rN   rZ   zIsaacImageProcessor.__init__x  sv     **\4?CC&,jj$d&:'
 '
# '-jj$d&:'
 '
# $*::.CQ#G#G   rP   r  list[torch.Tensor]return_tensorsstr | TensorType | Noner(  !Unpack[IsaacImageProcessorKwargs]rA   r   c                   g }g }|D ]}t          |          }t          || j        | j        | j        | j                  \  }}	|                    d          }|j        d         |j        d         |j        d         }}}
|
|z  }|                    ||          }d|
|g}t          j
        |                              d          }|                    |           |                    |           |r-t          j        |d          }t          j        |d          }n*t          j        dd          }t          j        dd          }t          ||d	|
          S )zEPreprocess images into format compatibile with vLLM input processing.)r]   r   r   r   r;   rk   r   rr   r   r!  r"  )datatensor_type)r   r  r]   r$  r%  r   rw   r  ru   rE   rK   r{   r|   emptyr   )rf   r  r*  r(  all_pixel_valuesall_image_gridsr   image_tensorr
  r  hpwprs   current_num_patchesr!  	dims_realr"  final_pixel_valuesfinal_image_gridss                      rN   
preprocesszIsaacImageProcessor.preprocess  s    02.0 	3 	3E,U33L$>? $ ; $ ;$($<% % %!G\ ''**G!-+W]2->b@QCB"$r'"??+>DDL BI"\)44>>qAAN##L111"">2222 	2!&+;!C!C!C %	/q A A A!&Q!2!2 %Aq 1 1 2"3  '
 
 
 	
rP   N)r  r)  r*  r+  r(  r,  rA   r   )r   r   r   r]   r   r   r   r  valid_kwargsmodel_input_namesrZ   r=  r  rP   rN   r  r  o  s`        JOO,L')9:H H H2
 2
 2
 2
 2
 2
rP   r  c                  2    e Zd ZdZddZdddZ	 	 dddZdS )IsaacProcessorz4Processor wrapper (tokenizer + IsaacImageProcessor).Nc                v    |                     dd          | _        |pt          |          | _        || _        d S )Nimage_token<image>)r&  rC  r  image_processorr   )rf   rE  r   r(  s       rN   rZ   zIsaacProcessor.__init__  s9    !::mY??.M2Ef2M2M"rP   rA   r   c                   i }| | j         j        |fi |}|d         }|                    |           |t          |t                    s|g}|                                }| j         j        dz  }d}t          t          |                    D ]}	| j	        ||	         v rY||         
                                |z  }
||	                             | j	        d|
z  d          ||	<   |dz  }| j	        ||	         v Y||	                             dd          ||	<   |!|                     | j        |fi |           t          |          S )Nr"  rl   r   z<|placeholder|>r;   <|image_pad|>)rE  r=  update
isinstancelistr   r   rangerG   rC  r   replacer   r   )rf   textr  r(  resultimage_inputsr"  merge_lengthindexinum_image_tokenss              rN   __call__zIsaacProcessor.__call__  s{   :4/:6LLVLLL)*:;NMM,'''!$-- " 6Dyy{{#3GJs4yy)) R RA*d1g55+9%+@+E+E+G+G<+W("&q'// ,.?BR.RTU# #Q 
 *d1g55 #1goo.?QQDGGMM.$.8888999F###rP   Fmessageslist[dict[str, Any]]tokenizebooladd_generation_promptr	   c                .   g }|D ]}d|v rt          |d         t                    rg }|d         D ]x}|                    d          dk    r*|                    |                    dd                     E|                    d          dk    r|                    | j                   y|                    dd          d                    |          d}	|                    |	           |                    |            | j        j        |f||d	|S )
NcontenttyperM   r   roleuser)r^  r[  )rW  rY  )rI  rJ  getr{   rC  joinr   apply_chat_template)
rf   rU  rW  rY  r(  processed_messagesmessage
text_partscontent_itemprocessed_messages
             rN   rb  z"IsaacProcessor.apply_chat_template  sZ      	3 	3GG##
793Et(L(L#
$+I$6 < <L#''//699")),*:*:62*F*FGGGG%))&11W<<"))$*:;;; $KK77!wwz22% %! #))*;<<<< #))'22221t~1
"7
 
 	
 
 	
rP   NN)rA   r   )FF)rU  rV  rW  rX  rY  rX  rA   r	   )r   r   r   __doc__rZ   rT  rb  r  rP   rN   rA  rA    si        >># # # #
$ $ $ $ $@ &+	#
 #
 #
 #
 #
 #
 #
rP   rA  c                  D    e Zd ZddZddZd ZddZdd
ZddZddZ	dS )IsaacProcessingInforA   r7   c                   t          | j        d          r| j                                        }t          t	          |dd           t	          |dd          t	          |dd          t	          |dd           t	          |dd	          t	          |d
d          t	          |dd          t	          |dd                     S t                      S )Nget_hf_configvision_configvideo_patch_sizer  r$  r   r%  r   r;   max_sequence_lengthi @  r   rD  vision_attn_implementation)rn  vision_patch_sizer$  r%  r   rp  r   rq  )hasattrctxrm  r7   getattr)rf   original_configs     rN   rm  z!IsaacProcessingInfo.get_hf_config  s    48_-- 	"h4466O%oMM")/;Mr"R"R'.#%=s( ( (/#%=t( ( %,O=RTU$V$V$+#%:E% % %_niPP+2#%A4, ,   & }}rP   rA  c                    |                                  }d|j        i}|                    |            | j        j        t
          fi |S )NrC  )rm  r   rH  rt  get_hf_processorrA  )rf   r(  	hf_configprocessor_kwargss       rN   rx  z$IsaacProcessingInfo.get_hf_processor  sW    &&((	91
 	'''(tx(LL;KLLLrP   c                    | j         j        S r   )rt  r   rf   s    rN   r4   z!IsaacProcessingInfo.get_tokenizer$  s    x!!rP   r+   c                    |                                  }t          dd|j        |j        |j        |j                  \  }}t          ||          S )Ni r  )r   r   )rm  r   ro  r$  r%  r   r+   )rf   ry  r   r   s       rN   !get_image_size_with_most_featuresz5IsaacProcessingInfo.get_image_size_with_most_features'  s_    &&((	&H&,%< ) ='
 '
 '
#| |MBBBBrP   r  c                &     | j         di |j        S Nr  )rx  rE  r'  s     rN   get_image_processorz'IsaacProcessingInfo.get_image_processor4  s    $t$..v..>>rP   Mapping[str, int | None]c                
    dd iS )Nr   r  r|  s    rN   get_supported_mm_limitsz+IsaacProcessingInfo.get_supported_mm_limits7  s    rP   r   ra   	mm_countsMapping[str, int]c                V    |                                  }|j        |j        dz  z  }d|iS )Nrl   r   )rm  r$  r   )rf   r   r  ry  num_vision_tokenss        rN   get_mm_max_tokens_per_itemz.IsaacProcessingInfo.get_mm_max_tokens_per_item:  s<    
 &&((	%<)1,
 *++rP   N)rA   r7   )rA   rA  )rA   r+   )rA   r  )rA   r  )r   ra   r  r  rA   r  )
r   r   r   rm  rx  r4   r~  r  r  r  r  rP   rN   rk  rk    s           2M M M M" " "C C C C? ? ? ?   	, 	, 	, 	, 	, 	,rP   rk  c                  "    e Zd ZddZ	 dddZdS )IsaacDummyInputsBuilderr  r  rA   r   c                x    |                     dd          }| j                                        }|j        }||z  S )Nr   r   )r`  inforx  rC  )rf   r  r  hf_processorrC  s        rN   get_dummy_textz&IsaacDummyInputsBuilder.get_dummy_textG  s;    ]]7A..
y1133'3Z''rP   Nr   ra   
mm_optionsMapping[str] | Noner'   c                    |                     dd          }| j                                        \  }}|r|                     d          nd }d|                     ||||          iS )Nr   r   )r   r   r  	overrides)r`  r  r~  _get_dummy_images)rf   r   r  r  r  r   r   image_overridess           rN   get_dummy_mm_dataz)IsaacDummyInputsBuilder.get_dummy_mm_dataO  s|     ]]7A..
&*i&Q&Q&S&S#m5?I*..111T T++"$%)	 ,  
 	
rP   )r  r  rA   r   r   )r   ra   r  r  r  r  rA   r'   )r   r   r   r  r  r  rP   rN   r  r  F  sF        ( ( ( ( +/	
 
 
 
 
 
 
rP   r  c                  (    e Zd ZU dZded<   ded<   dS )IsaacImagePixelInputsaR  
    Schema for validating Isaac image inputs.

    Dimensions:
        - np: Number of patches
        - d: Patch dimension
        - ni: Number of images

    The schema enforces:
        - pixel_values must be 2D: (num_patches, patch_dim)
        - image_grid_thw must be 2D: (num_images, 3)
          where 3 represents [T, H, W]
    z/Annotated[torch.Tensor, TensorShape('np', 'd')]r!  z-Annotated[torch.Tensor, TensorShape('ni', 3)]r"  N)r   r   r   ri  r  r  rP   rN   r  r  d  sH             
     rP   r  c                      e Zd ZddZddZdS )IsaacMultiModalProcessor	hf_inputsr   hf_processor_mm_kwargsMapping[str, object]rA   #Mapping[str, MultiModalFieldConfig]c                    |                     dt          j        d                    }|                    d          }t	          j        d|          t	          j        d          dS )Nr"  )r   r   rk   r   r0  )r`  rE   r3  r   r)   flat_from_sizesbatched)rf   r  r  r"  image_grid_sizess        rN   _get_mm_fields_configz.IsaacMultiModalProcessor._get_mm_fields_config  sm     #'7V9L9LMM)..r22 2A)  4;GDD	
 
 	
rP   mm_itemsr,   Mapping[str, Any]out_mm_kwargsr*   Sequence[PromptUpdate]c                     | j         j        d	i |}t          |dd          }|dz  d
fd}t          dd|          gS )Nr   rl   item_idxra   c                    d         |          }|d         j         }t          |t          j                  sJ t	          |                                          z  }d|z  }t          j        |d          S )Nr   r"  rG  )r1  rI  rE   Tensorra   r   r2   select_text)r  out_itemgrid_thwfeature_size	repl_fullrP  r  s        rN   get_replacement_isaaczKIsaacMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_isaac  so    $W-h7H 016Hh55555x}}//<?L',6I&29oNNNrP   r   rD  )modalitytargetreplacementr  )r  ra   )r  r  ru  r0   )rf   r  r  r  rE  r   r  rP  s      `   @rN   _get_prompt_updatesz,IsaacMultiModalProcessor._get_prompt_updates  s     8$)7QQ:PQQ%o7LaPP*A-	O 	O 	O 	O 	O 	O 	O   1  
 	
rP   N)r  r   r  r  rA   r  )r  r,   r  r  r  r*   rA   r  )r   r   r   r  r  r  rP   rN   r  r  ~  s<        
 
 
 
 
 
 
 
 
 
rP   r  c                  :     e Zd Z	 dddd fdZddZddZ xZS )Siglip2VisionAttentionNr]  prefixrS   r8   quant_configQuantizationConfig | Noner  r   rA   Nonec          
     h   t                                                       t                      }|rdnt          j                    | _        t          j                    | _        t          j	        |j
        |j                  | _        t          j	        |j        | j                  | _        t          |j
        | j        |j        |j        d|| d|          | _        t!          |j
        |j
        || d|          | _        t%          | j        | j        | j        dz  | d	          | _        d S )
Nr;   Tz	.qkv_proj)r[   	head_sizetotal_num_headstotal_num_kv_headsbiasr  r  
disable_tpz	.out_proj)rU   rV   r  r  r  g      z.attn)	num_headsr  r   r  )rY   rZ   r<   r   $get_tensor_model_parallel_world_sizetp_sizeget_tensor_model_parallel_ranktp_rank
dist_utilsdivider[   num_attention_headshidden_size_per_attention_head!num_attention_heads_per_partitionr   qkv_projr   out_projr   attn)rf   rS   r  r  use_data_parallelrg   s        rN   rZ   zSiglip2VisionAttention.__init__  sV    	466 !GAADFF 	
 &DFF.8.? :/
 /
+ 2<1B&2
 2
. **9"6%9%'''(	
 	
 	
 *)*%'''(
 
 
 '<95t;###	
 
 
			rP   qkvr>   tuple[torch.Tensor, ...]c                    |j         \  }}}|                    dd          \  }}}||| j        | j        ffd|||fD             \  }}}|||fS )Nr   rl   rr   c              3  ,   K   | ]} |j          V  d S r   )r   ).0r   	new_shapes     rN   	<genexpr>z3Siglip2VisionAttention.split_qkv.<locals>.<genexpr>  s,      99!6169%999999rP   )r  chunkr  r  )	rf   r  r   bsr  qkvr  s	           @rN   	split_qkvz Siglip2VisionAttention.split_qkv  sv    Q))A1)%%1a2/	
	 :9991ay9991a!QwrP   hidden_statesrL   rM   r   c                  |j         \  }}}|dk    rt          d          t          |d          }|                     |          \  }}|                     |          \  }}}	d |||	fD             \  }}}	|                     |||	||          }
t          |
d                                          }
|                     |
          \  }}t          |d          }|S )Nr;   z5packed variable-length attention expects batch_size=1zb s d -> s b dc              3  6   K   | ]}t          |d           V  dS )zs b h d -> b s h dNr
   )r  ts     rN   r  z1Siglip2VisionAttention.forward.<locals>.<genexpr>  s-      II!9Q 455IIIIIIrP   )querykeyvaluerL   rM   zb s h d -> s b (h d)zs b d -> b s d)r  r   r   r  r  r  r   r  )rf   r  rL   rM   
batch_sizer  r   r  r  r  context_layeroutputs               rN   r   zSiglip2VisionAttention.forward  s     ).
Aq??TUUUm%566}}Q1..##1aII1ayIII1a		!! " 
 
 "-1GHHSSUUMM-00	6#344rP   r   rS   r8   r  r  r  r   rA   r  )r  r>   rA   r  r  r>   rL   r>   rM   r   rA   r>   )r   r   r   rZ   r  r   r   r   s   @rN   r  r    s         37.

 .
 .
 .
 .
 .
 .
 .
 .
`
 
 
 
       rP   r  c                  2     e Zd Z	 dddd fdZddZ xZS )Siglip2EncoderLayerNr]  r  rS   r8   r  r  r  r   rA   r  c               \   t                                                       |j        | _        t	          j        | j        |j                  | _        t          ||| d          | _	        t	          j        | j        |j                  | _
        t          ||| d          | _        d S )Nr   z
.self_attnr  r  z.mlp)rY   rZ   r[   r\   rc   	LayerNormlayer_norm_epslayer_norm1r  	self_attnlayer_norm2r!   mlprf   rS   r  r  rg   s       rN   rZ   zSiglip2EncoderLayer.__init__  s     	+<F<QRRR/%(((
 
 

 <F<QRRR%???
 
 
rP   r  r>   rL   rM   r   c                   |}|                      |          }|                     |||          }||z   }|}|                     |          }|                     |          }||z   }|S )N)r  rL   rM   )r  r  r  r  )rf   r  rL   rM   residuals        rN   r   zSiglip2EncoderLayer.forward  s     !((77'!! ' 
 

 !=0 ((77// =0rP   r   r  r  r   r   r   rZ   r   r   r   s   @rN   r  r    sj         37

 
 
 
 
 
 
 
 
,       rP   r  c                  :     e Zd Z	 dddd fdZdddddZ xZS )Siglip2EncoderNr]  r  rS   r8   r  r  r  r   rA   r  c                   t                                                       | _        t          j        fdt          j                  D                       | _        d S )Nc           	     >    g | ]}t           d |           S )z.layers.r  )r  )r  	layer_idxrS   r  r  s     rN   
<listcomp>z+Siglip2Encoder.__init__.<locals>.<listcomp>A  sQ         $!-$99i99    rP   )rY   rZ   rS   rc   
ModuleListrK  num_hidden_layerslayersr  s    ```rN   rZ   zSiglip2Encoder.__init__7  sy     	m      "'v'?!@!@  	
 	
rP   rL   rM   inputs_embedsr>   rL   r   rM   c               :    |}| j         D ]} ||||          }|S )Nr  )r  )rf   r  rL   rM   r  encoder_layers         rN   r   zSiglip2Encoder.forwardK  sC     &![ 	 	M)M%%  MM
 rP   r   r  )r  r>   rL   r   rM   r   rA   r>   r  r   s   @rN   r  r  6  s|         37

 
 
 
 
 
 
 
 
0 +/*.         rP   r  c                  6     e Zd Z	 	 dd fd	ZddZddZ xZS )Siglip2VisionTransformerNr]  rS   r8   r  r  r  r   c                &   t                                                       || _        || _        |j        }t          |          | _        |j        | _        t          ||| d          | _	        t          j        ||j                  | _        d S )Nz.encoderr  r  )rY   rZ   rS   r  r[   rR   r   pixel_shuffle_scale_factorr  encoderrc   r  r  post_layernorm)rf   rS   r  r  r\   rg   s        rN   rZ   z!Siglip2VisionTransformer.__init__]  s     	(&	;FCC*0*K'%%&&&
 
 

 !l9&:OPPPrP   rh   rB   rA   r>   c                   |\  }}t          j        |d          }|                     |||f          }|                    d          }t	          ||j                  \  }}|                     |||          }|                     |          }| j        dk    rt          ||| j                  }|
                    d          }|S )z
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width)
            of the input images.
        rk   rr   r   )r  rL   rM   r;   )r   r   r   )rE   r   r   rw   rO   r?   r
  r  r	  r   r   )rf   rh   r   r   r=   r  rL   rM   s           rN   r   z Siglip2VisionTransformer.forwardq  s     $6 [J{333	 i(MNN &//22!>}+"
 "

J '!! % 
 

 ++M::*Q..0'!<  M &--a00 rP   weights"Iterable[tuple[str, torch.Tensor]]set[str]c                |   g d}t          |                                           }t                      }|D ]\  }}|D ]>\  }}}	||vr|                    ||          }||         }
|
j        } ||
||	            n*||         }
t          |
dt                    } ||
|           |                    |           |S )N))r  q_projr  )r  k_projr  )r  v_projr  weight_loader)dictnamed_parameterssetrL  r  ru  r   add)rf   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr  s               rN   load_weightsz%Siglip2VisionTransformer.load_weights  s    "
 "
 "
 4002233"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####rP   Nr]  )rS   r8   r  r  r  r   rh   rB   rA   r>   r  r  rA   r  )r   r   r   rZ   r   r"  r   r   s   @rN   r  r  \  sz         37	Q Q Q Q Q Q Q(( ( ( (T       rP   r  c                  .     e Zd Z	 	 dd fdZddZ xZS )IsaacVisionEmbeddingNr]  
vision_cfgr8   
hidden_dimra   
output_dimr  r  r  r   c           
     n   t                                                       t          ||t          |d                    | _        t          |d|z  d|t          |d          d          | _        t          j                    | _	        t          d|z  |d|t          |d          d          | _        d S )N0r  r  F1)r  r  r  rW   3)rY   rZ   r  r%   transformerr   
linear_fc1rc   SiLUactr   
linear_fc2)rf   r(  r)  r*  r  r  rg   s         rN   rZ   zIsaacVisionEmbedding.__init__  s     	3%,,
 
 

 /
N%,,
 
 
 799+
N%,,
 
 
rP   rh   rB   rA   r>   c                    |                      |          }|                     |          }|                     |          }|                     |          }|S r   )r/  r0  r2  r3  )rf   rh   r  s      rN   r   zIsaacVisionEmbedding.forward  sQ     (();<<66//66rP   r#  )
r(  r8   r)  ra   r*  ra   r  r  r  r   r$  r  r   s   @rN   r'  r'    s_         37
 
 
 
 
 
 
@       rP   r'  )r  dummy_inputsc                       e Zd Zg dddgdZdZ eddddd	d
ddddd
          Zed@d            ZdddA fdZ	dBd!Z
dCd#ZdDd'ZdEd+ZdFd-Z	 	 dGdHd7ZdId9ZdJd=ZdKd?Z xZS )LIsaacForConditionalGeneration)r  r  r  	gate_projup_proj)r  gate_up_projTzlanguage_model.lm_head.zlanguage_model.model.zvision_embedding.transformerzvision_embedding.linear_fc1zvision_embedding.actvision_embedding.linear_fc2zvision_embedding.)
zlm_head.zmodel.text_model.lm_head.zmodel.text_model.zmodel.vision_embedding.0zmodel.vision_embedding.1zmodel.vision_embedding.2zmodel.vision_embedding.3zmodel.vision_embedding.zmodel.lm_head.zmodel.)orig_to_new_prefixr  r   rR  ra   rA   
str | Nonec                N    |                     d          rdS t          d          )Nr   rD  z Only image modality is supported)
startswithr   )clsr  rR  s      rN   get_placeholder_strz1IsaacForConditionalGeneration.get_placeholder_str  s,    w'' 	9;<<<rP   r   r  vllm_configr   r  c               N   t                                                       |j        j        }|j        }|| _        |j        }|dz  |dz  |dz  g}t          |j        |j                  | _	        | j	        |_
        t          |dd           }|t          |t                    s|n|}t          |dd           }	|	||u rt          |dd           }	t          |           |j        }
||
d<   |	 d|	v r|
                    d|	d                    |
|_        |                     |          5  t%          |dgt'          |d	          
          | _        d d d            n# 1 swxY w Y   | j        j        | _        |j        }|t/          d          |j        |j        nt          |dd           }|||_        |j        |j        dz  z  }|                     |d          5  t;          |||j        |t'          |d                    | _        d d d            d S # 1 swxY w Y   d S )Nr     text_configrope_scaling_rope_scalingmrope_sectionmrope_interleavedQwen3ForCausalLMlanguage_model)rB  architecturesr  z,IsaacConfig should always have vision_config_attn_implementationrl   r   vision_embedding)r(  r)  r*  r  r  )rY   rZ   r   ry  r  rS   head_dimr   r   vision_token_idimage_token_idru  rI  r  r6   rope_parameters
setdefault_mark_language_modelr$   r%   rK  make_empty_intermediate_tensorsrn  r   rq  rM  r[   r	  _mark_tower_modelr'  rN  )rf   rB  r  rS   r  rO  calculated_mrope_sectiontext_cfg
target_cfgrF  rR  r(  	attn_implr)  rg   s                 rN   rZ   z&IsaacForConditionalGeneration.__init__  s   )6@"/?MMM$
   8$f&9 
  
 !% 46=$77 #Jx,F,F# H 	 z>4@@J&$8$8"6?DAALj)))$4+C(#(;|(K(K&&#\2E%F   &5
"&&{33 	 	"<'12#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	, )
KLLL 0< --!7>> 	
  .7J++z/TVW/WX
##K99 	 	$8%%!-)#F,>??% % %D!	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s$   )'EE #E  -HH!Hinput_tokens	list[int]mm_featureslist[MultiModalFeatureSpec]Iterator[tuple[int, int, int]]c              #  B  K   | j         j        j        }t          |d           D ]x}|j        j        }|j        dk    rI|j        d         j                                        \  }}}|dk    sJ d|             |||z  ||z  fV  bt          d|j                   d S )Nc                    | j         j        S r   )mm_positionoffset)fs    rN   <lambda>z?IsaacForConditionalGeneration.iter_mm_grid_hw.<locals>.<lambda>Z  s    AM<P rP   )r  r   r"  r;   zImage must have 1 frame, got zUnsupported modality: )
rS   rn  r	  sortedrb  rc  r  r1  r   r   )	rf   r[  r]  spatial_merge_size
mm_featurerc  r  r   r   s	            rN   iter_mm_grid_hwz-IsaacForConditionalGeneration.iter_mm_grid_hwV  s       "[6Q 2P2PQQQ 	Q 	QJ+2F"g--$/*:;@GGII1aAvvvBqBBvvva#55q<N7NNNNNN !O*:M!O!OPPP	Q 	QrP   tuple[torch.Tensor, int]c                   g }d}|                      ||          D ]\  }}}||z
  }t          |          dk    r|d                                         dz   nd}	|                    t	          j        t	          j        |          d|f          |	z              t	          j        d||f                              dd          }
|
dd d f         |z   |	z   |
dd d f<   |                    |
           |||z  z   }|t          |          k     rwt          |          dk    r|d         d         dz   nd}	t          |          |z
  }|                    t	          j        t	          j        |          d|f          |	z              t	          j	        |d                              dd          }|                                dz   t          |          z
  
                                }t          j        |          |fS )Nr   rk   r;   r   )r   rk   )axis)ri  rG   rJ   r{   r   broadcast_tor   indicesru   concatenateitemrE   r   )rf   r[  r]  llm_pos_ids_liststrc  
llm_grid_h
llm_grid_wtext_lenst_idxgrid_indicesllm_positionsmrope_position_deltas                rN   get_mrope_input_positionsz7IsaacForConditionalGeneration.get_mrope_input_positionsc  s&   
 .2.B.B+/
 /
 	2 	2*FJ
 {H7:;K7L7Lq7P7P%b)--//!33VWF##	( 3 3a]CCfL   :q*j&ABBJJ1bQQL!-ad!3h!>!GLAAA##L111*z11BBL!!!!8;<L8M8MPQ8Q8Q%b)%0144WXF<((2-H##	( 3 3a]CCfL   '7a@@@HHBOO - 1 1 3 3a 7#l:K:K KQQSS..0DDDrP   r(  objectIsaacImagePixelInputs | Nonec                    |                     d          }|                     d          }||d S t          ||          S )Nr!  r"  r0  )r`  r  )rf   r(  r!  r"  s       rN   _parse_and_validate_image_inputz=IsaacForConditionalGeneration._parse_and_validate_image_input  sV     zz.11$455>#94 %%)
 
 
 	
rP   image_inputr  r  c                N   |d         }|d         }|                                 dk    rdS t          | j                                                  j        }| j        j        j        j        }|	                    ||          }|d d ddf         	                    |t          j                  }|                     ||f          }| j        j        j        }|                    d	          ||z  z  }	t!          |                    |	                                                    S )
Nr!  r"  r   r  r   r;   r   r   rk   )numelnextrK  
parametersr?   rN  r0  rt   rD   r   rE   rH   rS   rn  r	  r   tuplesplitr   )
rf   r  r!  r"  r?   rD   spatial_gridsvision_embeddings
merge_sizesizess
             rN   _process_image_inputz2IsaacForConditionalGeneration._process_image_input  s    #>2$%561$$2d)446677>%07=#fEBB&qqq!A#v.11&1LL 11<2OPP[.I
""2&&:
+BC&,,U\\^^<<===rP   MultiModalEmbeddings | Nonec                N     | j         di |}|dS |                     |          S r  )r~  r  )rf   r(  r  s      rN   embed_multimodalz.IsaacForConditionalGeneration.embed_multimodal  s9    :d:DDVDD2((555rP   N	input_idsr>   	positionsintermediate_tensorsIntermediateTensors | Noner  r   "torch.Tensor | IntermediateTensorsc                &     | j         d||||d|S )N)r  r  r  r  r  )rK  )rf   r  r  r  r  r(  s         rN   r   z%IsaacForConditionalGeneration.forward  s;     #t" 
!5'	
 

 
 
 	
rP   r  c                6    | j                             |          S r   )rK  compute_logits)rf   r  s     rN   r  z,IsaacForConditionalGeneration.compute_logits  s    "11-@@@rP   r  r  r  c                X    t          |           }|                    || j                  S )N)mapper)r"   r"  hf_to_vllm_mapper)rf   r  loaders      rN   r"  z*IsaacForConditionalGeneration.load_weights  s+    "4((""743I"JJJrP   r    c                0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        rK  r;  rN  )rK  	connectortower_model)r    from_string_fieldr|  s    rN   get_mm_mappingz,IsaacForConditionalGeneration.get_mm_mapping  s'     /+3*
 
 
 	
rP   )r  r   rR  ra   rA   r=  )rB  r   r  r   )r[  r\  r]  r^  rA   r_  )r[  r\  r]  r^  rA   rj  )r(  r{  rA   r|  )r  r  rA   r  )r(  r{  rA   r  rh  )r  r>   r  r>   r  r  r  r   r(  r{  rA   r  )r  r>   rA   r   r%  )rA   r    )r   r   r   packed_modules_mappingsupports_encoder_tp_datar#   r  classmethodrA  rZ   ri  rz  r~  r  r  r   r  r"  r  r   r   s   @rN   r7  r7    s       
 
 
 

 
  $ &1)B!8(F(E(>(E':7-
 
   = = = [= BI E E E E E E E ENQ Q Q QE E E EB
 
 
 
> > > >&6 6 6 6 <@-1
 
 
 
 
 A A A AK K K K
 
 
 
 
 
 
 
rP   r7  )r=   r>   r?   r@   rA   rB   )r;   N)
r=   r>   r   r>   r   ra   r?   r   rA   r>   )r;   )r   r>   r   r>   r   ra   rA   r>   )r   r   rA   r   )r   r   rA   r   )Nr   r;   )r   ra   r   ra   r]   ra   r   ra   r   r   r   r   r   ra   rA   r   )r   r   r   r   rA   ra   )r   r>   r   r   rA   r>   )r   r>   r]   ra   rA   r>   )Nr;   )r  r>   r]   ra   r   ra   r   r   r   ra   rA   r  )
__future__r   r   collections.abcr   r   r   r   typingr   r	   numpyr   	PIL.ImagePILrE   torch.nnrc   torch.nn.functional
functionalrx   einopsr   #transformers.image_processing_utilsr   transformers.tokenization_utilsr   typing_extensionsr   r   vllm.configr   vllm.config.modelr   vllm.distributedr   r   r  9vllm.model_executor.layers.attention.mm_encoder_attentionr   !vllm.model_executor.layers.linearr   r   r   r   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   %vllm.model_executor.models.interfacesr   r   r   r   r   )vllm.model_executor.models.module_mappingr    !vllm.model_executor.models.siglipr!    vllm.model_executor.models.utilsr"   r#   r$   r%   vllm.multimodalr&   vllm.multimodal.inputsr'   r(   r)   r*   vllm.multimodal.parser+   r,   vllm.multimodal.processingr-   r.   r/   r0   r1   r2   vllm.sequencer3   vllm.tokenizersr4   vllm.tokenizers.hfr5   vllm.transformers_utils.configr6   vllm.transformers_utils.configsr7   r8   vllm.utils.tensor_schemar9   r:   visionr<   rO   ModulerR   r   r   r   VISION_MEAN
VISION_STDVISION_SCALEr   r   r   rK   float32r   r   r   r   r  r  r  r  r  rA  rk  r  r  r  r  r  r  r  r'  register_processorr7  r  rP   rN   <module>r     s;   # " " " " "  A A A A A A A A A A A A ! ! ! ! ! ! ! !                               < < < < < < 6 6 6 6 6 6 / / / / / / / / " " " " " " ) ) ) ) ) ) + + + + + + 0 0 0 0 0 0 X X X X X X            G F F F F F                   E D D D D D 7 7 7 7 7 7            0 / / / / /            A @ @ @ @ @ @ @                . - - - - - ) ) ) ) ) ) 3 3 3 3 3 3 @ @ @ @ @ @        ? > > > > > > > , , , , , ," " " "P P P P P	 P P Pl "&	? ? ? ? ?J = = = = =H 
 
   &! ! ! !  #' f+ f+ f+ f+ f+R u|Ku}===BB1aBOOel:U];;;@@Aq"MM
G 
G 
G 
G      8& & & &Z #' K! K! K! K! K!\    	    E
 E
 E
 E
 E
 E
 E
 E
PH
 H
 H
 H
 H
 H
 H
 H
VA, A, A, A, A,, A, A, A,H
 
 
 
 
45HI 
 
 
<    L   4+
 +
 +
 +
 +
6 +
 +
 +
\W W W W WRY W W Wt- - - - -") - - -`# # # # #RY # # #LX X X X Xry X X Xv( ( ( ( (29 ( ( (V ('	(  
c
 c
 c
 c
 c
I!<]c
 c
 
c
 c
 c
rP   