
    .`iu              
          U d Z ddlZddlmZmZmZmZmZ ddlm	Z	 ddl
mZmZmZmZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl&m)Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z= ddl>m?Z? ddl@mAZA ddlBmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZO ddlPmQZQmRZRmSZSmTZTmUZU ddlVmWZW ddlXmYZY dd lZm[Z[m\Z\ dd!l]m^Z^ d"d#l_m`Z`maZambZbmcZcmdZd d"d$l)meZemfZfmgZgmhZh d"d%limjZjmkZkmlZl  e,em          Znd&Zo G d' d(e[          Zp G d) d*e[          Zqepeqz  Zreesd+<    G d, d-e[          Zt G d. d/e[          Zueteuz  Zveesd0<    G d1 d2ejw                  Zx G d3 d4ejw                  Zy G d5 d6ejw                  Zz G d7 d8ejw                  Z{ G d9 d:ejw                  Z| G d; d<ejw                  Z}d=e~d>eeeej        f         geeeGf         f         fd?Z G d@ dAeO          Z G dB dCeS          Z G dD dEeQe                   Z G dF dGeRe                   Z eAj        eeeH           G dI dJejw        eceaedeb                      Z G dK dLe          Z G dM dNe          Z G dO dPe          Z G dQ dRe          Z eAj        eeeH           G dS dTe                      ZdS )UzBInference-only Qwen2-VL model compatible with HuggingFace weights.    N)CallableIterableIteratorMappingSequence)partial)	AnnotatedAnyLiteral	TypeAlias	rearrange)BatchFeature)Qwen2VLImageProcessorQwen2VLProcessor)Qwen2VLConfigQwen2VLVisionConfig)smart_resize)Qwen2VLVideoProcessor)
VllmConfig)BaseDummyOptions)parallel_state tensor_model_parallel_all_gather)utils)init_logger)	QuickGELU)MMEncoderAttention)Conv3dLayer)ColumnParallelLinearRowParallelLinear)QuantizationConfig)get_rope)ApplyRotaryEmb)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)	ImageItemModalityDataMultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems	VideoItem)DictEmbeddingItems	ImageSizeModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)TokenizerLike)TensorSchemaTensorShape)AttentionBackendEnum   )MultiModalEmbeddingsSupportsLoRASupportsMRoPESupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)get_vit_attn_backendis_vit_use_data_parallel!run_dp_sharded_mrope_vision_model   c                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
Qwen2VLImagePixelInputsa  
    Dimensions:
        - np: The total number of patches over each image over each prompt in
              the batch
        - ni: Number of images
        - cps: Number of channels * patch_size * patch_size

    Historical context:
        - pixel_values shape: (num_patches, num_channels * patch_size *
          patch_size)
        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
          format
    pixel_valuestypenpcpsni   image_grid_thwN
__name__
__module____qualname____doc__r   __annotations__r	   torchTensorr;        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/qwen2_vl.pyrL   rL   y   s           .
!!!!D%  	"   
 D!	     r]   rL   c                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
Qwen2VLImageEmbeddingInputsa  
    Dimensions:
        - nf: Number of image features
        - hs: Hidden size
        - ni: Number of images

    Historical context:
        - image_embeds shape: (num_image_features, hidden_size)
        - num_image_features varies based on the number and resolution of the
          images.
        - hidden_size must match the hidden size of language model backbone.
        - image_grid_thw shape: (num_images, 3) in (grid_t, grid_h, grid_w)
          format
    image_embedsrN   nfhsrQ   rR   rS   NrT   r\   r]   r^   r`   r`                 .
!!!!D$	!   
 D!	     r]   r`   Qwen2VLImageInputsc                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
Qwen2VLVideoPixelInputsa  
    Dimensions:
        - np: The total number of patches over each video over each prompt in
              the batch
        - ctps: Number of channels * temporal_patch_size * patch_size *
          patch_size
        - nv: Number of videos

    Historical context:
        - pixel_values_videos shape: (num_patches, num_channels *
          temporal_patch_size * patch_size * patch_size)
        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
          format
    pixel_values_videosrN   rO   ctpsnvrR   video_grid_thwNrT   r\   r]   r^   rg   rg      s           '
(((("D&!!	#   
 D!	     r]   rg   c                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
Qwen2VLVideoEmbeddingInputsa  
    Dimensions:
        - nf: Number of video features
        - hs: Hidden size
        - nv: Number of videos

    Historical context:
        - video_embeds shape: (num_video_features, hidden_size)
        - num_video_features varies based on the number and resolution of the
          videos.
        - hidden_size must match the hidden size of language model backbone.
        - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
          format
    video_embedsrN   rb   rc   rj   rR   rk   NrT   r\   r]   r^   rm   rm      rd   r]   rm   Qwen2VLVideoInputsc                   |     e Zd Zeddfdededeej                 dedz  de	f
 fdZ
d	ej        d
ej        fdZ xZS )Qwen2VisionMLPN in_featureshidden_features	act_layerquant_configprefixc                     t                                                       t                      }t          |||| d|          | _         |            | _        t          |||| d|          | _        d S )Nz.fc1)rv   rw   
disable_tpz.fc2)super__init__rH   r   fc1actr    fc2)selfrs   rt   ru   rv   rw   use_data_parallel	__class__s          r^   r{   zQwen2VisionMLP.__init__   s     	466'%???(
 
 
 9;;$%???(
 
 
r]   xreturnc                     |                      |          \  }}|                     |          }|                     |          \  }}|S N)r|   r}   r~   )r   r   
x_parallel_s       r^   forwardzQwen2VisionMLP.forward  s@    
AXXj))
xx
##1r]   )rU   rV   rW   r   intrN   nnModuler!   strr{   rZ   r[   r   __classcell__r   s   @r^   rq   rq      s        
 &/26
 

 
 	?	

 )4/
 
 
 
 
 
 
4 %,        r]   rq   c                        e Zd Z	 	 ddededededz  deddf fd	Zd
ej        de	ej        df         fdZ
	 ddej        dej        dej        dej        dedz  dej        fdZ xZS )Qwen2VisionAttentionNrr   	embed_dim	num_headsprojection_sizerv   rw   r   c                 0   t                                                       t                      }|rdnt          j                    | _        t          j                    | _        t          j	        ||          | _
        t          j	        || j                  | _        t          |d|z  || d|          | _        t          |||| d|          | _        t!          | j        | j
        | j
        dz            | _        t%          d	          | _        d S )
Nr=   rR   z.qkv)
input_sizeoutput_sizerv   rw   ry   z.projg      )r   	head_sizescaleT)enforce_enable)rz   r{   rH   r   $get_tensor_model_parallel_world_sizetp_sizeget_tensor_model_parallel_ranktp_rank
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr   qkvr    projr   attnr#   apply_rotary_emb)r   r   r   r   rv   rw   r   r   s          r^   r{   zQwen2VisionAttention.__init__  s?    	466 !GAADFF 	
 &DFF.8.?Y/
 /
+ 2<1Bt|2
 2
. ( O+%???(
 
 
 &&!%###(
 
 
	 '<95t;
 
 
	 !/d C C Cr]   r   .c                   	 |j         \  }}}| j        dk    rt          |          }|                    dd          \  }}}| j        dk    rbt	          t
          j        | j                  } ||          | j                 } ||          | j                 } ||          | j                 }||| j        | j	        f		fd|||fD             \  }}}|||fS )Nr=   rR      dim)num_partitionsc              3   ,   K   | ]} |j          V  d S r   )view).0r   	new_shapes     r^   	<genexpr>z1Qwen2VisionAttention.split_qkv.<locals>.<genexpr>_  s,      99!6169%999999r]   )
shaper   r   chunkr   r   split_tensor_along_last_dimr   r   r   )
r   r   seq_lenbsr   qkvsplitterr   s
            @r^   	split_qkvzQwen2VisionAttention.split_qkvF  s    Q<!2377C ))A1)%%1a <!6t|  H DL)ADL)ADL)A 2/	
	 :9991ay9991a!Qwr]   r   
cu_seqlensrotary_pos_emb_cosrotary_pos_emb_sin
max_seqlenc                    |                      |          \  }}|                     |          \  }}}	d |||	fD             \  }}}	t          j        ||gd          }
|                     |
||          }t          j        |dd          \  }}|                     |||	||          }t          |d                                          }| 	                    |          \  }}|S )Nc              3   6   K   | ]}t          |d           V  dS )zs b ... -> b s ...Nr   )r   r   s     r^   r   z/Qwen2VisionAttention.forward.<locals>.<genexpr>p  s-      II!9Q 455IIIIIIr]   r   r   r   )querykeyvaluer   r   zb s h d -> s b (h d))
r   r   rZ   catr   r   r   r   
contiguousr   )r   r   r   r   r   r   r   r   r   r   	qk_concat
qk_rotatedcontext_layeroutputs                 r^   r   zQwen2VisionAttention.forwardb  s    xx{{1 ..##1aII1ayIII1a Iq!f!,,,	**
 


 {:qa0001		!! " 
 
 "-1GHHSSUUIIm,,	r]   )Nrr   r   )rU   rV   rW   r   r!   r   r{   rZ   r[   tupler   r   r   r   s   @r^   r   r     s.        37-D -D-D -D 	-D
 )4/-D -D 
-D -D -D -D -D -D^U\ eEL#4E.F    D "&$ $<$ L$ "L	$
 "L$ $J$ 
$ $ $ $ $ $ $ $r]   r   c                        e Zd Zedddfdedededeej                 de	egej        f         dz  de
dz  d	ed
df fdZ	 ddej        dej        dej        dej        dedz  d
ej        fdZ xZS )Qwen2VisionBlockNrr   r   r   	mlp_ratioru   
norm_layerrv   rw   r   c                 V   t                                                       |t          t          j        d          } ||          | _         ||          | _        t          ||z            }t          ||||| d          | _	        t          ||||| d          | _        d S )Nư>epsz.attn)r   r   r   rv   rw   z.mlp)ru   rv   rw   )rz   r{   r   r   	LayerNormnorm1norm2r   r   r   rq   mlp)
r   r   r   r   ru   r   rv   rw   mlp_hidden_dimr   s
            r^   r{   zQwen2VisionBlock.__init__  s     	 4888JZ__
Z__
S9_--(%###
 
 
	 "%???
 
 
r]   r   r   r   r   r   c                     ||                      |                     |          ||||          z   }||                     |                     |                    z   }|S )Nr   r   r   r   )r   r   r   r   )r   r   r   r   r   r   s         r^   r   zQwen2VisionBlock.forward  sc     		JJqMM!11!  
 
 
 A'''r]   r   )rU   rV   rW   r   r   floatrN   r   r   r   r!   r   r{   rZ   r[   r   r   r   s   @r^   r   r     s#        &/8<26
 

 
 	

 	?
 cUBI-.5
 )4/
 
 

 
 
 
 
 
L "& < L "L	
 "L $J 
       r]   r   c                   b     e Zd Z	 	 	 	 ddedededed	d
f
 fdZdej        d	ej        fdZ xZS )Qwen2VisionPatchEmbedrJ   r   rR     
patch_sizetemporal_patch_sizein_channelsr   r   Nc                     t                                                       || _        || _        || _        |||f}t          ||||d          | _        d S )NF)kernel_sizestridebias)rz   r{   r   r   r   r   r   )r   r   r   r   r   r   r   s         r^   r{   zQwen2VisionPatchEmbed.__init__  sf     	$#6 "*J
C#
 
 
			r]   r   c                     |j         \  }}|                    |d| j        | j        | j                  }|                     |                              || j                  }|S N)r   r   r   r   r   r   )r   r   LCs       r^   r   zQwen2VisionPatchEmbed.forward  sS    w1FF1b$2DOT_UUIIaLLa00r]   )rJ   r   rR   r   )	rU   rV   rW   r   r{   rZ   r[   r   r   r   s   @r^   r   r     s         #$
 

 !
 	

 
 

 
 
 
 
 
* %,        r]   r   c                        e Zd Z	 	 	 	 ddededeegej        f         dz  dededz  d	ed
df fdZ	de
j        d
e
j        fdZ xZS )Qwen2VisionPatchMergerNr   rr   d_modelcontext_dimr   spatial_merge_sizerv   rw   r   c                    t                                                       t                      }||dz  z  | _        |t	          t
          j        d          } ||          | _        t          j        t          | j        | j        d|| d|          t          j
                    t          | j        |d|| d|          g          | _        d S )Nr   r   r   Tz.mlp.0)r   rv   rw   ry   z.mlp.2)rz   r{   rH   hidden_sizer   r   r   ln_q
ModuleListr   GELUr    r   )	r   r   r   r   r   rv   rw   r   r   s	           r^   r{   zQwen2VisionPatchMerger.__init__  s     	466&*<a*?@ 4888JJ{++	=$$$!-$,,,0   		!$!-$,,,0  
 
r]   r   c                     |                      |          }|                    d| j                  }| j        \  }}} ||          \  }} ||          } ||          \  }}|S r   )r   r   r   r   )r   r   mlp_fc1mlp_actmlp_fc2r   r   outs           r^   r   zQwen2VisionPatchMerger.forward  sm    IIaLLFF2t'(($(H!'


AWZ((
$$Q
r]   )Nr   Nrr   )rU   rV   rW   r   r   r   r   r!   r   r{   rZ   r[   r   r   r   s   @r^   r   r     s        
 9="#26#
 #
#
 #
 cUBI-.5	#

  #
 )4/#
 #
 
#
 #
 #
 #
 #
 #
J %,        r]   r   c                       e Zd Z	 	 	 ddedededz  deddf
 fd	Zede	j
        fd
            Z
ede	j        fd            Zdeee                  dee	j        e	j        f         fdZde	j        dedz  fdZde	j        de	j        eee                  z  de	j        fdZdeeee	j        f                  dee         fdZ xZS )Qwen2VisionTransformerr   Nrr   vision_confignorm_epsrv   rw   r   c                    t                                                       |j        }|j        }|j        }|j        }|j        }	|j        |j        }
|j	        |j
        t                      | _        |j        | _        || _        | _	        | _        t          |||          | _        t!          t"          j        |          z  }t'          |ddddi          | _        t#          j        fdt-          |
          D                       | _        t1          |	 d	
          | _        t5          |t7          j                              | _        d S )N)r   r   r   r   r   i    Tpartial_rotary_factorg      ?)r   max_positionis_neox_stylerope_parametersc                 D    g | ]}t           d |           S )z.blocks.)r   r   r   r   rv   rw   )r   )r   	layer_idxr   r   r   r   rw   rv   s     r^   
<listcomp>z3Qwen2VisionTransformer.__init__.<locals>.<listcomp>8  sZ     
 
 
  !!'')!-$99i99  
 
 
r]   z.merger)r   r   r   rv   rw   )r   dtype)rz   r{   r   r   r   r   r   r   depthr   r   rH   r   out_hidden_sizer   patch_embedr   r   r   r"   rotary_pos_embr   rangeblocksr   mergerrG   rZ   get_default_dtypeattn_backend)r   r   r   rv   rw   r   r   r   r   r   r  head_dimr   r   r   r   r   s      ``       @@@@r^   r{   zQwen2VisionTransformer.__init__  s    	"-
+?*=#/#/!+	#!+	!+	!9!;!;,8"4""0! 3#	
 
 
 R\x888
	)&4c:	
 
 
 m
 
 
 
 
 
 
 
 
 "'u
 
 

 
 -!!%%%%
 
 
 1)++
 
 
r]   c                 .    | j         j        j        j        S r   )r  r   weightr  r   s    r^   r  zQwen2VisionTransformer.dtypeP  s    $+11r]   c                 .    | j         j        j        j        S r   )r  r   r  devicer  s    r^   r  zQwen2VisionTransformer.deviceT  s    $+22r]   grid_thwc                    g }d}|D ]\  }}}t          j        |                              d                              d|          }t          j        |                              d                              |d          }|                    || j        z  | j        || j        z  | j                                      dddd                                          }|                    || j        z  | j        || j        z  | j                                      dddd                                          }|                    t          j	        ||gd          
                    |d                     t          |||          }t          j        |d          }| j                            |          \  }	}
|	|                             d          }|
|                             d          }||fS )Nr   r=   r   r   rR   r   )rZ   arange	unsqueezeexpandreshaper   permuteflattenappendstackrepeatmaxr   r  get_cos_sin)r   r  pos_idsmax_grid_sizethwhpos_idswpos_idscossincos_combinedsin_combineds                r^   rot_pos_embz"Qwen2VisionTransformer.rot_pos_embX  s     	5 	5GAq!|A0033::2qAAH|A0033::1bAAH  00+00+	  Aq!$$    00+00+	  Aq!$$  NN5;(';DDDKKAqQQRRRq!44MM)G+++ &22=AAS7|++A..7|++A..\))r]   r   c                     d }| j         t          j        t          j        hv r'|dd          |d d         z
                                  }|S )Nr=   r   )r  r<   
FLASH_ATTNROCM_AITER_FAr  )r   r   r   s      r^   compute_attn_mask_seqlenz/Qwen2VisionTransformer.compute_attn_mask_seqlen  sV    
 + .!
 
 
 %QRR.:crc?:??AAJr]   r   c                    |                     | j        | j                  }|                     |          }t	          |t
                    r#|}t          j        |t          j                  }n(|	                                }|
                                }|                     |          \  }}t          j        |d d df         |d d df         z  |d d df                                       dt          j                  }t          j        t          j        dt          j                  |g          }t!          j        |          }|                    d          }|                     |          }|                     | j        d          }| j        D ]} ||||||	          }|                     |          }|S )
N)r  r  )r  r=   r   r   )axisr  T)non_blockingr   )tor  r  r  
isinstancelistrO   arrayint32tolistnumpyr,  r  cumsumconcatenatezerosrZ   
from_numpyr  r0  r
  r  )	r   r   r  grid_thw_listr   r   r   r   blks	            r^   r   zQwen2VisionTransformer.forward  s    DD4:D66Qh%% 	($Mx999HH$OO--M~~''H 261A1A-1P1P.. Yx1~A>AOOVV"( W 
 

 ^RXarx%@%@%@*$MNN
%j11
 KKNN 22:>>
]]4;T]BB
; 	 	C%#5#5%  AA KKNNr]   weightsc                    g d}t          |                     d                    }t                      }|D ]\  }}|D ]>\  }}}	||vr|                    ||          }||         }
|
j        } ||
||	            n*||         }
t          |
dt                    } ||
|           |                    |           |S )N))qkv_projq_projr   )rC  k_projr   )rC  v_projr   F)remove_duplicateweight_loader)dictnamed_parameterssetreplacerH  getattrr$   add)r   rA  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamrH  s               r^   load_weightsz#Qwen2VisionTransformer.load_weights  s    "
 "
 "
 400%0HHII"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####r]   )r   Nrr   )rU   rV   rW   r   r   r!   r   r{   propertyrZ   r  r  r6  r   r   r[   r,  r0  r   r   rK  rX  r   r   s   @r^   r   r     s        26A
 A
*A
 A
 )4/	A

 A
 
A
 A
 A
 A
 A
 A
F 2u{ 2 2 2 X2 3 3 3 3 X3%*T#Y%*	u|U\)	*%* %* %* %*N5< C$J    ,<, ,d3i0, 
	, , , ,\HU33D-E$F 3s8        r]   r   r   r   c                 P     dt           t          t          j        f         f fd}|S )N	hf_inputsc                 *   |                      dt          j        d                    }|                    d          }|z  z  }|                      dt          j        d                    }|                    d          }|z  z  }t	          t          j        d|          t          j        d|          t          j        dd          t          j        d|          t          j        d|          t          j        dd          	          S )
NrS   )r   rR   r   rk   imageT)keep_on_cpuvideo)rM   ra   rS   rh   rn   rk   )getrZ   emptyprodrI  r+   flat_from_sizesbatched)r[  rS   image_pixel_grid_sizesimage_embed_grid_sizesrk   video_grid_sizesvideo_embed_grid_sizesr   s          r^   _qwen2vl_field_configz<_create_qwen2vl_field_factory.<locals>._qwen2vl_field_config  s0   "'7V9L9LMM!/!4!4R!8!8"&88<NN 	 #'7V9L9LMM)..r22 226HH 	 .>/  />/  18dSSS 5 E)! ! />/  18dSSS
 
 
 	
r]   )r   r   rZ   r[   )r   ri  s   ` r^   _create_qwen2vl_field_factoryrj    s<    
el1B)C 
 
 
 
 
 
< ! r]   c                        e Zd Zdef fdZdeeej        f         e	e
         z  deeef         dz  f fdZdeeej        f         e	e         z  deeef         dz  f fdZ xZS )Qwen2VLMultiModalDataParserr   c                 H    || _          t                      j        |i | d S r   )_spatial_merge_sizerz   r{   )r   r   argskwargsr   s       r^   r{   z$Qwen2VLMultiModalDataParser.__init__  s-    #5 $)&)))))r]   datar   Nc                     t          |t                    r't          |dddht          | j                            S t                                          |          S )Nr]  ra   rS   modalityrequired_fieldsfields_factory)r5  rI  r.   rj  rn  rz   _parse_image_datar   rq  r   s     r^   rw  z-Qwen2VLMultiModalDataParser._parse_image_data  e     dD!! 	% !/1A B<T=UVV	    ww((...r]   c                     t          |t                    r't          |dddht          | j                            S t                                          |          S )Nr_  rn   rk   rs  )r5  rI  r.   rj  rn  rz   _parse_video_datarx  s     r^   r{  z-Qwen2VLMultiModalDataParser._parse_video_data  ry  r]   )rU   rV   rW   r   r{   rI  r   rZ   r[   r(   r'   r0   r
   rw  r-   r{  r   r   s   @r^   rl  rl    s        *3 * * * * * */3$%Y(??/ 
38	$t	+/ / / / / //3$%Y(??/ 
38	$t	+/ / / / / / / / / /r]   rl  c                      e Zd Zd ZdedefdZdedefdZde	e
edz  f         fdZded	e	e
ef         de	e
ef         fd
Zddddedededededz  deeef         fdZdedededz  defdZdededededz  def
dZ	 ddedz  defdZdefdZd dededefdZefded	e	e
ef         dedefdZded	e	e
ef         defdZdS )!Qwen2VLProcessingInfoc                 @    | j                             t                    S r   )ctxget_hf_configr   r  s    r^   r  z#Qwen2VLProcessingInfo.get_hf_config  s    x%%m444r]   rp  r   c                 ^     | j         j        t          fd|                    dd          i|S )Nuse_fastT)r  get_hf_processorr   popr   rp  s     r^   r  z&Qwen2VLProcessingInfo.get_hf_processor  sC    (tx(
 
ZZ
D11
 
 
 	
r]   c                 &     | j         di |j        S Nr\   )r  image_processorr  s     r^   get_image_processorz)Qwen2VLProcessingInfo.get_image_processor&  s    $t$..v..>>r]   Nc                     d d dS Nr]  r_  r\   r  s    r^   get_supported_mm_limitsz-Qwen2VLProcessingInfo.get_supported_mm_limits)  s    ---r]   r   	mm_countsc                 `    |                                  }|                     ||          }||dS r  )get_max_image_tokensget_max_video_tokens)r   r   r  max_image_tokensmax_video_tokenss        r^   get_mm_max_tokens_per_itemz0Qwen2VLProcessingInfo.get_mm_max_tokens_per_item,  s;    
  446644WiHH)4DEEEr]   r=   T)
num_frames	do_resizeimage_widthimage_heightr  r  r  c                   ||                                  }|                                 }|j        }|j        }|j        }	|j        }
|r6t          ||||	z  |j        |j                  \  }}t          ||          }nt          ||          }|||
z  z   }t          ||
z  d          }|j        |z  }|j        |z  }||z  |z  }||	dz  z  }||fS )N)heightwidthfactor
min_pixels
max_pixelsr  r  r=   r   )r  r  r   r   r   r   r   r  r  r/   r  r  r  )r   r  r  r  r  r  	hf_configr   r   
merge_sizer   resized_heightresized_widthpreprocessed_sizepadded_num_framesgrid_tgrid_hgrid_wnum_patchesnum_vision_tokenss                       r^   _get_vision_infoz&Qwen2VLProcessingInfo._get_vision_info5  s     ""6688O&&((	!/"-
"5
+? 
	R,8#!!J.*5*5- - -)NM !*n U U U )L Q Q Q '6I)II&*==qAA")Z7"(J6vo.'JM: "333r]   c                >    |                      ||d|          \  }}|S )Nr=   r  r  r  r  r  )r   r  r  r  r   num_image_tokenss         r^   get_num_image_tokensz*Qwen2VLProcessingInfo.get_num_image_tokens`  s8     #33#%+	 4 
 
  r]   c                >    |                      ||||          \  }}|S Nr  r  )r   r  r  r  r  r   num_video_tokenss          r^   get_num_video_tokensz*Qwen2VLProcessingInfo.get_num_video_tokenso  s8     #33#%!+	 4 
 
  r]   r  c                    |                                  }|j        }|j        }|j        }|(|                                 }|j        p|j        d         }||z  }|||z  z  }dt          dt          t          t          f         fd}	d|}}
t          |dd          D ]} |	|          \  }
}||
z  dk    r nt          ||z  ||
z  	          S )
Nlongest_edgenr   c                 z    t          t          j        |           dd          D ]}| |z  dk    r	|| |z  fc S d| fS )Nr   r   r=   )r	  mathisqrt)r  ds     r^   closest_factor_pairzTQwen2VLProcessingInfo.get_image_size_with_most_features.<locals>.closest_factor_pair  sT    4:a==!R00 % %q5A::a1f9$$$ a4Kr]   r=   r   r      r  )r  r   r   r   r  r  sizer   r   r	  r/   )r   r  r  r   r   r  r  unitmax_seq_lenr  height_factorwidth_factorr   s                r^   !get_image_size_with_most_featuresz7Qwen2VLProcessingInfo.get_image_size_with_most_features  s     &&((	!/"-
"5
"6688O*Ro.B>.R  J& TD[1	3 	5c? 	 	 	 	 '(|[!R00 	 	G*=*=g*F*F'M<m+s22 3 tl24-;OPPPPr]   c                 `    |                                  \  }}|                     ||d           S )N)r  r  r  )r  r  )r   target_widthtarget_heights      r^   r  z*Qwen2VLProcessingInfo.get_max_image_tokens  s>    &*&L&L&N&N#m(($&  ) 
 
 	
r]   
max_tokensstart_num_framesc                     |                                  \  }}|}	 |dz   }|                     |||d           }||k    rn|}(|S )NTr=   r  )r  r  )r   r  r  r  r  r  next_num_framesnext_max_tokenss           r^   _get_max_video_framesz+Qwen2VLProcessingInfo._get_max_video_frames  ss    &*&L&L&N&N#m%
	)(1nO"77(** $	 8  O ++(J	) r]   max_frames_per_videoc                     |                     dd          }|                     |          }t          |t          |d          z  |          }t          |d          S )Nr_  r   r=   )r`  r  minr  )r   r   r  r  
max_videosmax_total_framess         r^   !get_num_frames_with_most_featuresz7Qwen2VLProcessingInfo.get_num_frames_with_most_features  sc     ]]7A..
55g>>"J 2 224H 
  
 '+++r]   c                     |                                  \  }}|                     |||                     ||          d           S r  )r  r  r  )r   r   r  r  r  s        r^   r  z*Qwen2VLProcessingInfo.get_max_video_tokens  sS    
 '+&L&L&N&N#m(($&==gyQQ 	 ) 
 
 	
r]   r   )r=   )rU   rV   rW   r  objectr   r  r   r  r   r   r   r  r  boolr   r/   r  r  r  r  r  r  _MAX_FRAMES_PER_VIDEOr  r  r\   r]   r^   r}  r}    s       5 5 5
 
4D 
 
 
 
?F ?7L ? ? ? ?.cDj)A . . . .FF 38$F 
c		F F F F )4 )4 )4 )4 	)4
 )4 )4 /5)4 
y#~	)4 )4 )4 )4V    	 
 /5  
           	 
   /5  
       " (,)Q )Q*)Q	)Q )Q )Q )QV
c 
 
 
 
  s SV    2 %:	, ,, 38$, "	,
 
, , , ,

 38$
 
	
 
 
 
 
 
r]   r}  c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	Qwen2VLDummyInputsBuilderr  r   c                     |                     dd          }|                     dd          }| j                                        }|j        }|j        }||z  ||z  z   S )Nr]  r   r_  )r`  infor  image_tokenvideo_token)r   r  
num_images
num_videoshf_processorr  r  s          r^   get_dummy_textz(Qwen2VLDummyInputsBuilder.get_dummy_text  s`    ]]7A..
]]7A..
y1133'3'3Z'+
*BBBr]   Nr   
mm_optionsc                    |                     dd          }|                     dd          }| j                                        \  }}| j                            ||          }|r|                     d          nd }	|r|                     d          nd }
|                     ||||	          |                     |||||
          dS )Nr]  r   r_  )r  r  r  	overrides)r  r  r  r  r  r  )r`  r  r  r  _get_dummy_images_get_dummy_videos)r   r   r  r  r  r  r  r  target_num_framesimage_overridesvideo_overridess              r^   get_dummy_mm_dataz+Qwen2VLDummyInputsBuilder.get_dummy_mm_data  s     ]]7A..
]]7A..
&*i&Q&Q&S&S#m IGGY
 
 6@I*..111T5?I*..111T ++"$%)	 ,   ++"$,%) ,  
 
 	
r]   r   )
rU   rV   rW   r   r   r   r  r   r)   r  r\   r]   r^   r  r    s        CS(9 Cc C C C C =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r]   r  c            	           e Zd ZdefdZdedeeef         de	de
e         fdZdedeeef         deeef         fdZd	S )
Qwen2VLMultiModalProcessorr   c                 b    t          | j                                        j        j                  S r   )rl  r  r  r   r   r  s    r^   _get_data_parserz+Qwen2VLMultiModalProcessor._get_data_parser  s+    *I##%%3F
 
 	
r]   mm_itemshf_processor_mm_kwargsout_mm_kwargsc                 @  	
  | j         j        di |} | j         j        di |}| j                                         }|                                }||j                 ||j                 d
|j        dz  	dt          dt          f	
fd
fddD             S )Nr  r   item_idxrt  c                     |         |          }|| d         j         }t          |t          j                  sJ t	          |                                          z  }|         g|z  S )N	_grid_thw)rq  r5  rZ   r[   r   rb  )r  rt  out_itemr  
num_tokensmerge_lengthr  placeholders        r^   get_replacement_qwen2vlzOQwen2VLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_qwen2vl+  sm    $X.x8H8 6 6 67<Hh55555X]]__--=J)*Z77r]   c           
      `    g | ]*}t          ||         gt          |                     +S ))rt  )rt  targetreplacement)r6   r   )r   rt  r  r  s     r^   r  zBQwen2VLMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>3  sY     
 
 
  !#H-.#$;hOOO  
 
 
r]   r\   )
r  r  r  get_tokenizer	get_vocabr  r  r  r   r   )r   r  r  r  r  r  	tokenizervocabr  r  r  s      `    @@@r^   _get_prompt_updatesz.Qwen2VLMultiModalProcessor._get_prompt_updates  s     2ty1KK4JKK7$)7QQ:PQQI++--	##%% <34<34
 

 '114	8c 	8S 	8 	8 	8 	8 	8 	8 	8 	8
 
 
 
 
 /
 
 
 	
r]   r[  c                 t     t          | j                                        j        j                  |          S r   )rj  r  r  r   r   )r   r[  r  s      r^   _get_mm_fields_configz0Qwen2VLMultiModalProcessor._get_mm_fields_config<  s=    

,I##%%3F
 

  	r]   N)rU   rV   rW   r2   r  r1   r   r   r
   r,   r   r7   r  r   r  r+   r  r\   r]   r^   r  r    s        
"6 
 
 
 

!
%!
 !(S 1!
 -	!

 
,	!
 !
 !
 !
F !(V 4 
++	,	     r]   r  )r  dummy_inputsc                       e Zd Z eddddd          ZdZdee         dee	e
e
e
e
ef                  fd	Zd
ee
         dee         de	ej        e
f         fdZedede
dedz  fd            Zdddedef fdZdededz  fdZdededz  fdZdede	ej        df         fdZdede	ej        df         fdZdedefdZdedefdZ	 	 d-dej        d ej        d!e dz  d"ej        dz  dedej        e z  fd#Z!d$ej        dej        dz  fd%Z"d&e#e	eej        f                  de$e         fd'Z%de&fd(Z'd)e
de
fd*Z(d+e
de
fd,Z) xZ*S ).Qwen2VLForConditionalGenerationzlanguage_model.model.visual.zlanguage_model.lm_head.)zmodel.language_model.zmodel.visual.zlm_head.zmodel.orig_to_new_prefixTmm_featuresr   c              #     K   | j         j        j        }t          | j         j        dd          }t	          |d           D ]}|j        j        }|j        dk    rK|j        d         j        	                                \  }}}|dk    sJ d|             |d||z  ||z  dfV  e|j        d	k    r~|j        d
         j        	                                \  }}}d}	|j        
                    dd          r$|j        d         j                                        }	|	|z  }
||||z  ||z  |
fV  t          d|j                   dS )a  
        Iterate over multimodal features and yield grid information.

        Args:
            mm_features: List of multimodal feature specifications

        Yields:
            Tuple of (offset, grid_t, grid_h, grid_w, t_factor) for each frame/image
        tokens_per_second      ?c                     | j         j        S r   )mm_positionoffset)fs    r^   <lambda>zBQwen2VLForConditionalGeneration.iter_mm_grid_thw.<locals>.<lambda>j  s    AM<P r]   )r   r]  rS   r=   zImage must have 1 frame, got r_  rk   second_per_grid_tsNzUnsupported modality: )configr   r   rM  sortedr  r  rt  rq  r9  r`  item
ValueError)r   r
  r   r  
mm_featurer  r#  r$  r%  r  t_factors              r^   iter_mm_grid_thwz0Qwen2VLForConditionalGeneration.iter_mm_grid_thw\  s      "[6I#DK$=?RTWXX 2P2PQQQ 	Q 	QJ+2F"g--$/*:;@GGII1aAvvvBqBBvvva&8!8!?Q:QSVVVVVV$//$/*:;@GGII1a%("?&&';TBB ")3,*4466 ' .0AA++++     !!O*:M!O!OPPP-	Q 	Qr]   input_tokensc                 X   g }d}|                      |          D ]\  }}}}}	||z
  }
t          |          dk    r|d                                         dz   nd}|                    t	          j        t	          j        |
          d|
f          |z              t	          j        |||f          }|	dk    r+|d         |	z                      t          j	                  |d<   |                    |
                    dd          |
z   |z              |||z  |z  z   }|t          |          k     rt          |          dk    r|d                                         dz   nd}t          |          |z
  }
|                    t	          j        t	          j        |
          d|
f          |z              t	          j        |d          
                    dd          }|                                dz   t          |          z
                                  }t          j        |          |fS )Nr   r   r=   rR   r  )r2  )r  lenr  r  rO   broadcast_tor  indicesastypeint64r  r<  r  rZ   r>  )r   r  r
  llm_pos_ids_liststr  
llm_grid_t
llm_grid_h
llm_grid_wr  text_lenst_idxgrid_indicesllm_positionsmrope_position_deltas                  r^   get_mrope_input_positionsz9Qwen2VLForConditionalGeneration.get_mrope_input_positions  sA   
 "$ "";//	? 	? 
{H7:;K7L7Lq7P7P%b)--//!33VWF##	( 3 3a]CCfL   :z:z&JKKL3#/?X#="E"Ebh"O"OQ##L$8$8B$?$?($JV$STTT*z1J>>BBL!!!!7:;K7L7Lq7P7P%b)--//!33VWF<((2-H##	( 3 3a]CCfL   '7a@@@HHBOO - 1 1 3 3a 7#l:K:K KQQSS..0DDDr]   rt  iNc                 |    |                     d          rdS |                     d          rdS t          d          )Nr]  z+<|vision_start|><|image_pad|><|vision_end|>r_  z+<|vision_start|><|video_pad|><|vision_end|>z)Only image or video modality is supported)
startswithr  )clsrt  r-  s      r^   get_placeholder_strz3Qwen2VLForConditionalGeneration.get_placeholder_str  sK    w'' 	A@@w'' 	A@@DEEEr]   rr   )rw   vllm_configrw   c          
      ^   t                                                       |j        j        }|j        }|j        j        }|j        dk    | _        || _        || _        | 	                    |ddh          5  t          |j        t          |dd          |t          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t!          |t          |d          d	g
          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nrq  r]  r_  rms_norm_epsr   visual)r   rv   rw   language_modelQwen2ForCausalLM)r2  rw   architectures)rz   r{   model_configr  rv   multimodal_configmm_encoder_tp_moder   r  _mark_tower_modelr   r   rM  rF   r5  _mark_language_modelrE   r6  make_empty_intermediate_tensors)r   r2  rw   r  rv   r:  r   s         r^   r{   z(Qwen2VLForConditionalGeneration.__init__  s    + 8 B"/'4F!2!E!O!2##K'71CDD 	 	0$ >>)#FH55	  DK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	"<'#F,<==12# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s$   7;B>>CC'DDDrp  c                     |                     dd           }|                     dd           }|                     dd           }||d S |t          d||          S |t          d||          S d S )NrM   ra   rS   )rN   rM   rS   )rN   ra   rS   )r  rL   r`   )r   rp  rM   ra   rS   s        r^   _parse_and_validate_image_inputz?Qwen2VLForConditionalGeneration._parse_and_validate_image_input  s     zz.$77zz.$77$4d;;L$84#*#)-    #.#)-    $#r]   c                     |                     dd           }|                     dd           }|                     dd           }||d S |t          d||          S |t          d||          S d S )Nrh   rn   rk   )rN   rh   rk   )rN   rn   rk   )r  rg   rm   )r   rp  rh   rn   rk   s        r^   _parse_and_validate_video_inputz?Qwen2VLForConditionalGeneration._parse_and_validate_video_input  s     %jj)>EEzz.$77$4d;;&<+?4***$7-    #.#)-    $#r]   image_input.c                    |d         }|j         dk    sJ |d         dk    r	|d         }nP|d         }| j        r*t          | j        ||                                d          S |                     ||          }| j        j        }|                    d	          |z  |z                                  }|                    |          S )
NrS   r   rN   ra   rM   rope_3d	rope_typer  r   ndimr   rI   r5  r9  r   rb  split)r   rC  r  ra   rM   r  sizess          r^   _process_image_inputz4Qwen2VLForConditionalGeneration._process_image_input  s     /0}!!!!v.00&~6LL&~6L% L8Kx/@/@I     ${{<({KK [3
r""j0J>FFHH!!%(((r]   video_inputc                    |d         }|j         dk    sJ |d         dk    r	|d         }nP|d         }| j        r*t          | j        ||                                d          S |                     ||          }| j        j        }|                    d	          |z  |z                                  }|                    |          S )
Nrk   r   rN   rn   rh   rE  rF  rH  r   rI  )r   rN  r  rn   rh   r  rL  s          r^   _process_video_inputz4Qwen2VLForConditionalGeneration._process_video_input  s     /0}!!!!v.00&~6LL"-.C"D% S8K'OO%%'	     ${{+>{RR [3
r""j0J>FFHH!!%(((r]   c                 t    i }|D ]2}|dv rd|vr | j         di ||d<   |dv rd|vr | j        di ||d<   3|S )N)rM   ra   images)rh   rn   videosr\   )r@  rB  )r   rp  
modalities	input_keys       r^   %_parse_and_validate_multimodal_inputszEQwen2VLForConditionalGeneration._parse_and_validate_multimodal_inputs/  s    
   
	V 
	VI===J..'Kt'K'U'Uf'U'U
8$DDDJ..'Kt'K'U'Uf'U'U
8$r]   c                 
    | j         di |}|sg S d}|D ]l}|dk    r/|d         }|                     |          }|t          |          z  }|dk    r/|d         }|                     |          }|t          |          z  }m|S )Nr\   rR  rS  )rV  rM  r   rP  )	r   rp  rT  multimodal_embeddingsrt  rC  image_embeddingsrN  video_embeddingss	            r^   embed_multimodalz0Qwen2VLForConditionalGeneration.embed_multimodalB  s    ?T?II&II
 	I ;= # 	A 	AH8##(2#'#<#<[#I#I %/?)@)@@%8##(2#'#<#<[#I#I %/?)@)@@%$$r]   	input_ids	positionsintermediate_tensorsinputs_embedsc                 J    |d}| j                             ||||          }|S )aV  Run forward pass for Qwen2-VL.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,)`.
            intermediate_tensors: Intermediate tensors from prior forward pass.
            inputs_embeds: Optional tensor of input embeddings.
        N)r\  r]  r^  r_  )r6  model)r   r\  r]  r^  r_  rp  hidden_statess          r^   r   z'Qwen2VLForConditionalGeneration.forwardY  s@    ,  + M+11!5'	 2 
 
 r]   rb  c                 6    | j                             |          S r   )r6  compute_logits)r   rb  s     r^   rd  z.Qwen2VLForConditionalGeneration.compute_logitsz  s     "11-@@@r]   rA  c                 X    t          |           }|                    || j                  S )Nmapper)rC   rX  hf_to_vllm_mapper)r   rA  loaders      r^   rX  z,Qwen2VLForConditionalGeneration.load_weights  s+    "4((""743I"JJJr]   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        r6  zvisual.merger.r  )r6  	connectortower_model)r%   from_string_fieldr  s    r^   get_mm_mappingz.Qwen2VLForConditionalGeneration.get_mm_mapping  s'     /+&!
 
 
 	
r]   r  c                 <    | j         }|j        }|j        }||dz  z  S Nr   r  r   r   )r   r  r  r   r  s        r^   get_num_mm_encoder_tokensz9Qwen2VLForConditionalGeneration.get_num_mm_encoder_tokens  s*     K	!/"5
*a-//r]   r  c                 <    | j         }|j        }|j        }||dz  z  S rp  rq  )r   r  r  r   r  s        r^   get_num_mm_connector_tokensz;Qwen2VLForConditionalGeneration.get_num_mm_connector_tokens  s*     K	!/"5
 JM11r]   )NN)+rU   rV   rW   rD   rh  supports_encoder_tp_datar6  r*   r   r   r   r   r  rZ   r[   r,  classmethodr   r1  r   r{   r  re   r@  ro   rB  rM  rP  rI  rV  r>   r[  r8   r   rd  r   rK  rX  r%   rn  rr  rt  r   r   s   @r^   r  r  F  s        & &=&1-
 
	 	 	  $$Q 56$Q	%S#sE12	3$Q $Q $Q $QL%E3i%E /0%E 
u|S 	!	%E %E %E %EN F3 F3 F3: F F F [F BD 
 
 
z 
3 
 
 
 
 
 
:	d	"   0	d	"   0)-)	u|S 	!) ) ) ).)-)	u|S 	!) ) ) )2f     &% %4H % % % %6 <@-1 < < 2D8	
 |d*  
+	+   BA|A 
	A A A AKHU33D-E$F K3s8 K K K K
 
 
 
 
00 
0 0 0 022 
2 2 2 2 2 2 2 2r]   r  c                       e Zd ZdS )Tarsier2MultiModalProcessorN)rU   rV   rW   r\   r]   r^   rx  rx    s        Dr]   rx  c                   B     e Zd Z	 ddeeef         dz  ddf fdZ xZS )Tarsier2ImageProcessorNr  r   c                     |6d|v r2d|v r.|d         |d         d} t                      j        dd|i| d S  t                      j        dd|i| d S )Nr  r  )shortest_edger  r  r\   )rz   r{   )r   r  rp  remapped_sizer   s       r^   r{   zTarsier2ImageProcessor.__init__  s    
  4 49M9M "&l!3 $\ 2 M EGG::-:6:::::EGG11$1&11111r]   r   )rU   rV   rW   rI  r   r   r{   r   r   s   @r^   rz  rz    sf         '+2 238nt#2 
	2 2 2 2 2 2 2 2 2 2r]   rz  c                   (     e Zd Zdedef fdZ xZS )Tarsier2Processorr   r  c           
          t          di || _         t                      j        d| j        |t	          di |d d| d S )N)r  r  video_processorchat_templater\   )rz  r  rz   r{   r   )r   r   r  rp  r   s       r^   r{   zTarsier2Processor.__init__  sq      6FFFF 	
 01BBMBB		
 	

 	
 	
 	
 	
 	
r]   )rU   rV   rW   rI  r9   r{   r   r   s   @r^   r  r    sO        

 !
 
 
 
 
 
 
 
 
 
r]   r  c                   6    e Zd ZdefdZdedefdZdefdZ	dS )Tarsier2ProcessingInfor   c                 P    | j         j        j        }t          j        |          }|S r   )r  r9  ra  r   from_pretrained)r   
model_pathcorrect_configs      r^   r  z$Tarsier2ProcessingInfo.get_hf_config  s%    X*0
&6zBBr]   rp  c                 r    t          d| j                                        |                                 d|S )N)r   r  r\   )r  r  get_hf_image_processor_configr  r  s     r^   r  z'Tarsier2ProcessingInfo.get_hf_processor  sH      
(@@BB((**
 
 
 
 	
r]   c                 H    t          di | j                                        S r  )rz  r  r  r  s    r^   r  z*Tarsier2ProcessingInfo.get_image_processor  s$    %QQ(N(N(P(PQQQr]   N)
rU   rV   rW   r   r  r  r  r  rz  r  r\   r]   r^   r  r    su        }    
 
4E 
 
 
 
R%; R R R R R Rr]   r  c                   l    e Zd Z eddi          Zdeeeej	        f                  de
e         fdZdS ) Tarsier2ForConditionalGenerationzvision_tower.r  r  rA  r   c                     g }| j         |                    dg           t          | |          }|                    || j                  S )Nr  )skip_prefixesrf  )r5  extendrC   rX  rh  )r   rA  r  ri  s       r^   rX  z-Tarsier2ForConditionalGeneration.load_weights  sS    ;  )---"4}EEE""743I"JJJr]   N)rU   rV   rW   rD   rh  r   r   r   rZ   r[   rK  rX  r\   r]   r^   r  r    sv         &Y
  KHU33D-E$F K3s8 K K K K K Kr]   r  )rX   r  collections.abcr   r   r   r   r   	functoolsr   typingr	   r
   r   r   r:  rO   rZ   torch.nnr   einopsr   transformersr   transformers.models.qwen2_vlr   r   3transformers.models.qwen2_vl.configuration_qwen2_vlr   r   6transformers.models.qwen2_vl.image_processing_qwen2_vlr   6transformers.models.qwen2_vl.video_processing_qwen2_vlr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r    'vllm.model_executor.layers.quantizationr!   +vllm.model_executor.layers.rotary_embeddingr"   2vllm.model_executor.layers.rotary_embedding.commonr#   -vllm.model_executor.model_loader.weight_utilsr$   )vllm.model_executor.models.module_mappingr%   vllm.multimodalr&   vllm.multimodal.inputsr'   r(   r)   r*   r+   r,   r-   vllm.multimodal.parser.   r/   r0   r1   r2   vllm.multimodal.processingr3   r4   r5   r6   r7   vllm.sequencer8   vllm.tokenizersr9   vllm.utils.tensor_schemar:   r;   #vllm.v1.attention.backends.registryr<   
interfacesr>   r?   r@   rA   rB   rC   rD   rE   rF   visionrG   rH   rI   rU   loggerr  rL   r`   re   rY   rg   rm   ro   r   rq   r   r   r   r   r   r   r   r[   rj  rl  r}  r  r  register_processorr  rx  rz  r  r  r  r\   r]   r^   <module>r     s	  4 I H H  K K K K K K K K K K K K K K       5 5 5 5 5 5 5 5 5 5 5 5                  % % % % % % P P P P P P P P        P O O O O O X X X X X X " " " " " " 3 3 3 3 3 3 M M M M M M M M 0 0 0 0 0 0 # # # # # # ; ; ; ; ; ; X X X X X X 7 7 7 7 7 7        G F F F F F @ @ @ @ @ @      P O O O O O D D D D D D / / / / / /                                            . - - - - - ) ) ) ) ) ) > > > > > > > > D D D D D D                                  
X		  
    l   8    ,   : !8:U U I U U U    l   :    ,   : !8:U U I U U U
    RY   Dp p p p p29 p p pf2 2 2 2 2ry 2 2 2j    BI   :. . . . .RY . . .bC C C C CRY C C CL$!$!S%, C&&')$! $! $! $!N/ / / / /"6 / / /DH
 H
 H
 H
 H
. H
 H
 H
V*
 *
 *
 *
 *
 67L M *
 *
 *
Z0 0 0 0 0!89N!O 0 0 0f ('	*  
T2 T2 T2 T2 T2I!<]T2 T2 
T2n
	 	 	 	 	"< 	 	 	2 2 2 2 22 2 2 2"
 
 
 
 
( 
 
 
"R R R R R2 R R R$ ('	*  
K K K K K'F K K 
K K Kr]   