
    .`i;                       d Z ddlmZmZmZmZmZ ddlmZm	Z	 ddl
mZ ddlmZ ddlZddlZddlmZ ddlmc mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddl m!Z!m"Z" dd	l#mZ$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZImJZJmKZK ddlLmMZMmNZNmOZOmPZPmQZQmRZRmSZS ddlTmUZUmVZVmWZW ddlXmYZYmZZZm[Z[m\Z\m]Z] dd l^m_Z_ dd!l`maZa dd"lbmcZc dd#ldmeZe d$d%lfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZn d$d&lompZpmqZqmrZrmsZsmtZtmuZumvZv d$d'lwmxZxmyZymzZz d$d(l{m|Z|m}Z} d$d)l~mZmZmZmZmZ d$d*lmZmZmZ  e1e          Zd+Z G d, d-ej                  Z G d. d/ej                  Z G d0 d1ej                  Z G d2 d3ej                  Z G d4 d5ej                  Z G d6 d7ey          Z G d8 d9eYe                   Z G d: d;eZe                   Z e(dd<dddd=>           G d? d@e}                      Z G dA dBe|          Z eFj        eeeC           G dD dEej        ekeiemejehel	  	                    ZdS )FzAInference-only Qwen3VL model compatible with HuggingFace weights.    )CallableIterableIteratorMappingSequence)	lru_cachepartial)islice)AnyN)BatchFeature)Qwen2VLImageProcessorFast)smart_resize)Qwen3VLProcessorQwen3VLVideoProcessor)Qwen3VLConfigQwen3VLVisionConfig)VideoMetadata)support_torch_compile)
VllmConfig)BaseDummyOptionsVideoDummyOptions)get_pp_group)init_logger)_ACTIVATION_REGISTRY)Conv3dLayer)ColumnParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHead)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)compute_mrope_for_mediacompute_retained_tokens_countcompute_retention_maskrecompute_mrope_positions)MultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItemMultiModalKwargsItemsPlaceholderRange	VideoItem)	ImageSizeMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)
is_list_of)round_up)AttentionBackendEnum   )MultiModalEmbeddingsSupportsEagle3SupportsLoRASupportsMRoPESupportsMultiModalSupportsMultiModalPruning
SupportsPP_require_is_multimodal)Qwen2_5_VisionAttentionQwen2_5_VLImageEmbeddingInputsQwen2_5_VLImageInputsQwen2_5_VLImagePixelInputsQwen2_5_VLVideoEmbeddingInputsQwen2_5_VLVideoInputsQwen2_5_VLVideoPixelInputs)Qwen2VLMultiModalDataParserQwen2VLProcessingInfo_create_qwen2vl_field_factory)Qwen3ForCausalLM
Qwen3Model)AutoWeightsLoaderPPMissingLayerWeightsMapper_merge_multimodal_embeddingsmaybe_prefix)get_vit_attn_backendis_vit_use_data_parallel!run_dp_sharded_mrope_vision_modeli   c                   b     e Zd Z	 	 	 	 ddedededed	d
f
 fdZdej        d	ej        fdZ xZS )Qwen3_VisionPatchEmbed           
patch_sizetemporal_patch_sizein_channelshidden_sizereturnNc                     t                                                       || _        || _        || _        |||f}t          ||||d          | _        d S )NT)kernel_sizestridebias)super__init__r_   r`   rb   r   proj)selfr_   r`   ra   rb   re   	__class__s         w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/qwen3_vl.pyri   zQwen3_VisionPatchEmbed.__init__   sg     	$#6 &*J
C#
 
 
			    xc                     |j         \  }}|                    |d| j        | j        | j                  }|                     |                              || j                  }|S N)shapeviewr`   r_   rj   rb   )rk   ro   LCs       rm   forwardzQwen3_VisionPatchEmbed.forward   sT    w1FF1b$2DOT_UUIIaLLa!122rn   )r[   r\   r]   r^   )	__name__
__module____qualname__intri   torchTensorrw   __classcell__rl   s   @rm   rZ   rZ      s         #$
 

 !
 	

 
 

 
 
 
 
 
* %,        rn   rZ   c                        e Zd Zdej        ddfdedededeej	        gej	        f         de
dz  d	ef fd
Zdej	        fdZ xZS )Qwen3_VisionMLPFN in_featureshidden_featuresrg   act_fnquant_configprefixc           	          t                                                       t                      }t          ||||d| d|          | _        t          ||||d| d|          | _        || _        d S )NF.linear_fc1)rg   r   return_biasr   
disable_tp.linear_fc2)rh   ri   rW   r   
linear_fc1r   
linear_fc2r   )	rk   r   r   rg   r   r   r   use_data_parallelrl   s	           rm   ri   zQwen3_VisionMLP.__init__   s     	466.%)))(
 
 
 ,%)))(
 
 
 rn   ro   c                 |    |                      |                     |                     |                              }|S N)r   r   r   )rk   ro   
mlp_outputs      rm   rw   zQwen3_VisionMLP.forward   s1    __T[[1C1C%D%DEE
rn   )rx   ry   rz   Fsilur{   boolr   r|   r}   r   strri   rw   r~   r   s   @rm   r   r      s        
 9:26   	
 %,56 )4/      >        rn   r   c                        e Zd Zej        dddfdedededeej        gej        f         deege	j
        f         dz  dedz  d	ed
df fdZdej        dej        dej        dej        dej        d
ej        fdZ xZS )Qwen3_VisionBlockNr   dim	num_headsmlp_hidden_dimr   
norm_layerr   r   rc   c           	      4   t                                                       |t          t          j        d          } ||          | _         ||          | _        t          ||||| d          | _        t          |||d|| d          | _
        d S )Nư>epsz.attn)	embed_dimr   projection_sizer   r   Tz.mlp)r   rg   r   r   )rh   ri   r	   nn	LayerNormnorm1norm2rE   attnr   mlp)	rk   r   r   r   r   r   r   r   rl   s	           rm   ri   zQwen3_VisionBlock.__init__   s     	 4888JZ__
Z__
+%###
 
 
	 #%???
 
 
rn   ro   
cu_seqlensrotary_pos_emb_cosrotary_pos_emb_sin
max_seqlenc                     ||                      |                     |          ||||          z   }||                     |                     |                    z   }|S )Nr   r   r   r   )r   r   r   r   )rk   ro   r   r   r   r   s         rm   rw   zQwen3_VisionBlock.forward   sc     		JJqMM!11!  
 
 
 A'''rn   )rx   ry   rz   r   r   r{   r   r|   r}   r   Moduler   r   ri   rw   r~   r   s   @rm   r   r      s"        :;8<26
 

 
 	

 %,56
 cUBI-.5
 )4/
 
 

 
 
 
 
 
>< L "L	
 "L L 
       rn   r   c                        e Zd Z	 	 	 	 	 ddededeegej        f         dz  ded	ed
edz  de	ddf fdZ
dej        dej        fdZ xZS )Qwen3_VisionPatchMergerNr\   Fr   d_modelcontext_dimr   spatial_merge_sizeuse_postshuffle_normr   r   rc   c                    t                                                       t                      }||dz  z  | _        || _        | j        r| j        }|t          t          j        d          } ||          | _        t          | j        | j        d|| d|          | _
        t          j                    | _        t          | j        |d|| d|          | _        d S )Nr\   r   r   Tr   )rg   r   r   r   r   )rh   ri   rW   rb   r   r	   r   r   normr   r   GELUr   r   r   )
rk   r   r   r   r   r   r   r   r   rl   s
            rm   ri   z Qwen3_VisionPatchMerger.__init__  s     	466&*<a*?@$8!$ 	+*K 4888JJ{++	.%)))(
 
 
 gii+%)))(
 
 
rn   ro   c                 X   | j         r/|                     |                    d| j                            }n.|                     |                              d| j                  }|                     |          \  }}|                     |          }|                     |          \  }}|S rq   )r   r   rt   rb   r   r   r   )rk   ro   
x_parallel_outs        rm   rw   zQwen3_VisionPatchMerger.forward,  s    $ 	8		!&&T%56677AA		!!!"d&677A**
A[[,,
,,Q
rn   )Nr\   FNr   )rx   ry   rz   r{   r   r   r   r   r   r   ri   r|   r}   rw   r~   r   s   @rm   r   r     s        
 9="#%*26%
 %
%
 %
 cUBI-.5	%

  %
 #%
 )4/%
 %
 
%
 %
 %
 %
 %
 %
N	 	%, 	 	 	 	 	 	 	 	rn   r   c                       e Zd Z	 	 	 ddedededz  deddf
 fd	Zede	j
        fd
            Z
ede	j        fd            Ze ed          dededede	j        fd                        Zdeee                  fdZdeee                  de	j        fdZde	j        de	j        fdZde	j        de	j        eee                  z  de	j        fdZdeeee	j        f                  dee         fdZ xZS )Qwen3_VisionTransformerr   Nr   vision_confignorm_epsr   r   rc   c           	          t                                                       j         _        j         _        j         _        j         _        j         _         j        dz   _        j         _        j	         _	        t           j        dz             _        j        dt           j	                  z   z   _        t           j         j        j         j                   _        t#          j         j         j                   _        t)          t"          j        |           j         j        z  }t-          |ddddi	           _        t1          j         j         j         d
           _        t#          j         fdt7          t           j	                            D                        _        t;          |t=          j                               _          j         tB          j"        tB          j#        tB          j$        hvrtK          d j          d          t#          j         fdt7          j&                  D                        _'        d S )Nr\   g      ?r<   )r_   r`   ra   rb   r   i    Tpartial_rotary_factor)	head_sizemax_positionis_neox_stylerope_parametersz.merger)r   r   r   r   r   r   c                 d    g | ],}t          j        j        j        d  d|           -S )Tz.deepstack_merger_list.)r   r   r   r   r   r   r   )r   out_hidden_sizerb   r   .0	layer_idxr   r   r   rk   r   s     rm   
<listcomp>z4Qwen3_VisionTransformer.__init__.<locals>.<listcomp>m  sf         ()9 $ 0'+'>)-)!-$HHYHH    rn   )r   dtypezQwen3-VL does not support z backend now.c                     g | ]<}t          j        j        j        t          j                  d |           =S )z.blocks.)r   r   r   r   r   r   r   )r   rb   r   intermediate_sizer   
hidden_actr   s     rm   r   z4Qwen3_VisionTransformer.__init__.<locals>.<listcomp>  sm         "("n#0#B/0HI)!-$99i99    rn   )(rh   ri   rb   r   num_position_embeddingsr_   r   spatial_merge_unitr`   deepstack_visual_indexesr{   num_grid_per_sider   lenrZ   ra   patch_embedr   	Embedding	pos_embedr	   r   r    rotary_pos_embr   merger
ModuleListrangedeepstack_merger_listrV   r|   get_default_dtypeattn_backendr;   
FLASH_ATTN
TORCH_SDPAROCM_AITER_FARuntimeErrordepthblocks)rk   r   r   r   r   head_dimr   rl   s   `` `` @rm   ri   z Qwen3_VisionTransformer.__init__9  s    	(4&0'4'L$'2"/"B"&"91"<#0#D (5(N%!$T%A3%F!G!G  -<D1222 
 2 $ 8%1(	
 
 
 d&BDDTUUR\x888
#t~5&4c:	
 
 
 .!1(!#6%%%%
 
 
 &(]        "'s4+H'I'I!J!J  &
 &
" 1)++
 
 

  + + .%
 
 

 MT->MMM   m        "'}':!;!;  
 
rn   c                 .    | j         j        j        j        S r   )r   rj   weightr   rk   s    rm   r   zQwen3_VisionTransformer.dtype  s    $+11rn   c                 .    | j         j        j        j        S r   )r   rj   r   devicer   s    rm   r   zQwen3_VisionTransformer.device  s    $+22rn   i   )maxsizehwr   c                 n   t          j        t          j        |                               | d          | |f          }| |z  }||z  }|                    ||||          }|                    dddd          }|                                }t          j        t          j        |                              d|          | |f          }|                    ||||          }|                    dddd          }|                                }t          j        t          j        ||gd                    S )Nr<   r   r\   r]   rr   axis)	npbroadcast_toarangereshape	transposeflattenr|   
from_numpystack)r   r   r   hpos_idsh_divw_divwpos_idss          rm   rot_pos_idsz#Qwen3_VisionTransformer.rot_pos_ids  s3    ?29Q<<#7#71#=#=1vFF''''##	
 
 %%aAq11##%%?29Q<<#7#71#=#=1vFF##	
 
 %%aAq11##%%(H)=B G G GHHHrn   grid_thwc                 `    t          d |D                       } fd|D             }t          j        |d                               j        d          } j                            |          \  }}||                             d          }||                             d          }||fS )Nc              3   >   K   | ]\  }}}t          ||          V  d S r   max)r   r   r   r   s       rm   	<genexpr>z6Qwen3_VisionTransformer.rot_pos_emb.<locals>.<genexpr>  s0      >>'!QC1II>>>>>>rn   c                     g | ]X\  }}}|d k    r                     ||j                  n/                     ||j                                      |d           YS )r<   )r   r   repeat)r   tr   r   rk   s       rm   r   z7Qwen3_VisionTransformer.rot_pos_emb.<locals>.<listcomp>  s|     
 
 
 1a Avv Q4#:;;;!!!Q(?@@GG1MM
 
 
rn   r   r   Tnon_blockingr<   )r  r|   cattor   r   get_cos_sinr   )rk   r   max_grid_sizepos_idscossincos_combinedsin_combineds   `       rm   rot_pos_embz#Qwen3_VisionTransformer.rot_pos_emb  s    >>X>>>>>
 
 
 
 $	
 
 
 )G+++..t{.NN &22=AAS7|++A..7|++A..\))rn   c                    | j         }| j        }| j        j        }g }|D ]\  }}}t	          j        d|dz
  |t          j        | j                  }	t	          j        d|dz
  |t          j        | j                  }
|	                    t          j	                  }|
                    t          j	                  }t	          j
        |dz   |dz
            }t	          j
        |dz   |dz
            }|	|z
  }|
|z
  }t	          j        ||d          \  }}t	          j        ||d          \  }}t	          j        ||d          \  }}||z  }||z
  }||z
  }d|z
  |z
  }t	          j        ||||g          }t	          j        ||||g          }||z  }||z                       dd          }t	          j        ||||gd	                              ddd          }|                    | j        
          }|                     |          } | |z  } |                     d	          }!|!                    ||z  |||z  ||          }!|!                    ddddd                              dd|          }!|!                    |dd                              d|          }"|                    |"           t	          j        |d	          S )Nr   r<   )r   r   r  ij)indexing   rr   r  r   r\   r]   )r   r   r   embedding_dimr|   linspacefloat32r   r  longclampmeshgridr   r   r   sumpermuteexpandappendr  )#rk   r   r   m_size
hidden_dimoutputsr  r   r   h_idxsw_idxsh_floorw_floorh_ceilw_ceildhdwdh_griddw_gridh_floor_gridw_floor_gridh_ceil_gridw_ceil_gridw11w10w01w00h_gridw_grid
h_grid_idxindicesweightsembedscombinedrepeateds#                                      rm   fast_pos_embed_interpolatez2Qwen3_VisionTransformer.fast_pos_embed_interpolate  s    2(^1
 2	% 2	%GAq!^$q(!5=  F ^$q(!5=  F ii
++Gii
++G[12Ca2GHHHF[12Ca2GHHHF'!B'!B  %~b"tDDDGW).SW)X)X)X&L,',~fft'T'T'T$K G#CC-CC-Cg+#C[,k;!WXXF[,\;!WXXF"33J!F*33Ar::Gk3S#"6A>>>FFq"aPPGjjtzj22G^^G,,FgFzzaz((H''VVQ&[&* H  ''1aA66>>q"jQQHq"b1199"jIIHNN8$$$$ya((((rn   r   c                     t          j        g |j                  }| j        t          j        k    s| j        t          j        k    r'|dd          |d d         z
                                  }|S )N)r   r<   rr   )r|   zerosr   r   r;   r   r   r  )rk   r   r   s      rm   compute_attn_mask_seqlenz0Qwen3_VisionTransformer.compute_attn_mask_seqlen  sk     [J,=>>>
!5!@@@ $8$FFF$QRR.:crc?:??AAJrn   ro   c                    |                     | j        | j        d          }|                     |          }t	          |t
                    r#|}t          j        |t          j                  }n(|	                                }|
                                }|                     |          }||z   }|                     |          \  }}t          j        |d d df         |d d df         z  |d d df                                       dt          j                  }t          j        t          j        dt          j                  |g          }t#          j        |          }|                    d          }|                     |          }	|                     | j        d          }g }
t+          | j                  D ]c\  }} ||||||		          }|| j        v rE| j                            |          } | j        |         |          }|
                    |           d|                     |          }t#          j        |g|
z   d
          }|S )NT)r   r   r
  r  r<   r\   r   )r   r   r	  r   r  )r  r   r   r   
isinstancelistr   arrayint32tolistnumpyrA  r  r  cumsumconcatenaterC  r|   r   	unsqueezerD  	enumerater   r   indexr   r#  r   r  )rk   ro   r   hidden_statesgrid_thw_list
pos_embedsr   r   r   r   deepstack_feature_lists	layer_numblkdeepstack_merger_idxdeepstack_features                  rm   rw   zQwen3_VisionTransformer.forward  se   
 DKtzPTUU((77h%% 	($Mx999HH$OO--M~~''H44]CC
%
2151A1A-1P1P..Yx1~A>AOOVV"( W 
 

 ^RXarx%@%@%@*$MNN
%j11
%//2222:>>
]]4;T]BB
"$'44 	B 	BNIsC%#5#5%  M D999'+'D'J'J9'U'U$$TD$>?S$T!% %! (../@AAAM22	O551
 
 
 rn   r=  c                    g d}t          |                     d                    }t                      }|D ]\  }}|D ]>\  }}}	||vr|                    ||          }||         }
|
j        } ||
||	            n*||         }
t          |
dt                    } ||
|           |                    |           |S )N))	attn.qkv.zattn.q.q)rZ  zattn.k.k)rZ  zattn.v.vF)remove_duplicateweight_loader)dictnamed_parameterssetreplacer_  getattrr"   add)rk   r=  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr_  s               rm   load_weightsz$Qwen3_VisionTransformer.load_weightsF  s    "
 "
 "
 400%0HHII"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####rn   )r   Nr   )rx   ry   rz   r   floatr   r   ri   propertyr|   r   r   staticmethodr   r{   r}   r   rG  r  rA  rD  rw   r   tuplerb  ro  r~   r   s   @rm   r   r   8  s4        26\
 \
*\
 \
 )4/	\

 \
 
\
 \
 \
 \
 \
 \
| 2u{ 2 2 2 X2 3 3 3 3 X3 YtIs Is I I I I I  \I2*DcO * * * *$:)4S	? :)u| :) :) :) :)x
L
 

 
 
 
0<0 ,d3i00 
	0 0 0 0dHU33D-E$F 3s8        rn   r   c                   n    e Zd Zd ZdedefdZdedefdZdede	fdZ
ddd	d
ededededee	z  dz  deeef         fdZd!dededef fdZdedeeef         def fdZdedeeef         defdZdee         ej        z  dedefdZ	 	 d"deeef         dededz  dedz  dee         f
d Z xZS )#Qwen3VLProcessingInfoc                 @    | j                             t                    S r   )ctxget_hf_configr   r   s    rm   rx  z#Qwen3VLProcessingInfo.get_hf_configc  s    x%%m444rn   kwargsrc   c                 ^     | j         j        t          fd|                    dd          i|S )Nuse_fastT)rw  get_hf_processorr   poprk   ry  s     rm   r|  z&Qwen3VLProcessingInfo.get_hf_processorf  sC    (tx(
 
ZZ
D11
 
 
 	
rn   c                 &     | j         di |j        S N )r|  image_processorr~  s     rm   get_image_processorz)Qwen3VLProcessingInfo.get_image_processorm      $t$..v..>>rn   c                 &     | j         di |j        S r  )r|  video_processorr~  s     rm   get_video_processorz)Qwen3VLProcessingInfo.get_video_processorp  r  rn   r\   T)
num_frames	do_resizeimage_widthimage_heightr  r  r  Nc          	      V   ||dk    r|                                  }n||                                 }t          |t                    }|                                 }|j        }|j        }	|j        }
|j        }|rT|rt          }||d}n	t          }i } |d|||	|
z  |j        d         |j        d         d|\  }}t          ||          }nt          ||          }t          ||          }t          ||z  d          }|j        |	z  }|j        |	z  }||z  |z  }||
dz  z  }||fS )	Nr<   )r  temporal_factorshortest_edgelongest_edge)heightwidthfactor
min_pixels
max_pixels)r  r  r\   r  )r  r  rF  r   rx  r   r_   r   r`   video_smart_resizeimage_smart_resizesizer0   r:   r  r  r  )rk   r  r  r  r  r  is_video	hf_configr   r_   
merge_sizer`   r   extra_kwargsresized_heightresized_widthpreprocessed_sizepadded_num_framesgrid_tgrid_hgrid_wnum_patchesnum_vision_tokenss                          rm   _get_vision_infoz&Qwen3VLProcessingInfo._get_vision_infos  s    "zA~~"6688OO$"6688Oo/DEE&&((	!/"-
"5
+? 	R "1",':   
  2!,8L -#!!J.*/@*/?- - - -)NM !*n U U U )L Q Q Q$Z1DEE&*==qAA")Z7"(J6vo.'JM: "333rn   
max_tokensstart_num_framesc                 J    t                                          ||          S )N)r  )rh   _get_max_video_frames)rk   r  r  rl   s      rm   r  z+Qwen3VLProcessingInfo._get_max_video_frames  s*    ww,,)9 - 
 
 	
rn   seq_len	mm_countsc                 V    t                                          ||t                    S )N)max_frames_per_video)rh   !get_num_frames_with_most_featuresDUMMY_VIDEO_NUM_FRAMES)rk   r  r  rl   s      rm   r  z7Qwen3VLProcessingInfo.get_num_frames_with_most_features  s.    
 ww88Y5K 9 
 
 	
rn   c                     |                                  }|j        d         }|                     ||j        z            \  }}|                     ||dd           }|S )Nr  r  r\   r  r  r  r  )r  r  !get_image_size_with_most_featuresr`   get_num_video_tokens)rk   r  r  r  video_max_pixelstarget_widthtarget_heightnum_video_soft_tokenss           rm   get_max_video_tokensz*Qwen3VLProcessingInfo.get_max_video_tokens  s    
 2244*/? '+&L&L'?+NN 'M '
 '
#m !% 9 9$& 	 !: !
 !
 %$rn   r<  	video_fpsr  c                 @   t          |t                    s|                                }t          |          z  dk    r"||d         gt          |          z  z
  z  z   }fd|D             fdt	          dt                              D             S )Nr   rr   c                     g | ]}|z  S r  r  )r   idxr  s     rm   r   z?Qwen3VLProcessingInfo._calculate_timestamps.<locals>.<listcomp>  s    999#cIo999rn   c                 D    g | ]}|         |z   d z
           z   dz  S )r<   r\   r  )r   ir  
timestampss     rm   r   z?Qwen3VLProcessingInfo._calculate_timestamps.<locals>.<listcomp>  sE     
 
 
 ]ZJ(:;;q@
 
 
rn   )rF  rG  rJ  r   r   )rk   r<  r  r  r  s     ``@rm   _calculate_timestampsz+Qwen3VLProcessingInfo._calculate_timestamps  s     '4(( 	'nn&&Gw<<*$))c'llZ>W1W XXG9999999

 
 
 
 
1c*ooz::
 
 

 rn   metadataout_itemdo_sample_framessampled_fpsc                 0   |                                  }|j        }|d         }|d         }||                    dd          }|r|r|n|j        }|d         }	t	          |	|d         z  |z            }
t          t          t          |
|j                  |j                  |	          }
t          j
        d|	dz
  |
                                                              t                                                    }|                     |||          }|S )Nframes_indicesfpsr  Ftotal_num_framesr   r<   )r  r  getr  r{   minr  
min_frames
max_framesr   r  roundastyperJ  r  )rk   r  r  r  r  r  r  r<  r  r  r  r  s               rm   _get_video_second_idxz+Qwen3VLProcessingInfo._get_video_second_idx  s$    2244$/
+, UO	#'||,>FF
  	 *5M++/:MK'(:;-?+MNNJ
O$>??#.  ! J A/!3Z@@	  //JOO
rn   )r\   NN)rx   ry   rz   rx  objectr   r|  r   r  r   r  r{   r   rs  r0   r  r  r   r   r  r  rG  r|   r}   rp  r  r`  r   r,   r  r~   r   s   @rm   ru  ru  b  s_       5 5 5
 
4D 
 
 
 
?F ?7P ? ? ? ??F ?7L ? ? ? ? 54 54 54 54 	54
 54 54 35JJTQ54 
y#~	54 54 54 54n
 
 
s 
SV 
 
 
 
 
 



 38$
 
	
 
 
 
 
 
%% 38$% 
	% % % %(Cy5</<AOR   & )-$(' 'sCx.' '' +	'
 T\' 
c' ' ' ' ' ' ' 'rn   ru  c                       e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	ded	ed
edede
e         f
dZdS )Qwen3VLDummyInputsBuilderr  rc   c                 x    |                     dd          }|                     dd          }d}d}||z  ||z  z   S )Nimager   video+<|vision_start|><|image_pad|><|vision_end|>+<|vision_start|><|video_pad|><|vision_end|>)r  )rk   r  
num_images
num_videosimage_tokenvideo_tokens         rm   get_dummy_textz(Qwen3VLDummyInputsBuilder.get_dummy_text  sG    ]]7A..
]]7A..
CCZ'+
*BBBrn   Nr  
mm_optionsc                 ~   |                     dd          }|                     dd          }|r|                     d          nd }|r|                     d          nd }| j                                        \  }}	d}
|rst          |t                    sJ |j        }|rS||
k    rt                              d||
           |dk     rt                              d|           t          |
|          }
t          |
d          }
| j        
                                }|j        d         }| j                            ||j        z            \  }}| j                            |||
|	          \  }}|j        |j        }}|rt          |t                    sJ |j        }|r2||k    rt                              d
||           t          ||          }|j        }|r2||k    rt                              d||           t          ||          }|                     ||	||          |                     |||
|          dS )Nr  r   r  r\   z]video.num_frames override (%d) exceeds model's maximum number of frames (%d), will be ignoredzEvideo.num_frames override (%d) cannot be less than 2, will be ignoredr  r  r  zMvideo.width override (%d) exceeds model's maximum width (%d), will be ignoredzOvideo.height override (%d) exceeds model's maximum height (%d), will be ignored)r  r  r  	overrides)r  r  r  r  )r  r  )r  infor  rF  r   r  loggerwarningr  r  r  r  r`   r  r  r  _get_dummy_images_get_dummy_videos)rk   r  r  r  r  r  image_overridesvideo_overridestarget_image_widthtarget_image_heighttarget_num_framesnum_frames_overrider  r  target_video_widthtarget_video_heighttarget_video_sizer   width_overrideheight_overrides                       rm   get_dummy_mm_dataz+Qwen3VLDummyInputsBuilder.get_dummy_mm_data  s    ]]7A..
]]7A..
5?I*..111T5?I*..111T I7799 	0/
  	Po/@AAAAA"1"<" P&):::NNI+)	   '**NN2+  
 %((9;N$O$O! 1155)7799*/? I77+/RR 8   	0/
  $y99*,(+	  :  
  
1 #$ 0  	Po/@AAAAA,2N M!$666NN>&*	   &));^%L%L"-4O P"%888NN?'+	   '**=&O&O# ++(*%)	 ,   ++(*,%	 ,  
 
 	
rn   r  r  r  r  c                   t          j        |||dfdt           j                  }g }t          |          D ]P}d|dz  |d t          |          D             ddd}|                                |f}	|                    |	           Q|S )	Nr]      r  g       @c                     g | ]}|S r  r  )r   r  s     rm   r   z?Qwen3VLDummyInputsBuilder._get_dummy_videos.<locals>.<listcomp>~  s    "@"@"@1"@"@"@rn   opencvF)r  durationr  r  video_backendr  )r   fulluint8r   copyr#  )
rk   r  r  r  r  r  video_itemsr  video_metadata
video_items
             rm   r  z+Qwen3VLDummyInputsBuilder._get_dummy_videoso  s     UFA628LLLz"" 
	+ 
	+A&,$."@"@eJ.?.?"@"@"@!)$) N  **,,7Jz****rn   r   )rx   ry   rz   r   r   r{   r  r   r)   r  rG  r/   r  r  rn   rm   r  r    s        CS(9 Cc C C C C =A	^
 ^
^
 38$^
 C!112T9	^

 
^
 ^
 ^
 ^
@  	
   
i     rn   r  c            
            e Zd ZdefdZdedeeef         deeef         deeef         def
 fdZ	ded	eeef         deee
f         fd
Zded	eeef         dedee         fdZ xZS )Qwen3VLMultiModalProcessorrc   c                 f    t          | j                                        j        j        d          S )NT)video_needs_metadata)rL   r  rx  r   r   r   s    rm   _get_data_parserz+Qwen3VLMultiModalProcessor._get_data_parser  s3    *I##%%3F!%
 
 
 	
rn   promptmm_data	mm_kwargs
tok_kwargsc                    t          |          } | j        j        di |}|                    dg           x}rJg }g }|D ]}	|	\  }
t          di |}d|vr                    dd          |d<   t          di fdD             t                      }|
gg|d<   gg|d<   t                                          d|||          }|                    d          }|j        	                    |          d	         }|
                    d|d
          }|                    |d                    |                    |d                    t          t          j        |          t          j        |                    }nt                      }t                                          ||||          }t          |fi |}t          |          S )Nvideosr  Fc                 .    i | ]}|d k    ||         S )r  r  )r   r\  r  s     rm   
<dictcomp>zAQwen3VLMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>  s*    SSS!1@R;R;Rq(1+;R;R;Rrn   r  r  )r  r  r  r  	input_idsr   r<   video_grid_thwpixel_values_videos)r  r  r  )r`  r  r|  r}  r  r   rh   _call_hf_processor	tokenizerbatch_decoderc  r#  r|   r  r   )rk   r  r  r  r  	processorr	  video_grid_thw_lstpixel_values_videos_lstitemvideo_arrayvideo_mm_kwargsvideo_mm_datavideo_outputsr  video_placeholderprocessed_outputscombined_outputsr  rl   s                     @rm   r  z-Qwen3VLMultiModalProcessor._call_hf_processor  sV    w--.DI.;;;;	 [[2...6 3	#!#&(# )U )U(,%X #'"3"3"3"3%_<< ;C,,*E; ;O$67 )  SSSSxSSS  !%,7=/h'4<:,./ % : :H)-)	 !; ! ! *--k::	$-$7$D$DY$O$OPQ$R!A%  #))-8H*IJJJ'..}=R/STTTT $)I.E$F$F$y);<<  MM
 !FFM!GG66!	 7 
 
  
 

 
 ,---rn   	hf_inputshf_processor_mm_kwargsc                 t     t          | j                                        j        j                  |          S r   )rN   r  rx  r   r   )rk   r  r  s      rm   _get_mm_fields_configz0Qwen3VLMultiModalProcessor._get_mm_fields_config  s=    

,I##%%3F
 

  	rn   mm_itemsout_mm_kwargsc           
         	
   j         j        d	i   j         j        d	i } j                                         
 j                                         }|j        |j        |j        |j        dz  	dt          f	fd}dt          f	 
f	d}t          dj        |          t          dd|          gS )
Nr\   item_idxc                     d         |          }|d         j         }t          |t          j                  sJ t	          |                                          z  }j        g|z  S )Nr  image_grid_thw)datarF  r|   r}   r{   prodimage_token_id)r$  r  r   
num_tokenshf_processormerge_lengthr"  s       rm   get_image_replacement_qwen3vlzUQwen3VLMultiModalProcessor._get_prompt_updates.<locals>.get_image_replacement_qwen3vl  sc    $W-h7H 016Hh55555X]]__--=J /0:==rn   c                 `  	 d         |          }|d         j         }t          |t          j                  sJ d         |          \  }}                    d          }                    d          }t          |t                    r||          }j                            ||||          }t          |          |d         k    s$J dt          |           d|d          d            fd	|D             }t          |d
d                                                    z  fd|D             }	j        j                                        j        }
|
|
dk    rt          t          |          |
          }t          |          dk    rg }	nt          |          d
k    rg}	n}t!          ||z
  d          }|t          |          d
z
  z  }|t          |          d
z
  z  }|g}	t#          d
t          |                    D ]'}||d
z
  |k     rd
ndz   }|	                    |           (g }t'          |          D ]X\  }}|                    |           |	|t          |	          k     r|nd         }|                    gg|z  z   gz              Yt+          j        |          S )Nr  r  r  r  r   zThe timestamps length(z ) should be equal video length (z).c                 H    g | ]}                     d |ddd          S )<z.1fz	 seconds>F)add_special_tokens)encode)r   	curr_timer  s     rm   r   ziQwen3VLMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement_qwen3vl.<locals>.<listcomp>  sL             !=Y!=!=!=!=RW XX     rn   r<   c                     g | ]}S r  r  )r   r   tokens_per_frames     rm   r   ziQwen3VLMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement_qwen3vl.<locals>.<listcomp>  s    %Q%Q%Q1&6%Q%Q%Qrn           rr   )r'  rF  r|   r}   r  r9   rp  r  r  r   r{   r(  rw  get_mm_configvideo_pruning_rater&   r  r   r#  rO  extendr7   select_token_id)r$  r  r   r  r  r  r  r  frames_idx_tokenper_frame_token_countsr8  total_retainedfirst_frame_tokensremaining_tokensbase	remainder	frame_idxextraplaceholdertimestamp_tokenstokens_this_framer5  r  r,  r!  r"  rk   r  video_token_idvision_end_token_idvision_start_token_ids                        @rm   get_video_replacement_qwen3vlzUQwen3VLMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement_qwen3vl  sX   $W-h7H 016Hh55555&w/9OE8599:LMM044U;;K+u-- 4)(388($4k J z??hqk1111Z 1 1!)!1 1 1 211
       !+       #8ABB<#4#4#6#677<G%Q%Q%Q%Q@P%Q%Q%Q"!%!<!<!>!>!Q!-2Ds2J2J!>$())&" "
 '((A---/**)**a//.>-?**)9&'*><N+NPQ'R'R$+4D0E0E0IJD 0C8H4I4IA4M NI.@-A*%*1c2B.C.C%D%D = =	 $i!my-H-Ha P.55e<<<<K/89I/J/J 	 	+	+""#3444$:!*S1G-H-H!H!HIIb%! ""*+%&)::;*+,   
 '6{NSSSrn   r  )modalitytargetreplacementr  r  r  )r  r|  r  get_tokenizerrx  rG  rI  rH  r  r{   r5   r  )rk   r!  r  r"  r  r  r-  rJ  r+  r,  r  rG  rH  rI  s   ````    @@@@@@rm   _get_prompt_updatesz.Qwen3VLMultiModalProcessor._get_prompt_updates  sd    2ty1KK4JKK7$)7QQ:PQQI++--	I++--	"1 ) ?';&114	>C 	> 	> 	> 	> 	> 	> 	> 	>:	TC :	T :	T :	T :	T :	T :	T :	T :	T :	T :	T :	T :	T :	T :	Tz  #/9    D9  
 	
rn   )rx   ry   rz   r2   r  r   r   r  r   r  r+   r   r1   r   r-   r   r6   rO  r~   r   s   @rm   r   r     sH       
"6 
 
 
 
K.K. f%K. 3;'	K.
 CK(K. 
K. K. K. K. K. K.Z !(V 4 
++	,	   b
%b
 !(S 1b
 -	b

 
,	b
 b
 b
 b
 b
 b
 b
 b
rn   r   rr   r  	positionsintermediate_tensorsinputs_embedsdeepstack_input_embeds)dynamic_arg_dimsc                        e Zd Zdddedef fdZ	 	 	 ddej        dej        d	edz  d
ej        dz  dedz  dej        ez  fdZ	 xZ
S )Qwen3LLMModelr   r   vllm_configr   c                    t                                          ||           t                      j        s6| j        t          |j        j        j        j	                  k    sJ d            d S d S )NrY  r   zLstart_layer should be greater than or equal to len(deepstack_visual_indexes))
rh   ri   r   is_first_rankstart_layerr   model_configr  r   r   )rk   rY  r   rl   s      rm   ri   zQwen3LLMModel.__init__V  s    [@@@~~+ 	#s(2@Y( (   0  	 	 rn   Nr  rQ  rR  rS  rT  rc   c                 b   t                      j        r||}n|                     |          }d }n|J |d         }|d         }g }t          t	          | j                  | j        | j                  D ]e\  }	}
|	| j        v r|	                    ||z               |
|||          \  }}|-|	t          dt          |                    v r||d|	          z   }ft                      j        st          ||d          S |                     ||          \  }}t          |          dk    r||fS |S )NrQ  residualr   deepstack_input_embeds_)rQ  r`  )r   r\  embed_input_idsr
   rO  layersr]  	end_layeraux_hidden_state_layersr#  r   r   is_last_rankr8   r   )rk   r  rQ  rR  rS  rT  rQ  r`  aux_hidden_statesr   layerr   s               rm   rw   zQwen3LLMModel.forward`  s    >>' 		8( - $ 4 4Y ? ?HH'3330AM+J7H &dk""D$4dn!
 !
 	 	Iu D888!(()ABBB&+e' '#M8 &1i53-..D D 7 7 ",-Ry-R-RST 
 ~~* 	&"/XFF    99]H==q !!A%% "333rn   )NNN)rx   ry   rz   r   r   ri   r|   r}   r8   rw   r~   r   s   @rm   rW  rW  I  s         BD   z 3       <@-1=A1 1<1 <1 2D8	1
 |d*1 !4d :1 
+	+1 1 1 1 1 1 1 1rn   rW  c                   .     e Zd Zdddedef fdZ xZS )Qwen3LLMForCausalLMr   rX  rY  r   c                   t          t          |                                            |j        j        j        }|j        }|| _        || _        t          |t          |d                    | _
        t                      j        r<|j        r| j
        j        | _        n6t!          |j        |j        |d          | _        nt'                      | _        t)          |j                  | _        | j
        j        | _        d S )Nmodelr[  lm_head)r   r   )rh   rO   ri   r^  r  text_configr   configrW  rU   rl  r   rf  tie_word_embeddingsembed_tokensrm  r!   
vocab_sizerb   rR   r   logits_processormake_empty_intermediate_tensors)rk   rY  r   ro  r   rl   s        rm   ri   zQwen3LLMForCausalLM.__init__  s    %%..000)3?"/("#L,I,I
 
 

 >>& 	,) #z6-%&!-$	      *++DL /0A B B J6 	,,,rn   )rx   ry   rz   r   r   ri   r~   r   s   @rm   rj  rj    sX        AC 
 
 
z 
3 
 
 
 
 
 
 
 
 
 
rn   rj  )r  dummy_inputsc                   b    e Zd Zg dddgdgdZdZ eddd	d
          Zedede	dedz  fd            Z
dddedef fdZdee	df         ddfdZdee	df         fdZde	dedz  fdZdej        ddfdZde	ddfdZdededz  fd Zdededz  fd!Zd"edeej        df         fd#Zd$edeej        df         fd%Zd&eej        df         d"edeej        df         fd'Zd(eej        df         d$edeej        df         fd)Zdedefd*Z d+e!e	         d,e!e"         de#ee	e	e	f                  fd-Z$d.e%d/e	de!ej                 dz  fd0Z&d.e%d/e	de!e	         dz  fd1Z'd.e%d/e	de!e	         dz  fd2Z(d3e!e	         d4eej        df         d5ej)        d6e	deeej        df         ej        e	f         f
d7Z*d+e!e	         d,e!e"         deej        e	f         fd8Z+dede,dz  fd9Z-d:ej        d4e,d;ej        deej        e,f         fd<Z.	 dMdd=d>d3ej        d4e,dz  d;ej        dz  d?e/dej        f
d@Z0	 	 dNd3ej        dAej        dBedz  d:ej        dz  dedej        ez  fdCZ1dDej        dej        dz  fdEZ2dFe3eeej        f                  de4e         fdGZ5de6fdHZ7dIe	de	fdJZ8dKe	de	fdLZ9 xZ:S )OQwen3VLForConditionalGeneration)q_projk_projv_proj	gate_projup_projqkv)qkv_projgate_up_projr}  Tvisual.zlanguage_model.lm_head.zlanguage_model.model.)zmodel.visual.zlm_head.zmodel.language_model.)orig_to_new_prefixrK  r  rc   Nc                 |    |                     d          rdS |                     d          rdS t          d          )Nr  r  r  r  z)Only image or video modality is supported)
startswith
ValueError)clsrK  r  s      rm   get_placeholder_strz3Qwen3VLForConditionalGeneration.get_placeholder_str  sK    w'' 	A@@w'' 	A@@DEEErn   rl  rX  rY  r   c          
         t                                                       j        j        j        }j        j        }| _        || _        |j        dk    | _        |j	        | _	        |
                                | _
        t          j        d          | _        | j        rt          j        j                  nd| _        j        j        | _        | j        | j        z  | _        |                     ddh          5  t+          j        t-          dd          |t/          |d          	          | _        | j        r&fd
t3          | j                  D             | _        d d d            n# 1 swxY w Y   |                               5  t9          t/          |d                    | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nr'  r   r   r  r  rms_norm_epsr   visual)r   r   r   c                 b    g | ]+}t          j        j        j        j        j                  ,S r  )r|   rC  scheduler_configmax_num_batched_tokensrn  rb   )r   r   ro  rY  s     rm   r   z<Qwen3VLForConditionalGeneration.__init__.<locals>.<listcomp>  sI     / / /
 	 K#4K*6 / / /rn   language_modelr[  )rh   ri   r^  r  r   multimodal_configro  mm_encoder_tp_moder   r8  is_multimodal_pruning_enabledhasattrr   use_deepstackr   r   deepstack_num_levelr   
visual_dimmultiscale_dim_mark_tower_modelr   rd  rU   r  r   rT  _mark_language_modelrj  r  rt  )rk   rY  r   r   r  ro  rl   s    `   @rm   ri   z(Qwen3VLForConditionalGeneration.__init__  s    + 8 B"/'4F!2!2!E!O"3"F;;== 	* %V%9;UVV !C$=>>> 	 
 !.>"o0HH##K'71CDD 	 	1$ >>)#FH55	  DK ! / / / / /
 #4#;<</ / /+	 	 	 	 	 	 	 	 	 	 	 	 	 	 	$ &&{33 	 	"5'VEU0V0V# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   A(E88E<?E<%G		GGrc  .c                 (    || j         j        _        d S r   )r  rl  re  )rk   rc  s     rm   set_aux_hidden_state_layersz;Qwen3VLForConditionalGeneration.set_aux_hidden_state_layers  s    <B!999rn   c                 T    t          | j        j        j                  }d|dz  |dz
  fS )Nr\   r]   )r   r  rl  rc  )rk   
num_layerss     rm   "get_eagle3_aux_hidden_state_layerszBQwen3VLForConditionalGeneration.get_eagle3_aux_hidden_state_layers  s-    ,29::
:?JN33rn   r*  c                      t           dd           sd S t           fdt           j                  D                       S )NrT  c                 B    i | ]}d | j         |         d         S )ra  N)rT  )r   r  r*  rk   s     rm   r  zOQwen3VLForConditionalGeneration._get_deepstack_input_embeds.<locals>.<dictcomp>&  sI         0#//1LS1QZK2  rn   )rd  r8   r   r  )rk   r*  s   ``rm   _get_deepstack_input_embedsz;Qwen3VLForConditionalGeneration._get_deepstack_input_embeds  sl     t5t<< 	4 #     !!9::	  
 
 	
rn   rT  c                 x    t           dd           sd S |                    d           j        d                             d          k    r& fdt           j                  D              _        t           j                  D ]0} j        |         d                              ||                    1d S )NrT  r<   r   c           	          g | ]I}t          j        j        j        j        j        d          j        j        d          j                  JS )r   r   r   )r|   rC  ro  rn  rb   rT  r   r   )r   r   r*  rk   s     rm   r   zOQwen3VLForConditionalGeneration._set_deepstack_input_embeds.<locals>.<listcomp>5  sf     + + +  K+76q9@5a8>	  + + +rn   )rd  r  rT  r   r  copy_)rk   rT  r  r*  s   `  @rm   _set_deepstack_input_embedsz;Qwen3VLForConditionalGeneration._set_deepstack_input_embeds.  s    t5t<< 	F ,0033
3A6;;A>>>>+ + + + + t788+ + +D' 122 	 	C',[j[9??&s+   	 	rn   c                     t          | dd           sd S |dk    r>t          | j                  D ]+}| j        |         d |                                          *d S d S )NrT  r   )rd  r   r  rT  zero_)rk   r*  r  s      rm   _clear_deepstack_input_embedsz=Qwen3VLForConditionalGeneration._clear_deepstack_input_embedsC  s|    t5t<< 	F >>T566 F F+C0*=CCEEEE >F Frn   ry  c                     |                     dd           }|                     dd           }|                     dd           }||d S |t          d||          S |t          d||          S d S )Npixel_valuesimage_embedsr&  )typer  r&  )r  r  r&  )r}  rH   rF   )rk   ry  r  r  r&  s        rm   _parse_and_validate_image_inputz?Qwen3VLForConditionalGeneration._parse_and_validate_image_inputL  s     zz.$77zz.$77$4d;;L$84#-#)-    #1#)-    $#rn   c                    |                     dd           }|                     dd           }|                     dd           }|                     dd           }||d S |t          d|||          S |t          d||          S d S )Nr  video_embedsr  second_per_grid_ts)r  r  r  r  )r  r  r  )r}  rK   rI   )rk   ry  r  r  r  r  s         rm   _parse_and_validate_video_inputz?Qwen3VLForConditionalGeneration._parse_and_validate_video_inputd  s     %jj)>EEzz.$77$4d;;#ZZ(<dCC&<+?4*-*$7-#5	    #1#)-    $#rn   image_inputc                    |d         }|j         dk    sJ |d         dk    r&|d                             | j        j                  }nm|d                             | j        j                  }| j        r*t          | j        ||                                d          S |                     ||          }| j        j        }|                    d	          |z  |z                                  }|	                    |          S )
Nr&  r\   r  r  r  rope_3d	rope_typer   rr   )
ndimr  r  r   r   rX   rJ  r   r(  split)rk   r  r   r  r  r  sizess          rm   _process_image_inputz4Qwen3VLForConditionalGeneration._process_image_input~  s    /0}!!!!v.00&~6;;DK<MNNLL&~6;;DK<MNNL% L8Kx/@/@I     ${{<({KK [3
r""j0J>FFHH!!%(((rn   video_inputc                 
   |d         }|j         dk    sJ |d         dk    r&|d                             | j        j                  }no|d                             | j        j                  }| j        r,|                                }t          | j        ||d          S |                     ||          }| j        j        }|                    d	          |z  |z                                  }|	                    |          S )
Nr  r\   r  r  r  r  r  r  rr   )
r  r  r  r   r   rJ  rX   r   r(  r  )rk   r  r   r  r  rR  r  r  s           rm   _process_video_inputz4Qwen3VLForConditionalGeneration._process_video_input  s    /0}!!!!v.00&~6;;DK<MNNLL"-.C"D"I"I!# # % S ( 1 18K!4my     ${{+>{RR [3
r""j0J>FFHH!!%(((rn   image_embeds_splitc                 N   | j         j        }|d         }|                                }g }t          ||          D ]Z\  }}t	          ||                              |j                  }	t          j        ||	gd          }|	                    |           [|}t          |          S )a  
        Append mrope positions for each for images.
        This is necessary to recover correct mrope
        positions after video pruning

        Args:
            image_embeds_split: Tuple of image embeddings for
                each image item.
            image_input: Image input data.

        Returns:
            Tuple of image embeddings for each image item.
            Resulting embeddings will have extra 4 channels for
            computed mrope positions.
        r&  r<   r  )r  r   rJ  zipr%   r  r   r|   r  r#  rs  )
rk   r  r  r  r   rR  image_embeds_outembr  rQ  s
             rm   _postprocess_image_embeds_evsz=Qwen3VLForConditionalGeneration._postprocess_image_embeds_evs  s    ( [3
/0 ))/?? 	) 	)IC/jAADDSZPPI)S),!444C##C((((-'(((rn   video_embeds_splitc                 (   |d         }|j         dk    sJ |                                }| j        j        }|                    d          }|.t          j        t          |          t
          j                  }n|                                }t          | j
        j        dd          }g }t          |||          D ]F\  }	}
}t          |	|
| j        j        | j                  }t                              d	|	j        d
         |                                                                |
d
         |
d         |
d         | j        d|                                                                                                z
  dz             t-          |
|||                                                              |	j                  }|	|         }	||         }t          j        |	|gd          }	|                    |	           Ht7          |          S )a  
        Prunes video embeddings via Efficient Video Sampling (EVS)
        and then appends mrope positions for each retained embeddings

        Args:
            video_embeds_split: Tuple of video embeddings for each video item.
            video_input: Video input data.

        Returns:
            Tuple of video embeddings for each video item.
            Resulting embeddings will have extra 4 channels for
            computed mrope positions.
        r  r\   r  Nr  tokens_per_secondg      ?)r   r[  z\EVS: Video tokens pruned from %d to %d (T=%d,H=%d,W=%d, pruning_rate=%.2f, reduction=%.1f%%)r   r<   d   )r  video_second_per_gridr  )r  rJ  r  r   r  r|   onesr   r  rd  ro  r   r  r'   r8  r  debugrs   r   r  rp  meanr%   r  r   r  r#  rs  )rk   r  r  r   rR  r  r  r  video_embeds_outr  r  video_second_per_grid_tretention_maskrQ  s                 rm   _postprocess_video_embeds_evsz=Qwen3VLForConditionalGeneration._postprocess_video_embeds_evs  s    $ /0}!!!! ))[3
 )__-ABB% "'C,>,>ej!Q!Q!Q!3!8!8!:!:#DK$=?RTWXX25/A3
 3
 "	) "	).C. 4#';#A)	  N LL7	!""$$))++QQQ'^))++002277999S@
 
 
 0"3&=&B&B&D&D	  
 bnn  n%C!.1I)S),!444C##C((((%&&&rn   c                 t    i }|D ]2}|dv rd|vr | j         di ||d<   |dv rd|vr | j        di ||d<   3|S )N)r  r  r  )r  r  r  r  )r  r  )rk   ry  mm_input_by_modality	input_keys       rm   %_parse_and_validate_multimodal_inputszEQwen3VLForConditionalGeneration._parse_and_validate_multimodal_inputs  s    ! 	 	I===#7770T0T 1 11 1$W- DDD#7770T0T 1 11 1$W- $#rn   input_tokensmm_featuresc              #     K   | j         j        }| j         j        j        }t	          |d           D ]Y}|j        j        }|j        dk    rI|j        d         j        	                                \  }}}	|dk    sJ d|             |||z  |	|z  fV  c|j        dk    r|j        d         j        	                                \  }}}	||z  }
|	|z  }t          | d	          o| j        d
uo
| j        dk    }|rE|                     |j        |          }||D ]}||z   |
|fV  t          d| j         d          t          |          D ]'}|                    ||          }||
|fV  ||
|z  z  }(Dt!          d|j                   d
S )a  
        Iterate over multimodal features and yield grid information.

        For videos with EVS (Efficient Video Sampling) enabled, this function
        computes the offset based on the pruned token count rather than relying
        on input_tokens.index(), which would fail when tokens are pruned.

        Args:
            input_tokens: List of token IDs in the prompt
            mm_features: List of multimodal feature specifications

        Yields:
            Tuple of (offset, grid_h, grid_w) for each frame/image
        c                     | j         j        S r   )mm_positionoffset)fs    rm   <lambda>zAQwen3VLForConditionalGeneration.iter_mm_grid_hw.<locals>.<lambda>:  s    AM<P rn   )keyr  r&  r<   zImage must have 1 frame, got r  r  r8  Nr6  zEVS is enabled (pruning_rate=z[) but is_embed mask is missing from mm_position. This indicates a bug in prompt processing.zUnsupported modality: )ro  rG  r   r   sortedr  r  rK  r'  rJ  r  r8   _extract_frame_offsets_from_maskr   r   rP  r  )rk   r  r  rG  r   
mm_featurer  r  r   r   
llm_grid_h
llm_grid_wis_evs_enabledframe_offsets
rel_offsetr   s                   rm   iter_mm_grid_hwz/Qwen3VLForConditionalGeneration.iter_mm_grid_hw'  sC     " 3![6I 2P2PQQQ *	Q *	QJ+2F"g--$/*:;@GGII1aAvvvBqBBvvva#55q<N7NNNNNN$//$/*:;@GGII1a"44
"44
 D"677 6/t;6/#5  " :$($I$I".% %M %0*7 N NJ"(:"5z:"MMMMM 
 'E8O E E E   #1XX : :!-!3!3NF!K!K$j*<<<<*z"99:
 !!O*:M!O!OPPPU*	Q *	Qrn   r  expected_framesc                 
   t          |dd          }|dS t          j        |t          j                                      d          }t          j        |d                                          }|                                dk    rdS |                                dk    r|g}nt          j        |          }t          j        |dk    d                                          }|                                dk    r|g}n:t          j	        ||
                    d                                                    }t          |          |k     r+t                              d	t          |          |           dS |d|         S )
a  Extract contiguous segments from EVS is_embed mask.

        The EVS (Efficient Video Sampling) mask marks which placeholder
        positions should be filled with video embeddings. This method splits
        the mask into contiguous segments, where each segment represents one
        retained frame.

        This is a pure function - it does not modify any state and always
        returns the same output for the same input (idempotent).

        Args:
            mm_position: MultiModal position containing the is_embed mask
            expected_frames: Expected number of frame segments

        Returns:
            List of tensors, each containing indices for one frame segment,
            or None if EVS is not enabled or validation fails.
        is_embedNr  rr   F)as_tupler   r<   z8EVS mask segments (%d) do not match expected frames (%d))rd  r|   	as_tensorr   rt   nonzeror   numeldifftensor_splitre  rJ  r   r  r  )	rk   r  r  is_embed_maskmask_tensortrue_indicessegmentsdiffssplit_pointss	            rm   _get_evs_mask_segmentsz6Qwen3VLForConditionalGeneration._get_evs_mask_segmentsf  sx   *  Z>> 4 om5:FFFKKBOO}[5AAAIIKK1$$4 1$$$~HHJ|,,E =!eDDDLLNNL!!##q(((> - ,"2"21"5"5"<"<">"> 
 x==?**LLJH  
 4(())rn   c                 N    |                      ||          }|dS d |D             S )a  Return relative offsets for each EVS-retained frame.

        The prompt processor stores a boolean mask inside ``mm_position`` that
        marks which placeholder locations should be populated with video
        embeddings. By splitting that mask into contiguous runs we can recover
        the start of every retained frame without probing ``input_tokens``.

        Args:
            mm_position: MultiModal position containing the is_embed mask
            expected_frames: Expected number of frames

        Returns:
            List of starting offsets (relative to mm_position) for each frame,
            or None if EVS is not enabled.
        Nc                 \    g | ])}t          |d                                                    *S )r   )r{   r  )r   segments     rm   r   zTQwen3VLForConditionalGeneration._extract_frame_offsets_from_mask.<locals>.<listcomp>  s.    ???7GAJOO%%&&???rn   r  rk   r  r  r  s       rm   r  z@Qwen3VLForConditionalGeneration._extract_frame_offsets_from_mask  s8    $ ..{OLL4??h????rn   c                 N    |                      ||          }|dS d |D             S )a  Return actual token count for each EVS-retained frame.

        This function calculates the actual number of tokens per frame by
        analyzing the is_embed mask, accounting for EVS pruning. Each frame
        may have a different token count due to content-aware pruning.

        Args:
            mm_position: MultiModal position containing the is_embed mask
            expected_frames: Expected number of frames

        Returns:
            List of token counts for each frame, or None if EVS is not enabled.
        Nc                 ,    g | ]}t          |          S r  r   )r   segs     rm   r   zRQwen3VLForConditionalGeneration._get_actual_frame_token_counts.<locals>.<listcomp>  s    ---SC---rn   r  r  s       rm   _get_actual_frame_token_countsz>Qwen3VLForConditionalGeneration._get_actual_frame_token_counts  s8      ..{OLL4--H----rn   r  multimodal_embeddingsmrope_positionsnum_computed_tokensc           	      V   | j         j        }| j         j        }| j         j        }t	          |          r|d         j        n|j        }t          j        ||t          j                  }	d |D             }
d |D             }t          |	||||||          \  }}t          |
          ||fS )a&  
        Update part of input mrope positions (starting with
        num_computed_tokens index). Original mrope_positions are computed
        for unpruned sequence and becomes incorrect once pruning occurs,
        so once we prune media tokens we should reflect this in the
        mrope_positions before we feed it to LLM.

        Args:
            input_ids: (N,) All input tokens of the prompt (Containing
                entire sequence).
            multimodal_embeddings: Tuple of multimodal embeddings.
            mrope_positions: Existing mrope positions (3, N) for entire
                sequence
            num_computed_tokens: A number of computed tokens so far.

        Returns:
            Tuple of (multimodal_embeddings, mrope_positions,
                mrope_position_delta).
        r   r  c                 *    g | ]}|d d d df         S )Nr  r   mms     rm   r   zMQwen3VLForConditionalGeneration.recompute_mrope_positions.<locals>.<listcomp>  s(    HHHBR3B3ZHHHrn   c                 v    g | ]6}|d d dd f                              dd                                          7S )Nr  r<   r   )r!  r  r  s     rm   r   zMQwen3VLForConditionalGeneration.recompute_mrope_positions.<locals>.<listcomp>  sQ     
 
 
02Bqqq"##vJq!$$))++
 
 
rn   )ro  r)  rG  rI  r   r   r|   r  r  r(   rs  )rk   r  r   r  r  r)  rG  rI  r   input_ids_tmm_embeddings_outmm_embeddings_posrQ  mrope_positions_deltas                 rm   r(   z9Qwen3VLForConditionalGeneration.recompute_mrope_positions  s    4 33 $ A
 ())(!!$++ ' 	 oiejQQQHH2GHHH
 
6K
 
 
 ,E!,
 ,
(	( &''4IIIrn   c                 Z   i }|D ]}|j         dk    rt          | d          o| j        d uo
| j        dk    }|r`|j        d         j                                        d         }|                     |j        |          }|
J d            |||j        j        <   g }d}	i }
|                     ||          D ]e\  }}}||	z
  }t          |          dk    r|d         
                                dz   nd}d }|D ]
}||k    r|}|u||v sJ d	| d
            ||
vrd|
|<   ||         }|
|         }|t          |          k     sJ d| dt          |           d            ||         }|
|xx         dz  cc<   n||z  }t          j        t          j        |          d|f          |z   }|                    |           ||z  }t          j        d||f                              dd          }|d d d |f         |z   }|                    |           ||z   }	g|	t          |          k     rt          |          dk    r|d         
                                dz   nd}t          |          |	z
  }t          j        t          j        |          d|f          |z   }|                    |           t          j        |d                              dd          }|
                                dz   t          |          z
                                  }t'          j        |          |fS )Nr  r8  r6  r  r   zGEVS enabled but failed to extract frame token counts from is_embed maskrr   r<   zFound base_offset z" but not in frame_token_counts_mapzEVS frame index z out of range (total frames: )r]   r   )rK  r  r8  r'  rJ  r  r  r  r  r   r  r   r   r   r#  r<  r   rM  r  r|   r   )rk   r  r  frame_token_counts_mapr  r  r  token_countsllm_pos_ids_liststframe_counts_idxr  r  r  text_lenst_idxbase_offsetfeat_offsetcountsr  actual_frame_tokenstext_positionsgrid_indicesframe_positionsfinal_text_positionsllm_positionsmrope_position_deltas                              rm   get_mrope_input_positionsz9Qwen3VLForConditionalGeneration.get_mrope_input_positions  s     "$% 	Y 	YJ"g--D"677 6/t;6/#5 
 " 	Y"(89>EEGGJA#'#F#F".$ $L (33- 433 MY*:+A+HI.2.B.B+/
 /
 0	. 0	.*FJ
 {H7:;K7L7Lq7P7P%b)--//!33VWF K5 . .[(("-K&"&<<<<XXXX =<< &66645$[1/<&{3S[[(((WsWWVWWW )(( '-Sk# ---2---- '1:&=# 	( 3 3a]CCfL  ##N333hF :q*j&ABBJJ1bQQL*111.B/B.B+BCfLO##O444 --BB L!!!!7:;K7L7Lq7P7P%b)--//!33VWF<((2-H	( 3 3a]CCfL ! ##$8999'7a@@@HHBOO - 1 1 3 3a 7#l:K:K KQQSS..0DDDrn   c                 n    | j         di |}|sd S d}|D ]}||         }|dk    rD|                     |          }| j        r|                     ||          }|t	          |          z  }|dk    rD|                     |          }| j        r|                     ||          }|t	          |          z  }|S )Nr  r  r  )r  r  r  r  rs  r  r  )rk   ry  r  r   rK  multimodal_inputimage_embeddingsvideo_embeddingss           rm   embed_multimodalz0Qwen3VLForConditionalGeneration.embed_multimodala  s   ItISSFSS# 	4 ;= - 	A 	AH3H=7""#'#<#<=M#N#N 5 '+'I'I(*:( ($ &/?)@)@@%7""#'#<#<=M#N#N 5 '+'I'I(*:( ($ &/?)@)@@%$$rn   rS  is_multimodalc                 (   d |D             }t          j        |d          }t          j        || j        | j        gd          \  }}t          j        ||d          }t          j        ||d          }|                    |                    d          | j        |                    d          z            }t          |||          }|	                    |j
        d         | j        | j                  }|                    ddd          }||fS )Nc                 ,    g | ]}t          |          S r  r  )r   ro   s     rm   r   zMQwen3VLForConditionalGeneration._compute_deepstack_embeds.<locals>.<listcomp>  s    ===!s1vv===rn   r   r  rr   r<   rS  r   r&  r\   )r|   r  r  r  r  	new_zerosr  r  rT   rt   rs   r!  )	rk   rS  r   r&  visual_lensmultimodal_embeddings_catmultimodal_embeddings_main multimodal_embeddings_multiscalerT  s	            rm   _compute_deepstack_embedsz9Qwen3VLForConditionalGeneration._compute_deepstack_embeds~  sV    >='<===$)I.C$K$K$K!
 K%_d12
 
 
	
&, !&&!
 !
 !
 ,1;,kq,
 ,
 ,
( "/!8!8q!!4#;m>P>PQR>S>S#S"
 "
 ">0"B'"
 "
 "

 "8!<!<"D$<do"
 "
 "8!?!?1a!H!H%'<<<rn   Fr&  handle_oov_mm_tokenr1  c                4   |                      || j        j        ||          }|t          |          dk    r|S t	          |          }| j        r|                     |||          \  }}nd }t          |||          }||                     |           |S )Nr0  r   r)  )	_embed_text_input_idsr  rb  r   rD   r  r/  rT   r  )rk   r  r   r&  r1  rS  rT  s          rm   rb  z/Qwen3VLForConditionalGeneration.embed_input_ids  s     22/' 3	 3 
 
 !(C0E,F,F!,K,K  .}== 
	* ..+&;+ /  &%% &*"4'"7'
 
 
 "-,,-CDDDrn   rQ  rR  c                 F   |d}|<t                      j        r)|                     |                    d                    }nd}| j                            |||||          }|;t                      j        r(|                     |                    d                     |S )a  Run forward pass for Qwen3VL.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen3VL
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,).
            intermediate_tensors: Intermediate tensors from previous pipeline
                stages.
            inputs_embeds: Pre-computed input embeddings.
            **kwargs: Additional keyword arguments including:
                - pixel_values: Pixel values to be fed to a model.
                    `None` if no images are passed.
                - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in
                    LLM. `None` if no images are passed.
                - pixel_values_videos: Pixel values of videos to be fed to a
                    model. `None` if no videos are passed.
                - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in
                    LLM. `None` if no videos are passed.
        Nr   rP  )r   r\  r  r  r  rl  r  )rk   r  rQ  rR  rS  ry  rT  rQ  s           rm   rw   z'Qwen3VLForConditionalGeneration.forward  s    @  + M$)E$%)%E%E""1%%& &"" &*"+11!5'#9 2 
 
 $)E$..}/A/A!/D/DEEErn   rQ  c                 6    | j                             |          S r   )r  compute_logits)rk   rQ  s     rm   r6  z.Qwen3VLForConditionalGeneration.compute_logits
  s     "11-@@@rn   r=  c                 X    t          |           }|                    || j                  S )N)mapper)rQ   ro  hf_to_vllm_mapper)rk   r=  loaders      rm   ro  z,Qwen3VLForConditionalGeneration.load_weights  s+    "4((""743I"JJJrn   c                 4    t          j        dddgd          S )z<
        Get the module prefix in multimodal models
        r  zvisual.mergerzvisual.deepstack_merger_listr  )r  	connectortower_model)r#   from_string_fieldr   s    rm   get_mm_mappingz.Qwen3VLForConditionalGeneration.get_mm_mapping  s-     /+&(FG!
 
 
 	
rn   num_image_tokensc                 <    | j         }|j        }|j        }||dz  z  S Nr\   ro  r   r   )rk   r@  r  r   r  s        rm   get_num_mm_encoder_tokensz9Qwen3VLForConditionalGeneration.get_num_mm_encoder_tokens  s*     K	!/"5
*a-//rn   r  c                 <    | j         }|j        }|j        }||dz  z  S rB  rC  )rk   r  r  r   r  s        rm   get_num_mm_connector_tokensz;Qwen3VLForConditionalGeneration.get_num_mm_connector_tokens(  s*     K	!/"5
 JM11rn   r   r  );rx   ry   rz   packed_modules_mappingsupports_encoder_tp_datarS   r9  classmethodr   r{   r  r   ri   rs  r  r  r8   r  r|   r}   r  r  r  rG   r  rJ   r  r  r  r  r  r`  r  rG  r*   r   r  r.   r  r  r  
LongTensorr(   r   r=   r%  r/  r   rb  rw   r6  r   rb  ro  r#   r?  rD  rF  r~   r   s   @rm   rw  rw    s4       
 
 
 
 w   $ &&1%<
 
   F3 F3 F3: F F F [F BI 0
 0
 0
z 0
3 0
 0
 0
 0
 0
 0
dC%S/ Cd C C C C4E#s(O 4 4 4 4

 
t	#
 
 
 
"%, SW    *F F F F F F		%   0		%   4)0)	u|S 	!) ) ) ),)0)	u|S 	!) ) ) )2)!%,"34) +) 
u|S 	!	) ) ) )>F'!%,"34F' +F' 
u|S 	!	F' F' F' F'P$f $ $ $ $ $&=Q I=Q489N4O=Q	%S#&	'=Q =Q =Q =Q~5*+5*>A5*	el	d	"5* 5* 5* 5*n@+@>A@	cT	@ @ @ @0.+.>A.	cT	. . . .,7J97J  %U\3%677J )	7J
 !7J 
uU\3&'s:	;7J 7J 7J 7Jr[E3i[E /0[E 
u|S 	!	[E [E [E [Ez% %4H44O % % % %:'=|'=  4'= |	'=
 
u|11	2'= '= '= '=X >B)
 .2$)) ) )<)  4d:)
 |d*) ") 
) ) ) )^ <@-16 6<6 <6 2D8	6
 |d*6 6 
+	+6 6 6 6pA|A 
	A A A AKHU33D-E$F K3s8 K K K K
 
 
 
 
00 
0 0 0 022 
2 2 2 2 2 2 2 2rn   rw  )__doc__collections.abcr   r   r   r   r   	functoolsr   r	   	itertoolsr
   typingr   rK  r   r|   torch.nnr   torch.nn.functional
functionalr   transformersr   transformers.models.qwen2_vlr   6transformers.models.qwen2_vl.image_processing_qwen2_vlr   r  transformers.models.qwen3_vlr   r   3transformers.models.qwen3_vl.configuration_qwen3_vlr   r   6transformers.models.qwen3_vl.video_processing_qwen3_vlr  transformers.video_utilsr   vllm.compilation.decoratorsr   vllm.configr   vllm.config.multimodalr   r   vllm.distributedr   vllm.loggerr   %vllm.model_executor.layers.activationr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr    3vllm.model_executor.layers.vocab_parallel_embeddingr!   -vllm.model_executor.model_loader.weight_utilsr"   )vllm.model_executor.models.module_mappingr#   vllm.multimodalr$   vllm.multimodal.evsr%   r&   r'   r(   vllm.multimodal.inputsr)   r*   r+   r,   r-   r.   r/   vllm.multimodal.parser0   r1   r2   vllm.multimodal.processingr3   r4   r5   r6   r7   vllm.sequencer8   vllm.utils.collection_utilsr9   vllm.utils.math_utilsr:   #vllm.v1.attention.backends.registryr;   
interfacesr=   r>   r?   r@   rA   rB   rC   rD   
qwen2_5_vlrE   rF   rG   rH   rI   rJ   rK   qwen2_vlrL   rM   rN   qwen3rO   rP   utilsrQ   rR   rS   rT   rU   visionrV   rW   rX   rx   r  r  r   rZ   r   r   r   r   ru  r  r   rW  rj  register_processorrw  r  rn   rm   <module>rx     s  2 H G K K K K K K K K K K K K K K ( ( ( ( ( ( ( (                                 % % % % % % B B B B B B      Q P P P P P P P             3 2 2 2 2 2 = = = = = = " " " " " " F F F F F F F F ) ) ) ) ) ) # # # # # # F F F F F F 7 7 7 7 7 7        H G G G G G F F F F F F @ @ @ @ @ @ N N N N N N O O O O O O D D D D D D / / / / / /                             W V V V V V V V V V              . - - - - - 2 2 2 2 2 2 * * * * * * D D D D D D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	                          
 0 / / / / / / /                       
X		      RY   :" " " " "bi " " "J1 1 1 1 1	 1 1 1h1 1 1 1 1bi 1 1 1hg g g g gbi g g gT	` ` ` ` `1 ` ` `F     67L M   D
 
 
 
 
!89N!O 
 
 
D   !"#	 	  < < < < <J < < <~
 
 
 
 
* 
 
 
B ('	*  
u2 u2 u2 u2 u2Iu2 u2 
u2 u2 u2rn   