
    .`i:                     T   U d Z ddlmZmZmZmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlZddlmZ ddlmc mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZE ddlFmGZGmHZH ddlImJZJ ddlKmLZL ddlMmNZNmOZO ddlPmQZQmRZRmSZSmTZTmUZUmVZV ddlmWZWmXZXmYZYmZZZ ddl[m\Z\  ee]          Z^ G d  d!eN          Z_ G d" d#eN          Z`e_e`z  Zaeebd$<    G d% d&ejc                  Zd G d' d(ejc                  Ze G d) d*ejc                  Zf G d+ d,ejc                  Zg G d- d.ejc                  Zh G d/ d0ejc                  Zid1eejejk        f         fd2Zl G d3 d4e=          Zm G d5 d6eA          Zn G d7 d8e?en                   Zo G d9 d:e@en                   Zp e0jq        epeneo;           G d< d=ejc        eSeReTeUeV                      ZrdS )>zDInference-only HunYuan-VL model compatible with HuggingFace weights.    )CallableIterableMappingSequence)partial)	AnnotatedAnyLiteral	TypeAliasN)BatchFeature)
VllmConfig)BaseDummyOptions)parallel_state)utils)init_logger)
get_act_fn)MMEncoderAttention)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)default_weight_loader)MultiModelKeys)MULTIMODAL_REGISTRY)	ImageItemModalityDataMultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)DictEmbeddingItems	ImageSizeModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)HunYuanVLConfigHunYuanVLVisionConfig)HunYuanVLProcessor)smart_resize)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsQuantSupportsXDRoPE)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)is_vit_use_data_parallelc                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
HunYuanVLImagePixelInputsz
    Dimensions:
        - np: Number of patches
        - ni: Number of images
        - cps: Number of channels * patch_size * patch_size
    pixel_valuestypenpcpsni   image_grid_thwN
__name__
__module____qualname____doc__r
   __annotations__r   torchTensorr2        }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/hunyuan_vision.pyr@   r@   i   s           .
!!!!D%  	"   
 D!	     rQ   r@   c                       e Zd ZU dZed         ed<   eej         e	dd          f         ed<   eej         e	dd          f         ed<   d	S )
HunYuanVLImageEmbeddingInputszu
    Dimensions:
        - nf: Number of image features
        - hs: Hidden size
        - ni: Number of images
    image_embedsrB   nfhsrE   rF   rG   NrH   rP   rQ   rR   rT   rT   ~   s           .
!!!!D$	!   
 D!	     rQ   rT   HunYuanVLImageInputsc                        e Zd Zdej        ddfdedededeej	        gej	        f         de
dz  d	ef fd
Zdej	        fdZ xZS )HunYuanVisionMLPTN in_featureshidden_featuresbiasact_fnquant_configprefixc                     t                                                       t                      }t          ||||| d|          | _        t          ||||| d|          | _        || _        d S )Nz.dense_h_to_4h)r^   r`   ra   
disable_tpz.dense_4h_to_h)super__init__r>   r   dense_h_to_4hr   dense_4h_to_hr_   )	selfr\   r]   r^   r_   r`   ra   use_data_parallel	__class__s	           rR   re   zHunYuanVisionMLP.__init__   s     	4661%,,,(
 
 
 /%,,,(
 
 
 rQ   xc                     |                      |          \  }}|                     |                     |                    \  }}|S N)rf   rg   r_   )rh   rk   x_up_x_downs        rR   forwardzHunYuanVisionMLP.forward   sA    $$Q''a&&t{{4'8'899	rQ   )rI   rJ   rK   Fgeluintboolr   rN   rO   r   strre   rq   __classcell__rj   s   @rR   rZ   rZ      s        
 9:26   	
 %,56 )4/      :        rQ   rZ   c                   h     e Zd Z	 	 ddededededz  deddf fd	Zd
ej        dej        fdZ	 xZ
S )HunYuanVisionAttentionNr[   	embed_dim	num_headsprojection_sizer`   ra   returnc           
          t                                                       t                      }|rdnt          j                    | _        t          j        ||          | _        t          j        || j                  | _	        t          || j        ||d|| d|          | _        t          |||| d|          | _        | j        dz  | _        t          | j	        | j        | j        | d	          | _        d S )
Nr3   T.qkv)hidden_size	head_sizetotal_num_headstotal_num_kv_headsr^   r`   ra   rc   z.o_proj)
input_sizeoutput_sizer`   ra   rc         z.attnra   )rd   re   r>   r   $get_tensor_model_parallel_world_sizetp_size
dist_utilsdividehidden_size_per_attention_head!num_attention_heads_per_partitionr   qkvr   o_projscaler   attn)rh   r{   r|   r}   r`   ra   ri   rj   s          rR   re   zHunYuanVisionAttention.__init__   s4    	466 !GAADFF 	
 /9.?Y/
 /
+ 2<1Bt|2
 2
. %!9%(%???(	
 	
 	
 (&!%%%%(
 
 
 8$>
&2/J###	
 
 
			rQ   rk   c                     |                      |          \  }}|                    dd          \  }}}|                     |||          }|                     |          \  }}|S )NrF   dim)r   chunkr   r   )	rh   rk   r   ro   qkvoutoutputs	            rR   rq   zHunYuanVisionAttention.forward   sa     !Q))A2)&&1aii1a  KK$$	rQ   Nr[   )rI   rJ   rK   rt   r   rv   re   rN   rO   rq   rw   rx   s   @rR   rz   rz      s         370
 0
0
 0
 	0

 )4/0
 0
 
0
 0
 0
 0
 0
 0
d< 
       rQ   rz   c                        e Zd Zej        dddfdedededeej        gej        f         deege	j
        f         dz  dedz  d	ed
df fdZdej        d
ej        fdZ xZS )HunYuanVisionBlockNr[   r   r|   mlp_hidden_dimr_   
norm_layerr`   ra   r~   c           	      4   t                                                       |t          t          j        d          } ||          | _         ||          | _        t          ||||| d          | _        t          |||d|| d          | _
        d S )Ngư>epsz
.self_attn)r{   r|   r}   r`   ra   Tz.mlp)r_   r^   r`   ra   )rd   re   r   nn	LayerNorminput_layernormpost_attention_layernormrz   	self_attnrZ   mlp)	rh   r   r|   r   r_   r   r`   ra   rj   s	           rR   re   zHunYuanVisionBlock.__init__   s     	 4888J)z#(2
3%/%(((
 
 
 $%???
 
 
rQ   rk   c                     ||                      |                     |                    z   }||                     |                     |                    z   }|S rm   )r   r   r   r   )rh   rk   s     rR   rq   zHunYuanVisionBlock.forward  sQ     t33A6677766q99:::rQ   )rI   rJ   rK   rr   rs   rt   r   rN   rO   r   Moduler   rv   re   rq   rw   rx   s   @rR   r   r      s         :;8<26
 

 
 	

 %,56
 cUBI-.5
 )4/
 
 

 
 
 
 
 
>< 
       rQ   r   c                   d     e Zd Zdef fdZdej        deee                  dej        fdZ	 xZ
S )HunYuanVisionPatchEmbedconfigc                    t                                                       || _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        t          j
        |j        | j        | j        | j        d          | _        |j        | j        z  dz  | _        | j        dz   | _        t          | j        dz            | _        t          j        | j        | j                  | _        d | _        d S )NT)in_channelsout_channelskernel_sizestrider^      r3   g      ?)rd   re   r   r   r{   
patch_sizenum_channelsspatial_merge_sizeinterpolate_moder   Conv2dpatch_embeddingmax_image_sizemax_num_patchesnum_positionsrt   position_edge	Embeddingposition_embeddingpatch_pos_embed)rh   r   rj   s     rR   re   z HunYuanVisionPatchEmbed.__init__&  s    + +"/"("; & 7!y+? 
  
  
 !' 5 HQN!1A5 !3S!899"$,t/A4>"R"R#rQ   rA   grid_thwr~   c                    |                     d          }|                    || j        | j        | j                  }|                     |          }|                    d                              d                              d          }| j        md| j        | j        | j	        f}| j
        j        dd d d f                             |                              dddd                                          | _        g }|D ]}|\  }}	}
|	dz   |
dz   }
}	t          j                            | j        |	| j        z  |
| j        z  f| j        d          }|                    | j	        d                              dd                              d                              |j                  }|                    |           t-          j        |d	          }||z   }|S )
Nr   r   r3   rF   r   g?F)scale_factormodealign_cornersr   )sizereshaper   r   r   squeeze	unsqueezer   r   r{   r   weightpermutefloatr   
functionalinterpolater   	transposetodtypeappendrN   cat)rh   rA   r   num_patchespatch_embedspatch_pos_shapepatch_pos_embed_listgridro   h0w0r   
embeddingss                rR   rq   zHunYuanVisionPatchEmbed.forwardA  s    #''**#++*DOT_
 
 ++L99#++B//77;;EEaHH'""	O '.qrr111u5))Aq!$$	    " 	9 	9DIAr2 #XrCxB m77$ 4#55rD<N7NO*#	 8  O  '';;1a1L&''	  !''8888)$8a@@@!O3
rQ   )rI   rJ   rK   r.   re   rN   rO   listrt   rq   rw   rx   s   @rR   r   r   %  s}        $4 $ $ $ $ $ $61!L148cO1	1 1 1 1 1 1 1 1rQ   r   c                   .     e Zd Z	 	 	 d fd	ZddZ xZS )	HunYuanVisionPatchMergerr   h㈵>r[   c           	      (   t                                                       || _        |dz  }t          j        t          j        ||dz  ||          t          j                    t          j        |dz  |dz  d                    | _        t          j        |dz  |          | _	        t          j
        t          j        |dz            |z            | _        t          j
        t          j        |          |z            | _        t          j
        t          j        |          |z            | _        t          j
        t          j        |          |z            | _        t#          ||          | _        t#          ||          | _        d S )Nr   r   )r   r      r3   )r   r   )rd   re   r   r   
Sequentialr   GELUprojLinearr   	ParameterrN   randnimage_newlineimage_begin	image_end	image_sepr   
before_rms	after_rms)rh   r   r   r   rms_norm_epsra   	embed_stdrj   s          rR   re   z!HunYuanVisionPatchMerger.__init__v  sV    	"4 $&	MIa.)	   GIIIkAo{QAFFF	
 	
	 9[1_l;;\%+kAo*F*F*RSS<L(A(AI(MNNek,&?&?)&KLLek,&?&?)&KLL!+<@@@ <@@@rQ      r   c           	         |                      |          }|\  }}|j        }|                    ddd                              |j        d         d||          }|                     |          }|j        \  }}}}t          j        || j                            d|dd          	                    |||d          
                    |          gd          }|                    ||d                              ddd          }|                     |          }| j                            ddd          	                    |d|j        d                   
                    |          }| j                            ddd          	                    |d|j        d                   
                    |          }	t          j        |||	gd          }|                     |          S )Nr   r   r3   r   r   )r   r   r   r   shaper   rN   r   r   expandr   r   r   r   r   )
rh   rk   r   hwr   bcbeginends
             rR   rq   z HunYuanVisionPatchMerger.forward  s   OOA1IIaA&&qwqz2q!<<IIaLLW
1aI"**1aA66==aAqIILLUSST
 
 
 IIaB''1a00HHQKK ((Ar2299!QLLOOPUVVn$$Q2..55aAGBKHHKKERRIuao1---~~a   rQ   )r   r   r[   )r   )rI   rJ   rK   re   rq   rw   rx   s   @rR   r   r   u  sc        
 A A A A A A@! ! ! ! ! ! ! !rQ   r   c            	       
    e Zd Z	 	 ddededz  deddf fdZedej	        fd            Z	edej
        fd	            Z
d
ej        deee                  dej        fdZdeeeej        f                  dee         fdZ xZS )HunYuanVisionTransformerNr[   vision_configr`   ra   r~   c           	         t                                                       j        }j        | _        j        | _        j        | _        ddlm}  |d          5  t                    | _
        d d d            n# 1 swxY w Y   t          t          j        j                   |d          5  t          j        fdt!          |          D                       | _        d d d            n# 1 swxY w Y    |d          5  t%          j        j        j        j         d	          | _        d d d            d S # 1 swxY w Y   d S )
Nr   )set_model_tagr   r   r   c                     g | ]>}t          j        j        j        t	          j                   d |           ?S )z.layers.)r   r|   r   r_   r   r`   ra   )r   r   num_attention_headsintermediate_sizer   
hidden_act).0	layer_idxr   ra   r`   r   s     rR   
<listcomp>z5HunYuanVisionTransformer.__init__.<locals>.<listcomp>  sq        " ')5"/"C'4'F)-*BCC#-%1"(==)==    rQ   r   z	.perceive)r   r   ra   )rd   re   num_hidden_layersr   r  r|   r   vllm.compilation.backendsr  r   r   r   r   r   r   
ModuleListrangelayersr   out_hidden_sizeperceive)rh   r   r`   ra   r	  r  r   rj   s    ```  @rR   re   z!HunYuanVisionTransformer.__init__  sc    	);(4&:"/"B;;;;;;]455 	E 	E5mDDDO	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E 	E R\}/IJJJ
]/00 	 	-       &++<%=%=   DK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  ]566 	 	4)-#0#C*7 +++  DM	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s6   "BB
B96C;;C?C?1EEEc                 .    | j         j        j        j        S rm   )r   r   r   r   rh   s    rR   r   zHunYuanVisionTransformer.dtype  s    .5;;rQ   c                 .    | j         j        j        j        S rm   )r   r   r   devicer  s    rR   r  zHunYuanVisionTransformer.device  s    .5<<rQ   rk   r   c           	      N   |                     d          }dg}|                    | j        | j                  }|                     ||          }|D ]K\  }}}t          |          t          |          t          |          }}}|                    ||z             Lt          j        |t          j	                  }t          j
        |dt          j	                  }|                    | j        d          }|                    |d          }|                    d          }d |D             }	| j        D ]=|                    |	d	
          }
fd|
D             }
t          j        |
d	
          }>|d	d          |d d         z
                                  }	|                    |	d	
          }g }t#          ||          D ]\\  }}|                    |                     |                                |d	d                                        d                     ]|S )Nr   )r  r   )r   )r   r   T)r  non_blockingr   c                 T    g | ]%\  }}}t          |          t          |          z  &S rP   )rt   )r  ro   r   r   s       rR   r  z4HunYuanVisionTransformer.forward.<locals>.<listcomp>  s.    CCCYaAQ#a&&CCCrQ   r3   r   c                 &    g | ]} |          S rP   rP   )r  players     rR   r  z4HunYuanVisionTransformer.forward.<locals>.<listcomp>  s!    ---!UU1XX---rQ   )r   )r   r   r  r   r   rt   r   rN   tensorint32cumsumr   r   r  splitr   tolistzipr  
contiguousr   )rh   rk   r   seq_len
cu_seqlenshidden_statestr   r   split_lengthspartssplit_itemsimage_embeds_listr   
split_itemr  s                  @rR   rq   z HunYuanVisionTransformer.forward  s/    &&))3
DKtzBBx@@ 	% 	%GAq!!ffc!ffc!ff!qAa!e$$$$\*EK@@@
\*!5;GGG
]]$+D]II
%--gr::%//22 DC(CCC[ 	4 	4E!''1'==E----u---E!Ie333MM $ABB*SbS/9AACC#))-Q)?? #Hk : : 	 	D*$$j3355DHEEMMaPP    ! rQ   weightsc                    g d}t          |                     d                    }t                      }|D ]\  }}|D ]>\  }}}	||vr|                    ||          }||         }
|
j        } ||
||	            n*||         }
t          |
dt                    } ||
|           |                    |           |S )N))r   z.q_projr   )r   z.k_projr   )r   z.v_projr   F)remove_duplicateweight_loader)dictnamed_parameterssetreplacer-  getattrr   add)rh   r*  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr-  s               rR   load_weightsz%HunYuanVisionTransformer.load_weights  s    "
 "
 "
 400%0HHII"%%%#* 	$ 	$D-5K 4 41
Kd**||K<<#D) % 3e]H===#D) '@U V Ve]333d####rQ   r   )rI   rJ   rK   r.   r   rv   re   propertyrN   r   r  rO   r   rt   rq   r   tupler0  r=  rw   rx   s   @rR   r   r     s<        37	+ +,+ )4/+ 	+
 
+ + + + + +Z <u{ < < < X< = = = = X=*!<*! tCy/*! 
	*! *! *! *!XHU33D-E$F 3s8        rQ   r   	hf_inputsc                    |                      dt          j        d                    }|                    d          }t	          t          j        d|          t          j        d|          t          j        dd                    S )NrG   )r   rF   r   imageT)keep_on_cpu)rA   rU   rG   )getrN   emptyprodr.  r    flat_from_sizesbatched)r@  rG   image_grid_sizess      rR   _hunyuan_vl_field_configrJ  +  s    ]]#3U[5H5HIIN%**2..*:7DTUU*:7DTUU,4W$OOO   rQ   c                   j     e Zd Zdeeej        f         ee         z  de	e
e
f         dz  f fdZ xZS )HunYuanVLMultiModalDataParserdatar~   Nc                     t          |t                    rt          |dddht                    S t	                                          |          S )NrB  rU   rG   )modalityrequired_fieldsfields_factory)
isinstancer.  r"   rJ  rd   _parse_image_data)rh   rM  rj   s     rR   rS  z/HunYuanVLMultiModalDataParser._parse_image_data6  sZ     dD!! 	% !/1A B7	    ww((...rQ   )rI   rJ   rK   r.  rv   rN   rO   r   r   r$   r	   rS  rw   rx   s   @rR   rL  rL  5  ss        /3$%Y(??/ 
38	$t	+/ / / / / / / / / /rQ   rL  c                       e Zd Zd ZdedefdZdedefdZdee	e
dz  f         fdZde
d	ee	e
f         dee	e
f         fd
Zdddde
de
de
dededz  deee
f         fdZde
de
dedz  de
fdZdefdZde
fdZdS )HunYuanVLProcessingInfoc                 @    | j                             t                    S rm   )ctxget_hf_configr-   r  s    rR   rX  z%HunYuanVLProcessingInfo.get_hf_configF  s    x%%o666rQ   kwargsr~   c                 ^     | j         j        t          fd|                    dd          i|S )Nuse_fastT)rW  get_hf_processorr/   poprh   rY  s     rR   r\  z(HunYuanVLProcessingInfo.get_hf_processorI  sE     )tx(
 
ZZ
D11
 
 
 	
rQ   c                 &     | j         di |j        S )NrP   )r\  image_processorr^  s     rR   get_image_processorz+HunYuanVLProcessingInfo.get_image_processorS  s      %t$..v..>>rQ   Nc                 
    dd iS )NrB  rP   r  s    rR   get_supported_mm_limitsz/HunYuanVLProcessingInfo.get_supported_mm_limitsY  s    rQ   r!  	mm_countsc                 8    |                                  }d}||dS )Nr   )rB  video)get_max_image_tokens)rh   r!  rd  max_image_tokensmax_video_tokenss        rR   get_mm_max_tokens_per_itemz2HunYuanVLProcessingInfo.get_mm_max_tokens_per_item\  s+    
  4466)4DEEErQ   r3   T)
num_frames	do_resizeimage_widthimage_heightrk  rl  r`  c                n   ||                                  }|                                 }|j        }|j        }|j        }	|r6t          ||||	z  |j        |j                  \  }
}t          ||
          }nt          ||          }d}|j	        |z  }|j
        |z  }||z  |	z  ||	z  dz   z  dz   }||fS )N)heightwidthfactor
min_pixels
max_pixels)rq  rp  r3   r   )ra  rX  r   r   r   r0   rs  rt  r#   rp  rq  )rh   rm  rn  rk  rl  r`  	hf_configr   r   r   resized_heightresized_widthpreprocessed_sizegrid_tgrid_hgrid_wnum_vision_tokenss                    rR   _get_vision_infoz(HunYuanVLProcessingInfo._get_vision_infof  s    ""6688O&&((	!/"-
*= 
	R,8#!!$66*5*5- - -)NM !*n U U U )L Q Q Q")Z7"(J6 VO11V?Q5QTU5UV 	
 !"333rQ   c                <    |                      |||          \  }}|S Nrm  rn  r`  r}  )rh   rm  rn  r`  ro   num_image_tokenss         rR   get_num_image_tokensz,HunYuanVLProcessingInfo.get_num_image_tokens  s5     #33#%+ 4 
 

  rQ   c                 <    |                      ddd           \  }}|S )Ni   i    r  r  )rh   r   ro   s      rR   !get_image_size_with_most_featuresz9HunYuanVLProcessingInfo.get_image_size_with_most_features  s2     11  2 
 

 rQ   c                 `    |                                  \  }}|                     ||d           S r  )r  r  )rh   target_widthtarget_heights      rR   rg  z,HunYuanVLProcessingInfo.get_max_image_tokens  s>    &*&L&L&N&N#m(($&  ) 
 
 	
rQ   )rI   rJ   rK   rX  objectr/   r\  ra  r   rv   rt   rc  rj  ru   r?  r#   r}  r  r  rg  rP   rQ   rR   rU  rU  E  s       7 7 7

 

 
 
 
?? 
? ? ? ?cDj)A    FF 38$F 
c		F F F F &4 &4 &4 &4 	&4
 &4 &4 ,d2&4 
y#~	&4 &4 &4 &4P    	 
 ,d2  
       9    
c 
 
 
 
 
 
rQ   rU  c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	HunYuanVLDummyInputsBuilderrd  r~   c                 x    |                     dd          }| j                                        }|j        }||z  S )NrB  r   )rD  infor\  image_token)rh   rd  
num_imageshf_processorr  s        rR   get_dummy_textz*HunYuanVLDummyInputsBuilder.get_dummy_text  s;    ]]7A..
y1133'3Z''rQ   Nr!  
mm_optionsc                     |                     dd          }| j                                        \  }}d|                     |||          iS )NrB  r3   )rq  rp  r  )rD  r  r  _get_dummy_images)rh   r!  rd  r  r  r  r  s          rR   get_dummy_mm_dataz-HunYuanVLDummyInputsBuilder.get_dummy_mm_data  s\     ]]7A..
&*i&Q&Q&S&S#m T++"=Z ,  
 	
rQ   rm   )
rI   rJ   rK   r   rv   rt   r  r   r   r  rP   rQ   rR   r  r    s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rQ   r  c            
           e Zd ZdefdZdedeeef         deeef         deeef         def
dZ	de
d	eeef         d
edee         fdZded	eeef         deeef         fdZdS )HunYuanVLMultiModalProcessorr~   c                     t                      S rm   )rL  r  s    rR   _get_data_parserz-HunYuanVLMultiModalProcessor._get_data_parser  s    ,...rQ   promptmm_data	mm_kwargs
tok_kwargsc           	          | j         j                             | j         j        di |t	          dd|i|t	          di ||          S )NtextrP   )r  rW  call_hf_processorr\  r.  )rh   r  r  r  r  s        rR   _call_hf_processorz/HunYuanVLMultiModalProcessor._call_hf_processor  sd     y}..&DI&3333((f(((++9+
++
 
 	
rQ   mm_itemshf_processor_mm_kwargsout_mm_kwargsc                      | j         j        di |} | j         j        di |}d|j        i|j        dt
          dt          ffdfddD             S )NrB  item_idxrO  c                     	|         |          }|| d         j         }t          |t          j                  sJ |\  }}}t	          |          z  t	          |          z  dz   z  dz   }
|         g|z  S )N	_grid_thwr3   r   )rM  rR  rN   rO   rt   )r  rO  out_itemr   ro   rz  r{  
num_tokens
merge_sizer  placeholders           rR   get_replacement_hunyuan_vlzTHunYuanVLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_hunyuan_vl  s    $X.x8H8 6 6 67<Hh55555 (Avvf++3Fz)A-J  )*Z77rQ   c           
      `    g | ]*}t          ||         gt          |                     +S ))rO  )rO  targetreplacement)r*   r   )r  rO  r  r  s     rR   r  zDHunYuanVLMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>  sY     
 
 
  !#H-.#$>RRR  
 
 
rQ   )rB  rP   )r  r\  ra  image_token_idr  rt   rv   )	rh   r  r  r  r  r`  r  r  r  s	      `  @@@rR   _get_prompt_updatesz0HunYuanVLMultiModalProcessor._get_prompt_updates  s     2ty1KK4JKK7$)7QQ:PQQ \0
 %/
		8 		8 		8 		8 		8 		8 		8 		8 		8 		8
 
 
 
 
 '
 
 
 	
rQ   r@  c                      t          |          S rm   )rJ  )rh   r@  r  s      rR   _get_mm_fields_configz2HunYuanVLMultiModalProcessor._get_mm_fields_config  s    
 (	222rQ   N)rI   rJ   rK   r&   r  rv   r   r  r   r  r%   r	   r!   r   r+   r  r    r  rP   rQ   rR   r  r    s       /"6 / / / /

 f%
 3;'	

 CK(
 

 
 
 
!
%!
 !(S 1!
 -	!

 
,	!
 !
 !
 !
F33 !(V 43 
++	,	3 3 3 3 3 3rQ   r  )r  dummy_inputsc                       e Zd Z edddd          ZdZdee         dee         de	j
        fd	Zed
edededz  fd            Zdddedef fdZdededz  fdZdedee	j
        df         fdZdedefdZdedefdZde	j
        de	j
        dedz  de	j
        dz  dede	j
        ez  fdZde	j
        de	j
        dz  fd Zd!eeee	j
        f                  dee         fd"Zde fd#Z! xZ"S )$!HunYuanVLForConditionalGenerationzvisual.zlanguage_model.model.)zvit.vit.zvit.zmodel.)orig_to_new_prefixTinput_tokensmm_featuresr~   c                 H   t          j        |dh          }d |                    dg           D             }| j        }|j        }|j        j        }t          |j        d                   }t          j
        |          }	t          j        |	|k                                  d          }
t          j        t          |	                    }t          j        t          |	                    }t          j        t          |	                    }t          j        t          |	                    }t          t          |
                    D ]}|
|         dz   }||         \  }}}|||z  ||z  }}}|dz   |z  }||||z                                t          j        d|dz                                 dd                              |d                              d                     ||||z                                t          j        d|                              dd                              d|dz                                 d                     |||||z   <   |dk    rt          j        ||||g          }n|d	k    rt          j        |||g          }|S )
NrG   c                 6    g | ]}|                                 S rP   )r  )r  items     rR   r  zPHunYuanVLForConditionalGeneration.get_xdrope_input_positions.<locals>.<listcomp>%  s     UUUD$++--UUUrQ   xdrope_sectionr3   r   r   r   r   rF   )r   gather_kwargsrD  r   image_start_token_idr   r   lenrope_scalingrN   r  argwherer   aranger  copy_r   r   stack)rh   r  r  rY  rG   ru  r  r   xd_numinput_tokens_tensorimage_start_indicesp_indexw_indexh_indext_indeximage_indexposr$  r   r   ro   
llm_grid_h
llm_grid_w	token_numllm_positionss                            rR   get_xdrope_input_positionsz<HunYuanVLForConditionalGeneration.get_xdrope_input_positions  s   
 '4
 
 VUFJJ?OQS4T4TUUUK	(=&4GY+,<=>>#l<88#n#77
 

'!** 	 ,s#67788,s#67788,s#67788,s#67788 %8!9!9:: 	9 	9K%k2Q6C$[1GAq!'''' &zA $a:5IC#	/)*00Q
Q//B
B''	   C#	/)*00Q
++QJN++	   .9GC#	/)**Q;;!K'7G(LMMMMq[[!K'7(CDDMrQ   rO  iNc                 N    |                     d          rdS t          d          )NrB  ul   <｜hy_place▁holder▁no▁100｜><｜hy_place▁holder▁no▁102｜><｜hy_place▁holder▁no▁101｜>z Only image modality is supported)
startswith
ValueError)clsrO  r  s      rR   get_placeholder_strz5HunYuanVLForConditionalGeneration.get_placeholder_strU  s4    w'' 	B B  B;<<<rQ   r[   r   vllm_configra   c          	         t                                                       |j        j        }|| _        |                     |dh          5  t          |j        |j        t          |d                    | _
        d d d            n# 1 swxY w Y   |                     |          5  t          |t          |d          ddg          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )NrB  visual)r`   ra   language_model.modelHunYuanDenseV1ForCausalLMHunYuanMoEV1ForCausalLM)r  ra   architectures)rd   re   model_configru  r   _mark_tower_modelr   r   r`   r=   r  _mark_language_modelr<   language_modelmake_empty_intermediate_tensors)rh   r  ra   r   rj   s       rR   re   z*HunYuanVLForConditionalGeneration.__init__\  s~   "-":"D##K';; 	 	2$(5#FH55  DK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	"<'#F,BCC/-# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s$   0BBB((CC #C rY  c                    |                     dd           }|                     dd           }|                     dd           }||d S t          |t                    rt          j        |d          }t          |j                  dk    r9|j        d         }|                    d|          }|                    dd          }|t          d||          S |t          d||	          S d S )
NrA   rU   rG   r   r   rF   r   )rB   rA   rG   )rB   rU   rG   )
r]  rR  r   rN   r   r  r   r   r@   rT   )rh   rY  rA   rU   rG   last_dims         rR   _parse_and_validate_image_inputzAHunYuanVLForConditionalGeneration._parse_and_validate_image_inputw  s    zz.$77zz.$77$4d;;L$84 lD)) 	: 9\q999L|!""a''#)"-H'//H==L+33B::N#,#)-    #0#)-    $#rQ   image_input.c                     |d         }|j         dk    sJ |                                }|d         dk    r&|d                             | j        j                  }n|d         }|                     ||          }|S )NrG   r   rB   rU   rA   )r   )ndimr  rB   r  r   )rh   r  r   grid_thw_listrU   rA   s         rR   _process_image_inputz6HunYuanVLForConditionalGeneration._process_image_input  s     /0}!!!! ))v.00&~6;;DK<MNNLL&~6L  ;;|m;LLLrQ   c                 D    i }|D ]}|dv rd|vr | j         di ||d<   |S )N)rA   rU   rB  rP   )r  )rh   rY  mm_input_by_modality	input_keys       rR   %_parse_and_validate_multimodal_inputszGHunYuanVLForConditionalGeneration._parse_and_validate_multimodal_inputs  sa    !   	 	I===#7770T0T 1 11 1$W- $#rQ   c                      | j         di |}|sg S d}|D ]7}||         }|dk    r'|                     |          }|t          |          z  }8|S )NrP   rB  )r  r  r?  )rh   rY  r  multimodal_embeddingsrO  multimodal_inputimage_embeddingss          rR   embed_multimodalz2HunYuanVLForConditionalGeneration.embed_multimodal  s    ItISSFSS# 	I ;= - 	A 	AH3H=7""#'#<#<=M#N#N %/?)@)@@%$$rQ   	input_ids	positionsintermediate_tensorsinputs_embedsc                 @    |d }|                      ||||          }|S )N)r  r  r  r  )r  )rh   r  r  r  r  rY  r#  s          rR   rq   z)HunYuanVLForConditionalGeneration.forward  s=      + M++!5'	 , 
 
 rQ   r#  c                 6    | j                             |          S rm   )r  compute_logits)rh   r#  s     rR   r  z0HunYuanVLForConditionalGeneration.compute_logits  s     "11-@@@rQ   r*  c                 z    t          | | j        j        rdgnd           }|                    || j                  S )Nzlm_head.)skip_prefixes)mapper)r:   r   tie_word_embeddingsr=  hf_to_vllm_mapper)rh   r*  loaders      rR   r=  z.HunYuanVLForConditionalGeneration.load_weights  sK    "+/;+JTJ<<PT
 
 
 ""743I"JJJrQ   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        r  zvisual.perceiver  )r  	connectortower_model)r   from_string_fieldr  s    rR   get_mm_mappingz0HunYuanVLForConditionalGeneration.get_mm_mapping  s'     /1' 
 
 
 	
rQ   )#rI   rJ   rK   r;   r  supports_encoder_tp_datar   rt   r   rN   rO   r  classmethodrv   r  r   re   r  rX   r  r?  r  r.  r  r4   r  r,   rq   r  r   r0  r=  r   r	  rw   rx   s   @rR   r  r    s        & "-	
 
    $73i7 /07 
	7 7 7 7r =3 =3 =3: = = = [= BD 
 
 
z 
3 
 
 
 
 
 
6		$   @/	u|S 	!   "$f $ $ $ $ $% %4H % % % %$< < 2D8	
 |d*  
+	+   &A|A 
	A A A AKHU33D-E$F K3s8 K K K K
 
 
 
 
 
 
 
 
rQ   r  )srL   collections.abcr   r   r   r   	functoolsr   typingr   r	   r
   r   rN   torch.nnr   torch.nn.functionalr   rr   transformersr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   r    r!   vllm.multimodal.parser"   r#   r$   r%   r&   vllm.multimodal.processingr'   r(   r)   r*   r+   vllm.sequencer,   *vllm.transformers_utils.configs.hunyuan_vlr-   r.   -vllm.transformers_utils.processors.hunyuan_vlr/   3vllm.transformers_utils.processors.hunyuan_vl_imager0   vllm.utils.tensor_schemar1   r2   
interfacesr4   r5   r6   r7   r8   r9   r:   r;   r<   r=   visionr>   rI   loggerr@   rT   rX   rM   r   rZ   rz   r   r   r   r   rv   rO   rJ  rL  rU  r  r  register_processorr  rP   rQ   rR   <module>r*     s  2 K J J A A A A A A A A A A A A       5 5 5 5 5 5 5 5 5 5 5 5                 % % % % % % " " " " " " 3 3 3 3 3 3 + + + + + + 0 0 0 0 0 0 # # # # # # < < < < < < X X X X X X 8 8 8 8 8 8         
 G F F F F F O O O O O O D D D D D D / / / / / /                                          . - - - - -        M L L L L L L L L L L L > > > > > > > >                           - , , , , ,	X		
       *    L   ,  == i   ! ! ! ! !ry ! ! !H; ; ; ; ;RY ; ; ;|& & & & & & & &RM M M M Mbi M M M`5! 5! 5! 5! 5!ry 5! 5! 5!p{ { { { {ry { { {|U\0A(B    / / / / /$8 / / / e
 e
 e
 e
 e
0 e
 e
 e
P
 
 
 
 
"89P"Q 
 
 
493 93 93 93 93#:;R#S 93 93 93x (' 	 ,  
i
 i
 i
 i
 i
Ii
 i
 
i
 i
 i
rQ   