
    .`iƜ                     n   U d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZFmGZG ddlHmIZI dZJdZKeLeMd<   dZNeLeMd<   	 dZd!eOd"ePeOeOf         d#eOfd$ZQ G d% d&e9          ZReRZS G d' d(e9          ZTeTZU G d) d*e2          ZV G d+ d,e0eV                   ZW G d- d.e1eV                   ZXd/e3d#eVfd0ZYdd1d2eVd3e0eV         d4e'dz  d#e1fd5ZZddd6d7d8e#dz  d9eOdz  d:e[dz  d;eLd#e<eBz  f
d<Z\ G d= d>ej]                  Z^ G d? d@ej]                  Z_ e%j`        eZeYeWA           G dB dCej]        e?e@                      ZadDejb        dEePeOeOf         d#ejb        fdFZcdEePdGedd#ePfdHZedIePeOeOf         dJeLedePeOeOf                  z  dKeOd#ePeOeOf         fdLZfdMejb        dNeOdOeOdIePeOeOf         dGedePeOeOf                  dPeOdQe[dRejb        d#ejb        fdSZg	 	 d[dVedejb                 dWededeO                  dGedePeOeOf                  dKeOdPeOdRejb        dXeOdQe[d#edejb                 fdYZhdS )\    N)defaultdict)IterableMappingSequence)partial)
accumulate)	AnnotatedLiteral)	rearrange)	LayerNormLayerNorm2d)RegStage)BatchFeatureCLIPVisionConfigSiglipVisionConfig)
VllmConfig)BaseDummyOptions)QuantizationConfig)MULTIMODAL_REGISTRY)BaseMultiModalProcessorCache)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoInputProcessingContextPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )CLIPVisionModel)MultiModalEmbeddingsSupportsMultiModal
SupportsPP)SiglipVisionModel)AutoWeightsLoader
flatten_bninit_vllm_registered_modelmaybe_prefix)get_vision_encoder_infoz<|endofturn|>z
<|dummy3|>IMAGE_TOKENz<|_unuse_missing_100270|>VIDEO_TOKEN   r3   
num_framesmax_grid_shapereturnc                 J    |d         |d         z  }| |z  }| |z  }||dk    z   S )Nr   r%    )r4   r5   max_num_gridsnum_canvasesleftover_framess        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/hyperclovax_vision.pyget_num_combined_framesr=   ;   s>     #1%q(99M .L =0O?Q.//    c            
           e Zd ZU dZdZed         ed<   eee	j
                  eddddddh	          f         ed
<   ee	j
         edd          f         ed<   dS )HCXVisionImagePixelInputsz
    Dimensions:
        - n: Number of images
        - g: Number of grids
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_valuestypengr3   hwdynamic_dimspixel_values_images   image_sizes_imagesN__name__
__module____qualname____doc__rB   r
   __annotations__r	   listtorchTensorr$   r8   r>   r<   r@   r@   H   s           %3D'.
!222"U\KKS!S#SERRRR    "%,C0C0C"CDDDDDDr>   r@   c                       e Zd ZU dZdZed         ed<   eeee	j
                           edddddd	ddh
          f         ed<   dS )HCXVisionVideoPixelInputsz
    Dimensions:
        - n: Number of videos
        - f: Number of frames
        - g: Number of grids
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_values_videosrB   rC   frD   r3   rE   rF   rG   NrL   r8   r>   r<   rV   rV   \   s           ,AD''
(@@@"T%, Cc1c3c3ZHHH	J     r>   rV   c                       e Zd Zd Zdeeedz  f         fdZdeee         z  defdZ	deee         z  defdZ
defdZdefd	ZdS )
HCXVisionProcessingInfoc                 D    t          |                                           S N)r/   get_hf_configselfs    r<   r/   z/HCXVisionProcessingInfo.get_vision_encoder_infor   s    &t'9'9';';<<<r>   r6   Nc                     d d dS )Nimagevideor8   r^   s    r<   get_supported_mm_limitsz/HCXVisionProcessingInfo.get_supported_mm_limitsu   s    ---r>   vision_query_lengthc                N    t          |t                    r|S t          |          S r\   
isinstanceintsumr_   re   s     r<   get_num_image_tokensz,HCXVisionProcessingInfo.get_num_image_tokensx   ,    
 )3// 	,&&*+++r>   c                N    t          |t                    r|S t          |          S r\   rg   rk   s     r<   get_num_video_tokensz,HCXVisionProcessingInfo.get_num_video_tokens   rm   r>   c                 x    |                                  }|                                x}}t          ||          S )N)widthheight)r/   get_image_sizer   )r_   vision_encoder_inforq   rr   s       r<   !get_image_size_with_most_featuresz9HCXVisionProcessingInfo.get_image_size_with_most_features   s=    "::<<,;;===uV4444r>   c                 ^    |                                  \  }}|                     ||          S )N)image_widthimage_height)ru   rl   )r_   target_widthtarget_heights      r<   get_max_image_tokensz,HCXVisionProcessingInfo.get_max_image_tokens   s;    &*&L&L&N&N#m(($& ) 
 
 	
r>   )rM   rN   rO   r/   r   strri   rd   rR   rl   ro   r   ru   r{   r8   r>   r<   rZ   rZ   q   s        = = =.cDj)A . . . ., !49_, 
	, , , ,, !49_, 
	, , , ,59 5 5 5 5

c 
 
 
 
 
 
r>   rZ   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	HCXVisionDummyInputsBuilder	mm_countsr6   c                     t           |                    dd          z  t          |                    dd          z  z   }|S )Nrb   r   rc   )r0   getr1   )r_   r   
dummy_texts      r<   get_dummy_textz*HCXVisionDummyInputsBuilder.get_dummy_text   sF     !9==Q$
 $
 
)--3334
 r>   Nseq_len
mm_optionsc                 n   |                     dd          }|                     dd          }| j                                        \  }}d}|r|                     d          nd }	|r|                     d          nd }
|                     ||||	          |                     |dz
  |dz
  |||
          dS )	Nrb   r   rc       )rq   rr   
num_images	overridesr%   )rq   rr   r4   
num_videosr   ra   )r   inforu   _get_dummy_images_get_dummy_videos)r_   r   r   r   r   r   ry   rz   target_num_framesimage_overridesvideo_overridess              r<   get_dummy_mm_dataz-HCXVisionDummyInputsBuilder.get_dummy_mm_data   s     ]]7A..
]]7A..
&*i&Q&Q&S&S#m5?I*..111T5?I*..111T ++"$%)	 ,   ++"Q&$q(,%) ,  
 
 	
r>   r\   )
rM   rN   rO   r   r|   ri   r   r   r   r   r8   r>   r<   r~   r~      s        38$ 
    =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r>   r~   c            
           e Zd Zdedeeef         deeef         deeef         def
dZdeded	eeef         d
eeef         de	f
dZ
ded	eeef         dedee         fdZded	eeef         deeef         fdZdS )HCXVisionMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr6   c                    t          |                    dg                     D ]B\  }}|j        t          j        k    r(|                    t          j                  |d         |<   C| j        j                             | j        j	        di |t          |d d                     }t          |          dk    r|                    d          }|                    d          }	| j        j                             | j        j	        di |t          d |d n|g|	d n|	g                                                    D ]M\  }
}t          |t                    r3t          |          dk    r t          |          dk    sJ |d         |
<   N|r:t          j        d                   d<   t          j        d                   d<   |	rtdgt#          d	 |	D                       fd
t%          t          |	                    D             d<   fdt%          t          |	                    D             d<   |                               |S )Nvideos)textimagesr   )hf_processordatar   r   r%   rK   vision_query_lengths_imagesc              3   N   K   | ] }t          t          |                    V  !d S r\   )r=   len.0rc   s     r<   	<genexpr>zBHCXVisionMultiModalProcessor._call_hf_processor.<locals>.<genexpr>   sB          @E/E

;;           r>   c                 N    g | ]!}d          |         |dz                     "S )rW   r%   r8   r   i_idx_per_video_processed_outputss     r<   
<listcomp>zCHCXVisionMultiModalProcessor._call_hf_processor.<locals>.<listcomp>   sL     = = =  ''<=&q)N1q5,AA= = =r>   rW   c           	      r    g | ]3}t          j        d          |         |dz                               4S )vision_query_lengths_videosr%   )rS   tensorr   s     r<   r   zCHCXVisionMultiModalProcessor._call_hf_processor.<locals>.<listcomp>  sa     E E E  L*+HI*1-q1u0EE E E Er>   r   r8   )	enumerater   dtypenpuint8astyper   ctxcall_hf_processorget_hf_processordictr   itemsrh   rR   rS   r   r   rangeupdate)r_   r   r   r   r   	video_idx	video_arrprocessed_outputsr   r   kvr   r   s               @@r<   _call_hf_processorz/HCXVisionMultiModalProcessor._call_hf_processor   s    %.gkk(B.G.G$H$H 	J 	J Iy"(**/8/?/?/I/I!), IM;;33@@i@@   < 
 
 w<<![[**F[[**F "&!@!@7TY7DD)DD#)>44x#)>44x   "A " " +0022 1 11a&& 13q66A::q66Q;;;;,-aD&q) ;@<&';<< <"#78 EJL&'DEE E"#@A  "    IO       "= = = = = #3v;;//	= = ="#89E E E E E #3v;;//E E E"#@A $$%7888  r>   prompt_textmm_itemshf_processor_mm_kwargstokenization_kwargsc                     dS )NFr8   )r_   r   r   r   r   s        r<   _hf_processor_applies_updatesz:HCXVisionMultiModalProcessor._hf_processor_applies_updates  s	     ur>   out_mm_kwargsc                       j                                         }|j        |j        ddt          dt
          dt          f fdfddD             S )Nra   item_idxmodalityr   c                 Z   ||         |          }|dk    r;|d         j                                         }j                            |          }nP|dk    r;|d         j                                         }j                            |          }nt          |          |         g|z  S )Nrb   r   )re   rc   r   )r   tolistr   rl   ro   NotImplementedError)r   r   r   out_itemlens
num_tokensplaceholderr_   s         r<   get_replacement_hyperclovaxzUHCXVisionMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_hyperclovax#  s    
 %X.x8H7"" =>CJJLL!Y;;PT;UU

W$$ =>CJJLL!Y;;PT;UU

)(333)*Z77r>   c                 b    g | ]+}t          ||         gt          |                     ,S ))r   r   )r   targetreplacement)r    r   )r   r   r   r   r   s     r<   r   zDHCXVisionMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>5  sg     
 
 
  !) $/%"/  
 
 

 
 
r>   )r   r]   image_token_idvideo_token_idri   r|   r   )r_   r   r   r   	hf_configr   r   s   `  ` @@r<   _get_prompt_updatesz0HCXVisionMultiModalProcessor._get_prompt_updates  s     I++--	--
 

	8	8	8 1	8 	8 	8 	8 	8 	8 	8$
 
 
 
 
 
 /
 
 
 	
r>   	hf_inputsc           	          t          t          j        d          t          j        d          t          j        d          t          j        d          t          j        d                    S )Nrb   rc   )rI   rK   r   rW   r   )r   r   batched)r_   r   r   s      r<   _get_mm_fields_configz2HCXVisionMultiModalProcessor._get_mm_fields_configD  sa    
  5 =g F F4<WEE(=(Eg(N(N 5 =g F F(=(Eg(N(N
 
 
 	
r>   N)rM   rN   rO   r|   r   objectr   r   r   boolr   r   r   r!   r   r   r   r8   r>   r<   r   r      s`       G!G! f%G! 3;'	G!
 CK(G! 
G! G! G! G!R & !(V 4	
 %S&[1 
   +
%+
 !(V 4+
 -	+

 
,	+
 +
 +
 +
Z

 !(V 4
 
++	,	
 
 
 
 
 
r>   r   r   c                      t          |           S r\   )rZ   )r   s    r<   _build_hcxvision_hf_infor   R  s     #3'''r>   cacher   dummy_inputsr   c                    t          | t                    rt          | ||          S t          t	          |                     )Nr   )rh   rZ   r   r   rB   )r   r   r   s      r<   _build_hcxvision_hf_processorr   X  sM     $/00 
+
 
 
 	
 d4jj
)
))r>    )use_nth_layerrequire_post_normprefixquant_configr   r   r   c                J   | j         }t          |t                    sn|dk    r|dz   }n||z   dz   }t          | t                    rt	          | ||||          S t          | t
                    rt          | ||||          S dt          |            }t          |          )Nr   r%   )r   num_hidden_layers_overrider   r   zUnsupported vision config: )	num_hidden_layersrh   ri   r   r&   r   r*   rB   r   )vision_configr   r   r   r   r   msgs          r<   init_vision_tower_for_hcxvisionr   h  s     &7mS)) B	!		)A--=A-!122 
%'8/
 
 
 	
 
M#5	6	6 
 %'8/
 
 
 	
 >](;(;
=
=C
c
"
""r>   c                   6     e Zd Zddej        f fd	Zd Z xZS )HCXVisionMlpNc                    t                                                       |p|}|p|}|| _        | j        dk    rEt          j        ||          | _         |            | _        t          j        ||          | _        d S | j        dk    rKt          j        |d|z            | _         |            | _        t          j        d|z  |          | _        d S t          d	                    | j                            )Nmlpinverted_mlprJ   z{} is not implemented)
super__init__mm_projector_typennLinearfc1actfc2r   format)r_   r   in_featureshidden_featuresout_features	act_layer	__class__s         r<   r   zHCXVisionMlp.__init__  s     	#2{)8[!2!U**yo>>DH y{{DHy,??DHHH#~55ya/.ABBDH y{{DHy_!4lCCDHHH%'..t/EFF  r>   c                     |                      |          }|                     |          }|                     |          }|S r\   )r   r   r   )r_   xs     r<   forwardzHCXVisionMlp.forward  s4    HHQKKHHQKKHHQKKr>   )rM   rN   rO   r   GELUr   r  __classcell__r  s   @r<   r   r     sY        
 '     2      r>   r   c                       e Zd ZdZ	 	 ddededededed	ed
ef fdZ	 	 ddej        de	e	e                  dz  de	e         dz  dej        fdZ
	 	 ddej        de	e	e                  dz  de	e         dz  dej        fdZ	 	 ddej        de	e	e                  dz  de	e         dz  de	ej                 fdZ	 	 ddedededededefdZdededefdZ xZS )HCXVisionCAbstractorz
    This module is based on C-Abstractor, whose license is under apache-2.0.
    You can check the original code at
    https://github.com/khanrc/honeybee/blob/main/honeybee/projectors/projectors.py
    and we made necessary modifications.
    TFnum_queriesnum_input_tokensencoder_hidden_sizehidden_sizeoutput_hidden_sizepos_embprenormc                    t                                                       || _        || _        |rZt          j                            t	          j        d||                    | _        | j        j	        
                    dd           nd | _        |rt          |          | _        nd | _        |                     ||||           t          |                                           j        | _        d S )Nr%   g        g{Gz?)meanstd)r   r   r  r  rS   r   	Parameterzerosr  r   normal_r   r  	build_netnext
parametersr   )	r_   r  r  r  r  r  r  r  r  s	           r<   r   zHCXVisionCAbstractor.__init__  s     	 0"4  	  8--A/1DEE DL L%%3D%9999DL  	 $%899DLLDL,k;M	
 	
 	
 $//++,,2


r>   Nr  num_queries_vis_abstractors	num_gridsr6   c                     | j         |                      |          }| j        
|| j        z   }|                     |||          }|S )N)r  r  )r  r  _forward)r_   r  r  r  s       r<   r  zHCXVisionCAbstractor.forward  sW     <#QA<#DL AMM(C  
 
 r>   c                    |j         \  }}}t          |dz            }t          |d||          }||J |                     |||          S |                     |          }t          |d          }|                     |          }|S )N      ?zb (h w) d -> b d h w)rE   rF   b d h w -> b (h w) d)shaperi   r   _forward_adaptive_num_querynetreadout)r_   r  r  r  BLdimhws           r<   r  zHCXVisionCAbstractor._forward  s     G	1cC[[a/2<<<&2(((33.	   HHQKKa/00LLOOr>   c                    t          | j                  dk    sJ  | j        d         |          }g }t          |          D ]\  }}t          |dz            }t	          j        ||f          } ||||         ||dz            d d f                   }	 | j        d         |	          }	t          |	d          }	|                     |	          }	|                    |	           |S )Nr3   r   r   r%   rJ   r!  )	r   r$  r   ri   r   AdaptiveAvgPool2dr   r%  append)
r_   r  r  r  new_xr   r  r)  samplerouts
             r<   r#  z0HCXVisionCAbstractor._forward_adaptive_num_query  s     48}}!!!!DHQKNN'(CDD 		 		NA{[#%&&B*B844G'!IaL9QU+;;QQQ>?@@C$(1+c""CC!788C,,s##CLLr>   r3   rJ   	n_queriesdepth	mlp_depthc                    |dz                                   sJ d|             t          |dz            }t          t          ddt          j        t                    } ||||          }	t	          j        ||f          }
 ||||          }t	          j        |	|
|          | _	        | 
                    |||          | _        d S )Nr   z,n_queries must be square number. n_queries: r%   )stridedilationr  
norm_layer)
is_integerri   r   r   r   SiLUr   r+  
Sequentialr$  	build_mlpr%  )r_   r0  r  r  r  r1  r2  r)  RegBlocks1r.  s2s               r<   r  zHCXVisionCAbstractor.build_net  s     3**,, 	
 	
F9FF	
 	
, C   g"
 
 
 X
 

 &Bx00X
 
 =Wb11~~i>PQQr>   c                    t          j        ||          g}t          d|          D ]P}|                    t          j                               |                    t          j        ||                     Qt          j        | S )Nr%   )r   r   r   r,  r8  r9  )r_   r1  r  r  layers_s         r<   r:  zHCXVisionCAbstractor.build_mlp?  sz     )K);<<=q% 	M 	MAMM"'))$$$MM")$68JKKLLLL}f%%r>   )TFNN)r3   rJ   )rM   rN   rO   rP   ri   r   r   rS   rT   rR   r  r  r#  r  r:  r  r  s   @r<   r
  r
    s[          3  3 3  3 !	 3
  3   3  3  3  3  3  3  3  3J ?C&*	 < &*$s)_t%; 9t#	
 
   . ?C&*	 < &*$s)_t%; 9t#	
 
   2 ?C&*	 < &*$s)_t%; 9t#	
 
el	   : $R $R$R !$R 	$R
  $R $R $R $R $R $RL
&
& 
&  	
& 
& 
& 
& 
& 
& 
& 
&r>   r
  )r   r   c                       e Zd Zg dddgdZdddeded	d
f fdZededed	ed
z  fd            Z	de
d	ed
z  fdZde
d	ed
z  fdZded	eej        df         fdZded	eej        df         fdZde
d	efdZde
d	efdZ	 	 d*dej        dej        ded
z  dej        d
z  de
d	ej        ez  fdZdeej                 dej        d	eej        df         fd Zd!eeej                          d	eej        df         fd"Zde
fd#Zd$ej        d	ej        d
z  fd%Zd&eeeej        f                  d	e e         fd'Z!d( Z"d) Z# xZ$S )+HCXVisionForCausalLM)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr   )r   vllm_configr   r6   Nc                   t                                                       |j        j        }|j        }|j        }|j        dv rd|_        |j        dk    rd|_        |j	        }i |_
        |j        |_        |j        |_        |j        j        | _        |                     ||          |_        |                     |ddh          5  t#          ||t%          |dd          d	t'          |d
                    | _        |                     |||          | _        |j        r7t/          j        t3          j        |j        | j                            | _        d d d            n# 1 swxY w Y   |                     |          5  t=          ||t'          |d                    | _        d d d            n# 1 swxY w Y   || _         || _	        || _        d S )N)gpt2hyperclovaxllamasdparN  g      ?rb   rc   r   Fvision_model)r   r   r   r   r   language_model)rK  r   r   )!r   r   model_configr   r   text_config
model_type_attn_implementationlogits_scalingr   auto_mapanyresr9   r   _init_possible_resolutionspossible_resolutions_mark_tower_modelr   getattrr.   rR  _init_mm_projectormm_projectorr   r  rS   emptyr  image_newline_mark_language_modelr-   rT  config)r_   rK  r   re  r   rV  r   r  s          r<   r   zHCXVisionForCausalLM.__init__W  so    )3"/(!%EEE/5K,!]22),K&,!#%}&,&:# -3
 '+&E&EM'
 '
# ##K'71CDD 	 	 ?)%forBB"'#FN;;! ! !D !% 7 7]! !D } %'\K 7tzJJJ& &"	 	 	 	 	 	 	 	 	 	 	 	 	 	 	" &&{33 	 	"<'%#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 *&s%   BE%%E),E)&F77F;>F;r   r   c                     |                     d          rt          S |                     d          rt          S t          d          )Nrb   rc   z)Only image or video modality is supported)
startswithr0   r1   
ValueError)clsr   r   s      r<   get_placeholder_strz(HCXVisionForCausalLM.get_placeholder_str  sI    w'' 	w'' 	DEEEr>   kwargsc                     |                     dd           }|d S |                     d          }t          ||          S NrI   rK   )rI   rK   )popr@   )r_   rk  rI   rK   s       r<   _parse_and_validate_image_inputz4HCXVisionForCausalLM._parse_and_validate_image_input  sT     %jj)>EE&4#ZZ(<==( 31
 
 
 	
r>   c                 V    |                     dd           }|d S t          |          S NrW   )rW   )rn  rV   )r_   rk  rW   s      r<   _parse_and_validate_video_inputz4HCXVisionForCausalLM._parse_and_validate_video_input  s>     %jj)>EE&4( 3
 
 
 	
r>   image_input.c                 H    |                      |d         |d                   S rm  )forward_images)r_   rs  s     r<   _process_image_inputz)HCXVisionForCausalLM._process_image_input  s3     "" +,A B*+?@ # 
 
 	
r>   video_inputc                 :    |                      |d                   S rq  )forward_videos)r_   rw  s     r<   _process_video_inputz)HCXVisionForCausalLM._process_video_input  s*     "" +,A B # 
 
 	
r>   c                 |    i }|D ]6}|dk    rd|vr | j         di ||d<   |dk    rd|vr | j        di ||d<   7|S )NrI   r   rW   r   r8   )ro  rr  )r_   rk  
modalities	input_keys       r<   %_parse_and_validate_multimodal_inputsz:HCXVisionForCausalLM._parse_and_validate_multimodal_inputs  s    
   	V 	VI111hj6P6P'Kt'K'U'Uf'U'U
8$111hj6P6P'Kt'K'U'Uf'U'U
8$r>   c                 
    | j         di |}|sg S d}|D ]l}|dk    r/|d         }|                     |          }|t          |          z  }|dk    r/|d         }|                     |          }|t          |          z  }m|S )Nr8   r   r   )r~  rv  tuplerz  )	r_   rk  r|  multimodal_embeddingsr   rs  image_embeddingsrw  video_embeddingss	            r<   embed_multimodalz%HCXVisionForCausalLM.embed_multimodal  s     @T?II&II
 	I ;= # 	A 	AH8##(2#'#<#<[#I#I %/?)@)@@%8##(2#'#<#<[#I#I %/?)@)@@%$$r>   	input_ids	positionsintermediate_tensorsinputs_embedsc                 J    |d }| j                             ||||          }|S )N)r  )rT  model)r_   r  r  r  r  rk  hidden_statess          r<   r  zHCXVisionForCausalLM.forward  s>      + M+11y"6m 2 
 
 r>   rI   rK   c           
         t          |d          }d| j        j        v rdnd}|                     |          d d |d f         }|                    | j        j                  }|                     |          }d |D             }t          j        ||d          }t          ||
                                | j        j        | j        j        | j        j        | j        j        | j        | j        j        	          }t%          |          S )
NTconcatsiglipr   r%   rS  c                 ,    g | ]}t          |          S r8   r   )r   items     r<   r   z7HCXVisionForCausalLM.forward_images.<locals>.<listcomp>  s    AAATs4yyAAAr>   r(  )image_forward_outsimage_sizesnum_queries_vis_abstractorunpad
patch_size	grid_sizerc  r]  )r,   r   rW  rR  tora  r   rS   splitanyres_postprocessingr   re   num_queries_vis_abstractor_imager  r  
image_sizerc  r]  r  )r_   rI   rK   pixel_values_image_flatvisual_token_idxr  split_sizesimage_featuress           r<   ru  z#HCXVisionForCausalLM.forward_images  s'   
 #--@"N"N"N (D,>,I I I11q!../FGGAA   
 0229J9P2QQ!../ABBAA-@AAA"[);[aPPP /1*1133'+{'S+#)4(3,!%!A	
 	
 	
 ^$$$r>   rW   c                 
   t          d |D             d          }d| j        j        v rdnd}|                     |          d d |d f         }|                    | j        j                  }d}|g}g }|j        d         }| j        j	        r|dk    sJ |dk    r;|
                    | j        j                   ||z  }|
                    |           nN|
                    | j        j                   |dz  }|
                    |           |
                    | j        j                   ||dz
  z  }|
                    |           |
                    | j        j                   |dz  }|
                    |           n|D ]}	|	D ]}
t          |
          dk    r|
                    | j        j                   |dz  }|
                    |           |
                    | j        j                   |t          |
          z   dz
  }|
                    |           |                     |||          }g g }d}d}d	 |D             }|D ]}|t          |          z  }|
                    |                    dd                     ||         }||k    r3
                    t          j        |d
                     g }|dz  }d}~||k     rt#          d|d|          t          |          dk    sJ d|             t          |          t                    k    sJ d |D             }dgt%          |          t'          fdt)          t          |                    D                       S )Nc                     g | ]	}|D ]}|
S r8   r8   r   framesframes      r<   r   z7HCXVisionForCausalLM.forward_videos.<locals>.<listcomp>!  s%    IIIv&IIUIIIIr>   Tr  r  r   r%   rS  rJ   c                 6    g | ]}|D ]}t          |          S r8   r  r  s      r<   r   z7HCXVisionForCausalLM.forward_videos.<locals>.<listcomp>h  s?     
 
 
!f
 
=BCJJ
 
 
 
r>   r  zvideo_group_size=z < target_group_size=ztarget_features is not empty!! c                 ,    g | ]}t          |          S r8   r  r   s     r<   r   z7HCXVisionForCausalLM.forward_videos.<locals>.<listcomp>  s    GGG%3u::GGGr>   c              3   n   K   | ]/}t          j        |         |d z                               V  0dS )r%   N)rS   cat)r   r   idxs_per_videovideo_featuress     r<   r   z6HCXVisionForCausalLM.forward_videos.<locals>.<genexpr>  sY       
 
 In^A%6A9N%NOPP
 
 
 
 
 
r>   )r,   r   rW  rR  r  ra  r   r"  re  first_last_frames_slowr,  %num_queries_vis_abstractor_video_slow%num_queries_vis_abstractor_video_fastr   flattenrS   r  RuntimeErrorr   r  r   )r_   rW   pixel_values_videos_flatr  video_forward_outsgrid_idxr  r  len_total_framespixel_values_framespixel_values_frametarget_featurestarget_group_sizegroup_countervideo_groupsforward_outvideo_group_sizefeats_per_videor  r  s                     @@r<   ry  z#HCXVisionForCausalLM.forward_videos  s    $.II!4III$
 $
 $
 
 !)D,>,I I I11q!../GHHAA   
 0229J9P2QQ J	&(#-3A6;- )	3#q((((1$$+22KE   ,,  ****+22KE   A  ***+22KE   ,q00  ***+22KE   A  **** (; 3 3#*= 3 3&-..223:: KM   !A!((2223:: KM   $,c2D.E.E#E#I!((2223 ".. ;Y
 
 
 
%8
 
 
 . 	R 	RK[!1!11"";#6#6q!#<#<===+M:#444%%eiQ&G&G&GHHH"$"$%!!!$555"#P&6#P#P<M#P#PQQQ 6 ?##q(((?o?? )(( <  C$7$77777GG3FGGG:j99: 
 
 
 
 
3//00
 
 
 
 
 	
r>   c           	      $   t          t                    }|                                D ]X\  }}t          |          dk     st          |d                   dk     r3|d}}|                    d          s|                    d          snS|                    d          d d         |                    d          d         }}d                    |          }|dk    }t          |          D ]\  }}|d	vrt          ||                   |dz   k     r'||                             t                                 |	                                
                                                                                                }||         |xx         |z  cc<   t          |t          j                  rt          ||                   |dz   k     rN||                             t                                 |d
                             t                                 t          t          j        |d                    }||         |xx         |z  cc<   |d
         |xx         |gt          |          z  z  cc<   Zt#          |          S )Nr%   r   F_images_videosr@  rQ  r   )rA   	is_videosr  )r   rR   r   r   endswithr  joinr   r,  detachcpunumpyr   rh   rS   rT   unbindr   )	r_   rk  outputr   r   new_kis_video_sample_idx_vs	            r<   _prepare_multimodal_kwargsz/HCXVisionForCausalLM._prepare_multimodal_kwargs  sb   T""LLNN 	  	 DAq1vvzzS1YY]]8E::i(( 0I1F1F 0"#''#,,ss"3QWWS\\"5Ex#x/#,Q<<    R 0006%=))K!O;;u,,TVV444**002299;;B5M+..."4....EL11  6%=))K!O;;u,,TVV444{+22466:::el2155566B5M+..."4...;'444 9B9   444  F||r>   r  c                 6    | j                             |          S r\   )rT  compute_logits)r_   r  s     r<   r  z#HCXVisionForCausalLM.compute_logits  s     "11-@@@r>   weightsc                 J    t          |           }|                    |          S r\   )r+   load_weights)r_   r  loaders      r<   r  z!HCXVisionForCausalLM.load_weights  s%     #4((""7+++r>   c                 ^   t          |dg           sg }|j        r|j        dk    sJ t          d|j        dz             D ]V}t          d|j        dz             D ];}|dk    r|dk    r|j        s||z  |j        k    r|                    ||g           <Wfd|D             }|S |j        S )Nr]  r   r%   c                 >    g | ]\  }}|j         z  |j         z  gS r8   )r  )r   ysxsr   s      r<   r   zCHCXVisionForCausalLM._init_possible_resolutions.<locals>.<listcomp>  sB     ( ( (B -22B9Q4QR( ( (r>   )r_  r[  r9   r   use_1x1_gridr,  r]  )r_   re  r   r]  r   js     `   r<   r\  z/HCXVisionForCausalLM._init_possible_resolutions  s
   
 v5r:: 	/#% } +a////q&"6":;; @ @A"1f&:Q&>?? @ @66a1ffV5Hf$q5F$888077A???	@( ( ( ("6( ( ($ ('..r>   c           	         |j         }|j        dk    rFt          j        ||j                   }t	          |                                          j        |_        nh|j        dk    r;t          |j        |j	        |j
        z  dz  |||j         |j        |j                  }n"t          |j        ||| j        j                   }|S )NlinearcabstractorrJ   )r  r  r  r  r  r  r  )r   r   )r  r   r   r   r  r  r   r
  r  r  r  proj_pos_embproj_prenormr   rV  )r_   re  rV  r   input_hidden_sizera  s         r<   r`  z'HCXVisionForCausalLM._init_mm_projector  s     *5#x//9%68OPPL!%l&=&=&?&?!@!@!FL%66/"C"/":m>V"V"$5-#.#:++	 	 	LL ((! 1!-9	  L r>   rA  )%rM   rN   rO   packed_modules_mappingr   r|   r   classmethodri   rj  r   HCXVisionImageInputsro  HCXVisionVideoInputsrr  r  rS   rT   rv  rz  r   r~  r'   r  r"   r  rR   ru  ry  r  r  r   setr  r\  r`  r  r  s   @r<   rC  rC  L  sn        322$i0 
 BD 2' 2' 2'z 2'3 2' 2' 2' 2' 2' 2' 2'n F3 F3 F3: F F F [F

 
	$
 
 
 
 

 
	$
 
 
 

)
 
u|S 	!
 
 
 

)
 
u|S 	!
 
 
 
f     %% 
% % % %< <@-1 < < 2D8	
 |d*  
+	+    %!%,/% "L% 
u|S 	!	% % % %@h
!$u|"45h
 
u|S 	!h
 h
 h
 h
T6    >A|A 
	A A A A,%U\ 123, 
S, , , ,/ / /0      r>   rC  r   original_sizec                 "   |\  }}| j         dd          \  }}||z  }||z  }||k    r4||z  }t          ||z            }	||	z
  dz  }
| d d |
||
z
  d d f         }n3||z  }t          ||z            }||z
  dz  }
| d d d d |
||
z
  f         }|S )Nr%   rJ   )r"  ri   )r   r  original_widthoriginal_heightcurrent_heightcurrent_widthoriginal_aspect_ratiocurrent_aspect_ratioscale_factor
new_heightpaddingunpadded_tensor	new_widths                r<   unpad_imager    s    &3#NO$*L$4!NM*_<(>9333$~5<788
!J.14 Gnw.F$F!IJ%7566	 9,2 AAAw1H'H!HIr>   r]  c                 *   | \  }}d }d}t          d          }|D ]w\  }}t          ||z  ||z            }	t          ||	z            t          ||	z            }}
t          |
|z  ||z            }||z  |z
  }||k    s||k    r||k     r|}|}||f}x|S )Nr   inf)floatminri   )r  r]  r  r  best_fitmax_effective_resolutionmin_wasted_resolutionrr   rq   scaledownscaled_widthdownscaled_heighteffective_resolutionwasted_resolutions                 r<   select_best_resolutionr    s    &3#O^H !%LL- ' 'EN*F_,DEE&''%'(( ,  #00.?2R 
  
 #V^/CC"::: $<<<!$999';$$5!HOr>   r  grid_pinpointsr  c                     t          |t                    r|nt          j        |          }| \  }}t	          ||f|          \  }}||z  ||z  fS r\   )rh   rR   astliteral_evalr  )r  r  r  r]  r  r  rr   rq   s           r<   get_anyres_image_grid_shaper    sn     nd++	.n--  '1#NO*	.)+? MFE J* 444r>   image_featurerr   rq   r  r  rc  c           	      h   | d         }| dd          } ||z  |j         d         k    sJ d|d|d|j         d                     t          |||          \  }	}
|                     |
|	||d          } |r|                     ddddd	                                          } |                     dd                              dd	          } t          | |          } t          j        |  |d d d d f         j	        g | j         d d         dR  
                    | j                  fd
          } |                     dd                              dd          } nA|                     dddd	d                                          } |                     dd	          } t          j        || fd
          } | S )Nr   r%   zheight=z	 * width=z  != base_image_feature.shape[0]=rQ     rJ   r3   r  )r"  r  viewpermute
contiguousr  r  rS   r  expandr  device	transpose)r  rr   rq   r  r]  r  r  rc  base_image_featurenum_patch_widthnum_patch_heights              r<    reshape_and_unpad_image_featuresr  ,  s
    'q)!!""%ME>/5a8888A6AAuAA#5#;A#>AA 988 )D()) )%O% "&&/65" M  4%--aAq!<<GGII%--a33;;AqAA#M:>>	aaatm,6&,SbS16346 6 6M())	 
 
 
 &--a33==aCC%--aAq!<<GGII%--a33I1=AqIIIMr>   rQ  Fr  r  r  c                    ||z  x}}	|dk    r5|dz                                   s
J d            t          |dz            x}}	g }
t          |           D ]\  }}|j        d         dk    rt	          |||	||         ||||          }n>|d         }t          j        ||d                              |j                  fd          }|
	                    |           |
S )Nr   r   zn_queries must be square numberr%   )r  rr   rq   r  r]  r  r  rc  r  )
r7  ri   r   r"  r  rS   r  r  r  r,  )r  r  r]  r  r  rc  r  r  rr   rq   new_image_features	image_idxr  s                r<   r  r  Z  s3    *,,FU!A%%*C/;;== 	
 	
-	
 	
= 7<=== $-.@$A$A 1 1 	=q!A%%<+&y1%9#+	 	 	MM *!,M!Id 3 6 6}7K L LMST  M 	!!-0000r>   )r2   )rQ  F)ir  collectionsr   collections.abcr   r   r   	functoolsr   	itertoolsr   typingr	   r
   r  r   rS   torch.nnr   einopsr   timm.layersr   r   timm.models.regnetr   transformersr   r   r   vllm.configr   vllm.config.multimodalr   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.cacher   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   r   r    r!   vllm.sequencer"   vllm.utils.tensor_schemar#   r$   clipr&   
interfacesr'   r(   r)   r  r*   utilsr+   r,   r-   r.   visionr/   EOTr0   r|   rQ   r1   ri   r  r=   r@   r  rV   r  rZ   r~   r   r   r   r   r   Moduler   r
  register_processorrC  rT   r  rR   r  r  r  r  r8   r>   r<   <module>r2     s   



 # # # # # # 7 7 7 7 7 7 7 7 7 7                   % % % % % % % %                  . . . . . . . . ' ' ' ' ' ' K K K K K K K K K K " " " " " " 3 3 3 3 3 3 F F F F F F / / / / / / > > > > > >         
 A @ @ @ @ @ @ @                . - - - - - > > > > > > > > ! ! ! ! ! ! L L L L L L L L L L % % % % % %            , + + + + +S   .S . . . '-
0 
0
0#s(O
0 	
0 
0 
0 
0E E E E E E E E" 1        $ 1 &
 &
 &
 &
 &
0 &
 &
 &
R'
 '
 '
 '
 '
"89P"Q '
 '
 '
TK
 K
 K
 K
 K
#:;R#S K
 K
 K
\(	(( ( ( ( 26	* * *
!*()@A* ($.	*
 * * * *( !%%)"# "# "#$t+"# :	"#
 d{"# "# (("# "# "# "#J    29   B[& [& [& [& [&29 [& [& [&| ('!	!,  
U U U U U29&8* U U 
Up U38_     *% t PU    85c3h5$uS#X//5 5 38_	5 5 5 5$+<++ + c3h	+
 uS#X/+ + + <+ \+ + + +j ')' 'U\*'d3i' uS#X/' 	'
 ' <' !$' ' 
%,' ' ' ' ' 'r>   