
    .`i                        U d dl Z d dlmZmZmZ d dlmZ d dl mZmZ d dl	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlmc mZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZ d d
l m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6m7Z7 d dl8m9Z9m:Z: d dl;m<Z<m=Z=m>Z>m?Z?m@Z@mAZA d dlBmCZC d dlDmEZE d dlFmGZG d dlHmIZImJZJ ddlKmLZLmMZMmNZN ddlOmPZPmQZQmRZRmSZS ddlTmUZU  G d deI          ZV G d  d!eI          ZWeVeWz  ZXeeYd"<   eZej        e[ej                 e[e\         dz  f         Z]d#Z^e\eYd$<    G d% d&          Z_ G d' d(          Z` G d) d*          Za G d+ d,e>          Zb G d- d.e<eb                   Zc G d/ d0e=eb                   Zdd1 Ze G d2 d3ejf                  Zg G d4 d5ejf                  Zh G d6 d7ejf                  Zi G d8 d9ejf                  Zj G d: d;ejf                  Zk G d< d=ejf                  Zl e3jm        edebec>           G d? d@ejf        eMeN                      ZndS )A    N)IterableMappingSequence)product)ceilsqrt)	AnnotatedAnyLiteral	TypeAlias)Image)
transforms)InterpolationMode)BatchFeaturePretrainedConfig
TensorType)
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)
get_act_fn)MMEncoderAttention)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TokenizerLike)Step3VisionEncoderConfig)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)run_dp_sharded_vision_modelc                       e Zd ZU dZed         ed<   eej         e	dddd          f         ed<   eej        dz   e	d	dd
d          f         ed<   eej         e	d          f         ed<   dS )Step3VLImagePixelInputsa  
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width
        - bnp: Batch size * number of images * number of patches
        - hp: Height of patch
        - wp: Width of patch
    pixel_valuestypebn   hwNbnphpwppatch_pixel_valuesnum_patches)
__name__
__module____qualname____doc__r   __annotations__r	   torchTensorr-        w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/step3_vl.pyr8   r8   <   s         	 	 .
!!!!EL++dAsC*H*HHIIII!t[[4>>>    5<T):)::;;;;;;rL   r8   c                   h    e Zd ZU dZdZed         ed<   eej	         e
ddd          f         ed<   dS )	Step3VLImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - f: Image feature size
        - h: Hidden size (must match the hidden size of language model backbone)
    image_embedsr:   r;   fr=   dataN)rD   rE   rF   rG   r:   r   rH   r	   rI   rJ   r-   rK   rL   rM   rO   rO   P   sY           %3D'.
!222
EL++dC"="==
>>>>>>rL   rO   Step3VLImageInputs  MAX_IMAGE_SIZEc                       e Zd ZddZddZdS )Step3VisionProcessorbicubicNc           	         g d}g d}||n|}t          j        t          j                    t          j        ||          t          j        ||f|dk    rt
          j        nt
          j        d          g          | _        |nt          j        t          j                    t          j        ||          t          j        ||f|dk    rt
          j        nt
          j        d          g          nd | _	        d S )N)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?rX   T)interpolation	antialias)
r   ComposeToTensor	NormalizeResizer   BICUBICBILINEAR	transformpatch_transform)selfsizeinterpolation_mode
patch_sizemeanstds         rM   __init__zStep3VisionProcessor.__init__d   s%   222222#-#9ZZt
#+#%%$T3//!4L)Y66 #4";";*3"  

 
8 % '))(s33%#Z0-:: '8&?&?.7"&  
    	rL   Fc                     |r*d|                      |                              d          iS d|                     |                              d          iS )Nr9   r   )rc   	unsqueezerb   )rd   imageis_patchs      rM   __call__zStep3VisionProcessor.__call__   sW     	H"D$8$8$?$?$I$I!$L$LMM"DNN5$9$9$C$CA$F$FGGrL   )rX   NF)rD   rE   rF   rj   ro   rK   rL   rM   rW   rW   c   sC        #
 #
 #
 #
JH H H H H HrL   rW   c                      e Zd ZddeddfdZdededefdZ	 d d
ededeeeef                  deeeef                  de	deeeeeeef                  eeef         f         fdZ
dej        dej        fdZdededeeef         fdZdededeeef         fdZdededefdZdej        dedededef
dZdededeeef         fdZdej        deej        eej                 ee         dz  f         fdZdS )!ImagePatcherTenable_patchreturnNc                     || _         d S Nrs   )rd   rs   s     rM   rj   zImagePatcher.__init__   s    (rL   longshortc                 ^    |dk     r||z  dk    r|ndS ||z  dk    rt          |d          ndS )N  g      ?r        )min)rd   rx   ry   s      rM   determine_window_sizez"ImagePatcher.determine_window_size   sC    #:: 5L3..55A5"&,"2"2s5#;rL   333333?widthheightsizesstepsimg_rate_thrc           	      V   d|cxk    rdk    sn J d            g }t          ||          D ]U\  }}|\  }	}
|\  ||	k    rdnt          ||	z
  z  dz             }fdt          |          D             }t          |          dk    r|d         |	z   |k    r||	z
  |d<   ||
k    rdnt          ||
z
  z  dz             }fdt          |          D             }t          |          dk    r|d         |
z   |k    r||
z
  |d<   t	          j        t          t          ||                    t                    }|d d ddgf         |d d ddgf<   |	                    t	          j
        |||z   gd                     Wt	          j
        |d          }d	 |D             ||ffS )
Nr.   r   z#The `in_rate_thr` should lie in 0~1c                     g | ]}|z  S rK   rK   ).0istep_ws     rM   
<listcomp>z-ImagePatcher.slide_window.<locals>.<listcomp>       888avz888rL   c                     g | ]}|z  S rK   rK   )r   r   step_hs     rM   r   z-ImagePatcher.slide_window.<locals>.<listcomp>   r   rL   dtype)axisc           
          g | ]f}t          |d                    t          |d                   t          |d         |d          z
            t          |d         |d         z
            fgS )r   r.      r<   int)r   boxs     rM   r   z-ImagePatcher.slide_window.<locals>.<listcomp>   sm     
 
 
 Q[[#c!f++s3q6CF?';';SQ#a&=Q=QR
 
 
rL   )zipr   rangelennparraylistr   r   appendconcatenate)rd   r   r   r   r   r   windowsre   stepsize_wsize_hx_numx_starty_numy_startstartr   r   s                   @@rM   slide_windowzImagePatcher.slide_window   s    L%%%%A%%%%%'L%%%eU++ 	J 	JJD$!NFF!NFF&AAdEFNf3Lq3P.Q.QE88885<<888G7||aGBK&$85$@$@#fn6))AAtVf_4NQR4R/S/SE88885<<888G7||aGBK&$86$A$A$voHT''7";";<<CHHHE$QQQAY/E!!!aV)NN2>5%$,*?aHHHIIII.q111
 

 
 
 5> 	rL   imgc                     |j         \  }}||k    r|S t          ||          }t          j        |j        ||fd          }|                    |d           |S )Nr   r   r   )re   maxr   newmodepaste)rd   r   r>   r=   re   paddeds         rM   
square_padzImagePatcher.square_pad   sZ    x166J1ayy38dD\155S&!!!rL   	img_width
img_heightc                 |    ||z  }t          ||          dk     r |dk    s|dk     rt          ||          }||fS ||fS )N    r|   g      ?)r~   r   )rd   r   r   rationew_sizes        rM   get_image_size_for_paddingz'ImagePatcher.get_image_size_for_padding   sX     J&z9%%**		UU]]:y11HX%%*$$rL   c                     t          ||          t          k    r<t          t          ||          z  }t          ||z            }t          ||z            }||fS rv   )r   rU   r   )rd   r   r   scale_factors       rM   get_image_size_for_preprocessz*ImagePatcher.get_image_size_for_preprocess   s\     z9%%66)C
I,F,FFLI455IZ,677J*$$rL   window_sizec                 F   ||z  }||z  }|dk     r|}n4|||z  z
  }|dk    rt          |          dz   nt          |          }||z  }|dk     r|}n4|||z  z
  }	|	dk    rt          |          dz   nt          |          }||z  }t          |          t          |          fS )Nr.   g?r   )
rd   r   r   r   w_ratioh_ratio	width_new	decimal_w
height_new	decimal_hs
             rM   get_image_size_for_cropz$ImagePatcher.get_image_size_for_crop   s     k){*Q;;!II){"::I*3c//c'llQ&&s7||G#g-IQ;;#JJ*";;I*3c//c'llQ&&s7||G$w.J9~~s:..rL   r   jthtwc                 D    |                     ||||z   ||z   f          }|S rv   )crop)rd   r   r   r   r   r   targets          rM   
patch_cropzImagePatcher.patch_crop   s)    1aRR011rL   c                    |                      ||          \  }}|                     ||          \  }}|                     t          ||          t	          ||                    }|dk    s| j        sdS |                     |||          \  }}|                     ||||fg||fg          \  }\  }}t          |          dz
  |z  dz   }t          |          dk    rt          |          |z  dk    r|dz  }t          |          |fS )Nr   r   r.   )	r   r   r   r   r~   rs   r   r   r   )rd   r   r   r   center_listr   r   	full_rowss           rM   get_num_patcheszImagePatcher.get_num_patches   sF    $ ? ?	: V V	: $ B Bz!
 !
	: 00
I&&J	(B(B
 
 !4#44$($@$@:{% %!Iz +/*;*;{+,{+,	+ +'K% [))A-%7!;I;!##K(8(85(@A(E(EQ	{##Y..rL   c                 N   |j         \  }}|                     ||          \  }}||k    s||k    r|                     |          }|j         \  }}|                     ||          \  }}|                    ||ft
          j        j                  }|                     t          ||          t          ||                    }|dk    s| j        s|g d fS |                     |||          \  }}||f||fk    r(|                    ||ft
          j        j                  }n|}g }g |                     ||||fg||fg          \  }	\  }
}t          |	          D ][\  }}|\  }}}}|                     |||||          }|                    |           |dz   |
z  dk    r                    |           \r0d         t#          |          dz
  k    r                                 ||t#          |          dk    r(fdt'          t#          |                    D             nd fS )Nr   r.   r   c                     g | ]}|v S rK   rK   )r   r   newliness     rM   r   z)ImagePatcher.__call__.<locals>.<listcomp>=  s    <<<1h<<<rL   )re   r   r   r   resizer   
Resamplingra   r   r   r~   rs   r   r   	enumerater   r   r   popr   )rd   r   r   r   new_img_widthnew_img_heightr   img_for_croppatchesr   r   r   patch_idcenter_lf_pointxypatch_wpatch_h	big_patchr   s                      @rM   ro   zImagePatcher.__call__  s    !$	:(,(G(Gz)
 )
%~ I%%:)E)E//#&&C$'H!Iz(,(J(Jz)
 )
%~ jj-8%:J:STT00..NM0R0R
 
 !4#4D= ,0,H,H~{- -)M> ~.9j2III"zz"N3U5E5N     #GH*.*;*;{+,{+,	+ +'K% .7{-C-C . .)/)8&1gw OOL!QQQ	y)))qLE)Q..OOH--- HRLCLL1,<<< w<<!## =<<<c'll(;(;<<<< rL   )T)r   )rD   rE   rF   boolrj   r   r   r   tuplefloatr   r   r   r   r   r   r   r   ro   rK   rL   rM   rr   rr      sd       ) )T )T ) ) ) )<# <c <c < < < < "! !! ! E#s(O$	!
 E#s(O$! ! 
tE#sC,-.c3h?	@! ! ! !Fek ek    %%*-%	sCx% % % %%%*-%	sCx% % % %//*-/<?/ / / /(ek c c s     / /# /%S/ / / / /28;8	u{D-tDzD/@@	A8 8 8 8 8 8rL   rr   c                       e Zd Zdededdf fdZedefd            Zdededefd	Z	d
e
ej                 de
e         fdZ	 dd
e
ej                 dede
ej                 fdZdede
e         dz  deee
e         f         fdZdedeee
e         f         fdZdedede
e         dz  deee
e         f         fdZdedede
e         defdZ	 	 	 ddee
e         z  dz  d
ej        e
ej                 z  dz  deez  dz  defdZ xZS )Step3VLProcessorconfig	tokenizerrt   Nc                    t                                                       || _        || _        d| _        d| _        t          | j        d| j                  | _        d| _        d| _	        d| _
        | j
        | j        z  | _        | j
        | j	        z  | _        t          | j        j        dd          }t          |	          | _        d S )
Nr{   r}   bilinear   Q   
<im_patch>rs   Trw   )superrj   r   r   
image_sizerg   rW   image_preprocessornum_image_feature_sizenum_patch_feature_sizeimage_tokenimage_feature_placeholderpatch_feature_placeholdergetattrvision_configrr   patcher)rd   r   r   rs   	__class__s       rM   rj   zStep3VLProcessor.__init__D  s    
 	""6OZ#
 #
 '*#&(#')-)9D<W)W&)-)9D<W)W& t{8.$OO#>>>rL   c                 J    | j                                         | j                 S rv   )r   	get_vocabr   rd   s    rM   image_token_idzStep3VLProcessor.image_token_id^  s    ~''))$*:;;rL   r   r   c                 t    | j                             ||          \  }}|| j        dz   z  | j        z   dz   |z   S )Nr   )r   r   r   r   )rd   r   r   rC   num_newliness        rM   get_num_image_tokensz%Step3VLProcessor.get_num_image_tokensb  sR    $(L$@$@J$W$W!\ 46:;)* 	
rL   imagesc                 d    g }|D ]*}|                     |                     |                     +|S rv   )r   r   )rd   r  resultr   s       rM   _split_imageszStep3VLProcessor._split_imagesl  s<     	- 	-CMM$,,s++,,,,rL   Frn   c                 $      fd|D             S )Nc                 J    g | ]}                     |           d          S )rn   r9   )r   )r   r   rn   rd   s     rM   r   zDStep3VLProcessor._convert_images_to_pixel_values.<locals>.<listcomp>w  sA     
 
 
 ##C(#;;NK
 
 
rL   rK   )rd   r  rn   s   ` `rM   _convert_images_to_pixel_valuesz0Step3VLProcessor._convert_images_to_pixel_valuesr  s4    

 
 
 
 

 
 
 	
rL   rC   patch_newline_maskc                    d}g }t          |          D ]}t          |          |k    sJ |d| j         dz  }|                    | j                            d          g| j        g| j        z  z   | j                            d          gz              |r:||         r2|dz  }|                    | j                            d                     ||fS )N z<patch_start>z<patch_end>z<patch_newline>)	r   r   r   extendr   convert_tokens_to_idsr   r   r   )rd   rC   r
  text	token_idsr   s         rM   _get_patch_replz Step3VLProcessor._get_patch_repl|  s   
 	{## 	 	A)**k9999OD$BOOOOD55oFFG&'$*EEF>77FFGH  
 " &8&; ))  N889JKK   YrL   
num_imagesc                     d| j          d}| j                            d          g| j        g| j        z  z   | j                            d          gz   }||z  ||z  fS )Nz
<im_start>z<im_end>)r   r   r  r   r   )rd   r  r  r  s       rM   _get_image_replz Step3VLProcessor._get_image_repl  s}     ED:DDD^11,??@"#d&AAB~33J??@A 	
 j )j"888rL   patch_new_line_idxc                     |dk    r|                      ||          \  }}nd}g }|                     |          \  }}||z   ||z   fS )Nr   r  )r  r  )rd   r  rC   r  
patch_replpatch_repl_ids
image_replimage_repl_idss           rM   _get_image_repl_featuresz)Step3VLProcessor._get_image_repl_features  sh     ??)-)=)=/* *&J JN%)%9%9*%E%E"
NJ&(GGGrL   r  placeholderreplsc                 \   |                     |          }t          |          dz
  t          |          k    rt          d          |d         g}t          |          D ]8\  }}|                    |           |                    ||dz                       9d                    |          S )Nr.   zEThe number of placeholders does not match the number of replacements.r   r  )splitr   
ValueErrorr   r   join)rd   r  r  r  partsr  r   repls           rM   replace_placeholderz$Step3VLProcessor.replace_placeholder  s    

;''u::>SZZ''W   ( '' 	( 	(GAtMM$MM%A,''''wwvrL   return_tensorsc                     |g }t          |t                    s|g}|g }t          |t                    s|g}t          |          dk    ri }                     |          }n                     |          }g }g }g }	g g }
g }|D ]\  }}}|                                         |g                     t          |          dk    r*|                                         |d                     |                    t          |                                          dt          |          |          \  }}                    |           |
                    |           ||	                    |           t          j
        |          |d}|rt          j
        |          |d<   |	r#t          j        |	t          j                  |d<    fd	|D             }                     |          }t          i |||
          S )Nr   Tr  r.   )r9   rC   rB   r   r
  c                 H    g | ]}                     |j                  S rK   )r$  r   )r   timage_repl_str_lstrd   s     rM   r   z-Step3VLProcessor.__call__.<locals>.<listcomp>  s?        ((D,<>PQQ  rL   )tensor_type)
isinstancer   r   r   r  r  r	  r   r  rI   cattensorr   r   )rd   r  r  r%  image_inputstext_inputssplitted_images_datapixel_values_lstpatch_pixel_values_lstpatch_newline_mask_lstimage_repl_ids_lstrC   raw_imgimg_patchesr
  image_repl_strr  r)  s   `                @rM   ro   zStep3VLProcessor.__call__  s    <D$%% 	6D>F&$'' 	XFv;;!L....KK#'#5#5f#=#= !%'"%'"!#!#K<P F F8&8 ''(L(LgY(W(WXXX{##a''*11<<[SW<XX   ""3{#3#3444151N1Ns;'');2 2. #)).999")).999%1*112DEEE !&	*: ; ;* L & W5:Y?U5V5V12% 5:\*%*6 6 612      D ....K '
 
 
 	
rL   rp   )NNN)rD   rE   rF   r   r*   rj   propertyr   r   r  r   r   ImageWithPatchesr  r   rI   rJ   r	  r   strr  r  r  r$  r   r   ro   __classcell__r   s   @rM   r   r   C  s       ? ? !? 
	? ? ? ? ? ?4 < < < < X<
c 
s 
s 
 
 
 
D$5 $?O:P     
 
U[!
 
 
el		
 
 
 
 !J- 
sDI~		   ,
9
9 
sDI~	
9 
9 
9 
9HH H !J-	H
 
sDI~	H H H H  # d3i TW    " (,9=26	C
 C
DIo$C
 d5;//$6C
 j(4/	C

 
C
 C
 C
 C
 C
 C
 C
 C
rL   r   c                       e Zd ZdefdZdeeedz  f         fdZdefdZ	dedeeef         deeef         fdZ
defd	Zd
edefdZdS )Step3VLProcessingInfort   c                 j    t          |                                 |                                           S rv   )r   get_hf_configget_tokenizerr   s    rM   get_hf_processorz&Step3VLProcessingInfo.get_hf_processor  s2        
 
 	
rL   Nc                 
    dd iS Nrm   rK   r   s    rM   get_supported_mm_limitsz-Step3VLProcessingInfo.get_supported_mm_limits
  s    rL   c                     |                                  }|                    |                                 j        |                                 j                  S rv   )rB  r  !get_image_size_with_most_featuresr   r   )rd   hf_processors     rM   get_max_image_tokensz*Step3VLProcessingInfo.get_max_image_tokens  sO    ,,..002244:2244;
 
 	
rL   seq_len	mm_countsc                 .    d|                                  iS rD  )rI  )rd   rJ  rK  s      rM   get_mm_max_tokens_per_itemz0Step3VLProcessingInfo.get_mm_max_tokens_per_item  s    
 224455rL   c                 "    t          dd          S )NrT   )r!   r   s    rM   rG  z7Step3VLProcessingInfo.get_image_size_with_most_features  s    t$$$rL   mm_datac                      t          |          dk    sd|vrt          d          |d         }t          |t          t          f          s|g}t           fd|D                       S )Nr.   rm   z5mm_data could only contain one key 'image' for steo1oc              3   |   K   | ]6}                                                     |j        |j                  V  7d S rv   )rB  r  r   r   )r   r   rd   s     rM   	<genexpr>z:Step3VLProcessingInfo.get_num_mm_tokens.<locals>.<genexpr>&  sW       
 
 !!##88CJOO
 
 
 
 
 
rL   )r   r   r+  r   r   sum)rd   rO  
image_datas   `  rM   get_num_mm_tokensz'Step3VLProcessingInfo.get_num_mm_tokens  s    w<<1w 6 6TUUUW%
*tUm44 	&$J 
 
 
 
!
 
 
 
 
 	
rL   )rD   rE   rF   r   rB  r   r:  r   rE  rI  rM  r!   rG  r   rU  rK   rL   rM   r>  r>    s        
"2 
 
 
 
cDj)A    
c 
 
 
 
66 38$6 
c		6 6 6 6%9 % % % %
); 
 
 
 
 
 
 
rL   r>  c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	Step3VLDummyInputsBuilderrK  rt   c                 8    |                     dd          }d|z  S )Nrm   r   r   )get)rd   rK  r  s      rM   get_dummy_textz(Step3VLDummyInputsBuilder.get_dummy_text-  s     ]]7A..
j((rL   NrJ  
mm_optionsc                     | j                                         \  }}|                    dd          }|r|                    d          nd }d|                     ||||          iS )Nrm   r   )r   r   r  	overrides)inforG  rY  _get_dummy_images)rd   rJ  rK  r[  target_widthtarget_heightr  image_overridess           rM   get_dummy_mm_dataz+Step3VLDummyInputsBuilder.get_dummy_mm_data1  s|     '+i&Q&Q&S&S#m]]7A..
5?I*..111T T++"$%)	 ,  
 	
rL   rv   )
rD   rE   rF   r   r:  r   rZ  r   r   rc  rK   rL   rM   rW  rW  ,  s        )S(9 )c ) ) ) ) =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rL   rW  c            	       v    e Zd Zdedeeef         dedee	         fdZ
dedeeef         deeef         fdZdS )	Step3VLMultiModalProcessormm_itemshf_processor_mm_kwargsout_mm_kwargsrt   c                      | j         j        di |j        dt          ffd}t	          dg|          gS )Nitem_idxc                 @   d         |          }t          |d         j                  }|dk    r=|d         j        }                    d||                                          d         }n                    ddd           d         }t	          j        |          S )Nrm   rC   r   r
  r.   )seqembed_token_id)r   rR   r  tolistr(   select_token_id)rj  out_itemrC   r
  r  rH  image_placeholder_token_idrh  s        rM   get_replacement_step1ozNStep3VLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_step1oP  s    $W-h7Hh}5:;;KQ%-.B%C%H"!-!F!F{$6$=$=$?$?" "" ".!F!Fq!T!R!RST!U&6"9   rL   rm   )modalityr   replacementrK   )r^  rB  r   r   r&   )rd   rf  rg  rh  rr  rH  rq  s      ` @@rM   _get_prompt_updatesz.Step3VLMultiModalProcessor._get_prompt_updatesG  s     2ty1KK4JKK%1%@"	S 	 	 	 	 	 	 	 	   232  
 	
rL   	hf_inputsc           	         |                     dt          j        d                    }t          t	          j        d          t	          j        d|          t	          j        d          t	          j        d|                    S )NrC   r   rm   )r9   rB   rC   r
  )rY  rI   emptydictr   batchedflat_from_sizes)rd   rv  rg  rC   s       rM   _get_mm_fields_configz0Step3VLMultiModalProcessor._get_mm_fields_configg  s~    
  mmM5;q>>BB.6w??4D    .5g>>4D   	
 	
 	
 		
rL   N)rD   rE   rF   r"   r   r:  r
   r    r   r'   ru  r   objectr   r|  rK   rL   rM   re  re  F  s        
%
 !(S 1
 -	

 
,	
 
 
 
@

 !(V 4
 
++	,	
 
 
 
 
 
rL   re  c                 &   |                      d          }|                     d          }|d d         |dd          }}t          t          j        |j        d         dz
                      }t          t          j        |                    }| j        }||k    r|                    d|||                              dddd          	                                }|
                    t          j                  }t          j        |||fddd	          
                    |          }|                    dddd          }|                    ||z  |          }t          j        ||gd
          }	|	                    d||z  dz   |          }	|	S | S )Nr   r   r.   r<   r   rX   TF)re   r   r[   align_cornersdim)re   squeezer   mathr   shaper   viewpermute
contiguoustorI   float32Finterpolater,  )
abs_postgt_sizer  abs_pos_new	cls_tokenold_pos_embedsrc_sizer   new_pos_embedvision_pos_embeds
             rM   get_abs_posr  z  s   
,,r

C//!$$K*2A2ABB}I49[.q1A56677H49X&&''HME8q(Hc::WQ1a  Z\\ 	
 &((77H%
 
 
 "U)) 	 &--aAq99%**8h+>DD 9i%?QGGG+00Hx4G!4KSQQrL   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )Step3VisionEmbeddingsr   c                    t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        d| j                            | _        t          |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        d| _        t          j                            | j        dz   | j                  | _        |                     dt          j        | j        dz                                 d          d	           d S )
Nr.   T)in_channelsout_channelskernel_sizestridebiasr   r|   position_ids)r.   r   F)
persistent)r   rj   r   hidden_size	embed_dimr   rg   nn	ParameterrI   randnclass_embeddingr   num_channelspatch_embeddingrC   pad_tp_size	Embeddingposition_embeddingregister_bufferarangeexpand)rd   r   r   s     rM   rj   zStep3VisionEmbeddings.__init__  s(   + + +!|EK4>,J,JKK*+? 
  
  
 !Ot>1D"'("4"4q $.#
 #
 	L)A-..55g>> 	 	
 	
 	
 	
 	
rL   r9   rt   c                 *   |j         d         }|                     |          }|                    d                              dd          }| j                            |dd          }t          j        ||gd          }|t          | 	                    | j
                  |                    d                    z   }t          j        |d d dd d f                             d                              d| j        dz
  d          |gd          }|S )Nr   r   r.   r   r  )r  r  flatten	transposer  r  rI   r,  r  r  r  re   rl   repeatr  )rd   r9   
batch_sizepatch_embedsclass_embeds
embeddingss         rM   forwardzStep3VisionEmbeddings.forward  s'   !'*
++
 
 $++A..88A>> +22:q"EEYl;CCC
+##D$5668I8I!8L8L#
 #
 

 Y111a7#--a00774;Ka;OQRSS 
 
 

 rL   )	rD   rE   rF   r+   rj   rI   rJ   r  r;  r<  s   @rM   r  r    sk        
7 
 
 
 
 
 
:EL U\        rL   r  c                   T     e Zd ZdZ	 	 	 ddedz  dedef fdZd	ej	        fd
Z
 xZS )Step3VisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr  Fquant_configprefixuse_data_parallelc           	      H   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        dz  | _        |rdnt                      }| j        |z  dk    sJ | j        |z  | _
        | j
        | j        z  | _        t          | j        | j        | j        d|| d|          | _        t          | j        | j        d|| d|          | _        t!          | j
        | j        | j                  | _        d S )Ng      r.   r   Tz	.qkv_projr  r  r  
disable_tpz	.out_proj)r   rj   r   r  r  num_attention_headstotal_num_headshead_dimscaler   	num_headsq_sizer   qkv_projr   out_projr   attn)rd   r   r  r  r  tp_sizer   s         rM   rj   zStep3VisionAttention.__init__  s4    	+%9$*>>]D(
(T!!.R.T.T#g-2222-8nt}4)NM %'''(
 
 
 *NN%'''(
 
 
 't~t}djQQ			rL   hidden_statesc                     |                                 \  }}}|                     |          \  }}|                    dd          \  }}}|                     |||          }	|                     |	          \  }	}|	S )z#Input shape: Batch x Time x Channelr<   r   )chunksr  )re   r  chunkr  r  )
rd   r  bsztgt_len_qkvqkvattn_outputs
             rM   r  zStep3VisionAttention.forward  s    
 (,,..Wa }--Q))1")--1a ii1a(({33QrL   Nr  F)rD   rE   rF   rG   r   r:  r   rj   rI   rJ   r  r;  r<  s   @rM   r  r    s        GG
 37"'(R (R )4/(R 	(R
  (R (R (R (R (R (RT|       rL   r  c                   ^     e Zd Z	 	 	 ddedz  dedef fdZdej        d	ej        fd
Z	 xZ
S )Step3VisionMLPNr  Fr  r  r  c                 $   t                                                       || _        t          |j                  | _        t          |j        |j        d|| d|          | _	        t          |j        |j        d|| d|          | _        d S )NTz.fc1r  z.fc2)r   rj   r   r   
hidden_actactivation_fnr   r  intermediate_sizefc1r   fc2rd   r   r  r  r  r   s        rM   rj   zStep3VisionMLP.__init__  s     	'(9::'$%???(
 
 
 %$%???(
 
 
rL   r  rt   c                     |                      |          \  }}|                     |          }|                     |          \  }}|S rv   )r  r  r  )rd   r  r  s      rM   r  zStep3VisionMLP.forward+  sG    88M22q**=9988M22qrL   r  )rD   rE   rF   r   r:  r   rj   rI   rJ   r  r;  r<  s   @rM   r  r    s         37"'
 
 )4/
 	

  
 
 
 
 
 
6U\ el        rL   r  c            	       b     e Zd Z	 	 	 ddededz  dedef fdZd	ej	        d
ej
        fdZ xZS )Step3VisionEncoderLayerNr  Fr   r  r  r  c                    t                                                       || _        |j        | _        t          ||| d| j                  | _        t          j        | j        |j	                  | _
        t          ||| d| j                  | _        t          j        | j        |j	                  | _        d S )Nz
.self_attnr  r  )epsz.mlp)r   rj   r  r  r  r  	self_attnr  	LayerNormlayer_norm_epslayer_norm1r  mlplayer_norm2r  s        rM   rj   z Step3VisionEncoderLayer.__init__3  s     	!2+-((("4	
 
 
 <F<QRRR!???"4	
 
 
 <F<QRRRrL   r  rt   c                     ||                      |                     |                    z   }||                     |                     |                    z   }|S rv   )r  r  r  r  rd   r  s     rM   r  zStep3VisionEncoderLayer.forwardL  sR     &(8(89V9V(W(WW%(8(8-9P9P(Q(QQrL   r  )rD   rE   rF   r+   r   r:  r   rj   rI   rJ   FloatTensorr  r;  r<  s   @rM   r  r  2  s         37"'S S(S )4/S 	S
  S S S S S S2| 
	       rL   r  c            	       D     e Zd Z	 	 	 d
dededz  dedef fdZd	 Z xZ	S )Step3VisionEncoderNr  Fr   r  r  r  c                      t                                                        _        | _        t	          j         fdt          j                  D                        _        d S )Nc           	      J    g | ]}t           d | j                   S )z.layers.r  )r  r  )r   r   r   r  r  rd   s     rM   r   z/Step3VisionEncoder.__init__.<locals>.<listcomp>a  sW         ( $11a11&*&<	    rL   )	r   rj   r   r  r  
ModuleListr   num_hidden_layerslayersr  s   ```` rM   rj   zStep3VisionEncoder.__init__V  s     	!2m       v788  

 

rL   c                 4    |}| j         D ]} ||          }|S rv   )r  )rd   inputs_embedsr  encoder_layers       rM   r  zStep3VisionEncoder.forwardl  s1     &![ 	9 	9M)M-88MMrL   r  )
rD   rE   rF   r+   r   r:  r   rj   r  r;  r<  s   @rM   r  r  U  s         37"'
 
(
 )4/
 	

  
 
 
 
 
 
,      rL   r  c            	       T     e Zd Z	 	 	 ddededz  dedef fdZd	ej	        fd
Z
 xZS )Step3VisionTransformerNr  Fr   r  r  r  c                     t                                                       || _        || _        |j        | _        t          |          | _        t          ||| d| j                  | _        d S )Nz.transformerr  )	r   rj   r   r  r   r  r  r  transformerr  s        rM   rj   zStep3VisionTransformer.__init__w  sx     	!2 +/77-***"4	
 
 
rL   r9   c                     |                      |          }| j        rt          || j                  }n|                     |          }|S N)r  )r  r  r6   r  )rd   r9   r  s      rM   r  zStep3VisionTransformer.forward  sQ     55! 	J7tGWXXMM ,,=,IIMrL   r  )rD   rE   rF   r+   r   r:  r   rj   rI   rJ   r  r;  r<  s   @rM   r  r  v  s         37"'
 
(
 )4/
 	

  
 
 
 
 
 
&	l	 	 	 	 	 	 	 	rL   r  )r^  dummy_inputsc                   P    e Zd Z eddd          ZdZedededed	z  fd
            Z	ddde
dedd	f fdZed             Zed             Zdeded	z  fdZdej        dej        fdZdej        dej        fdZdedeej        df         fdZdefdZ	 d*d	dddej        ded	z  dej        d	z  d edej        f
 fd!Z	 	 d+dej        d"ej        d#ed	z  d$ej        d	z  dedej        ez  fd%Zd&ej        dej        d	z  fd'Zd(eeeej        f                  fd)Z  xZ!S ),Step3VLForConditionalGenerationzlanguage_model.model.zlanguage_model.lm_head.)zmodel.zlm_head.)orig_to_new_prefixTrs  r   rt   Nc                 N    |                     d          rdS t          d          )Nrm   r   z Only image modality is supported)
startswithr   )clsrs  r   s      rM   get_placeholder_strz3Step3VLForConditionalGeneration.get_placeholder_str  s,    w'' 	 <;<<<rL   r  )r  vllm_configr  c          	      h   t                                                       |j        j        }|j        j        }|| _        || _        |j        dk    | _        |                     |d          5  t          |j
        d t          |d          | j                  | _        t          |j
        j        |j
        j        d|j                  | _        t          |j
        j        |j
        j        dz  ddd	          | _        t'          j        |j
        j        dz  |j        |j        
          | _        d d d            n# 1 swxY w Y   |                     |          5  t1          ||j        t          |d                    | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )NrR   rm   vision_modelr  r   )r  r  r<   r.   )r  r  padding)r  language_model)r  	hf_configr  )r   rj   model_configr
  multimodal_configr   mm_encoder_tp_moder  _mark_tower_modelr  r   r5   r  r   r  output_hidden_sizeunderstand_projector_stridevit_downsamplervit_downsampler2r  Linearprojector_biasvit_large_projector_mark_language_modelr4   text_configr	  make_empty_intermediate_tensors)rd   r  r  r   r  r   s        rM   rj   z(Step3VLForConditionalGeneration.__init__  sA   )3'4F!2!2!E!O##K99 	 	 6$#FN;;"&"8	! ! !D $/$0$79	$ $ $D  %0$7$7!;% % %D! (*y$7!;"*( ( (D$)	 	 	 	 	 	 	 	 	 	 	 	 	 	 	4 &&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   .CD??EE+FFFc                 N    t          |                                           j        S rv   )next
parametersdevicer   s    rM   r  z&Step3VLForConditionalGeneration.device  s    DOO%%&&--rL   c                 N    t          |                                           j        S rv   )r  r  r   r   s    rM   r   z%Step3VLForConditionalGeneration.dtype  s    DOO%%&&,,rL   kwargsc                    |                     dd           }|                     dd           }|                     dd           }|                     dd           }||d S |Gt          d|                    | j                  ||                    | j                  nd |          S |)t	          d|                    | j                            S t          d          )Nr9   rB   rC   rP   )r:   r9   rB   rC   )r:   rP   z This line should be unreachable.)r   r8   r  r   rO   AssertionError)rd   r  r9   rB   rC   rP   s         rM   _parse_and_validate_image_inputz?Step3VLForConditionalGeneration._parse_and_validate_image_input  s     zz.$77#ZZ(<dCCjj55zz.$77L$84#*#)__TZ88%1 $6#8#8#D#D#D'    #.#)__TZ88   
 ?@@@rL   image_featuresc                    |j         d d         \  }}t          t          |                    }|                    ddd                              |d||          }|                     |          }|                     |          }|                    d          }|                    ||d                              ddd          }|                     |          }|S )Nr   r   r.   r   )	r  r   r   r  r  r  r  re   r  )rd   r"  BPHWn_dims         rM   _process_image_featuresz7Step3VLForConditionalGeneration._process_image_features   s    #BQB'1a\\'//1a88==aRLL--n==..~>>##A&&',,Qr::BB1aKK11.AArL   input_tensorc                 D    |                      |          d d dd f         S )Nr|   )r  )rd   r)  s     rM   _get_vision_model_outputz8Step3VLForConditionalGeneration._get_vision_model_output  s'      ..qqq!""u55rL   image_input.c                    |d         dk    r	|d         }nH|                      |d                   }|d         |                      |d                   nd }|d         }|                     |          }||                     |          nd }g }d}t          |          D ]\  }}g }	|dk    rA||||z            }
|	                    |
                    d|
j        d                              |	                    ||                             d|j        d                              ||z  }|                    t          |	          dk    rt          j        |	          n|	d                    |S )	Nr:   rP   r9   rB   rC   r   r   r.   )	r+  r(  r   r   r  r  r   rI   r,  )rd   r,  r"  patch_image_featuresrC   merged_image_featurescur_patch_idxr   	num_patchcur_featurepatch_slices              rM   _process_image_inputz4Step3VLForConditionalGeneration._process_image_input  s    v.00(8NN!::;~;VWWN 34@ --k:N.OPPP !
 &m4K55nEE $/ (()=>>> 	 !#%k22 	 	LAyK1}}2!MI$== "";#3#3B8I"8M#N#NOOO~a055b.:Nr:RSSTTTY&M!((*-k*:*:Q*>*>	+&&&KPQN    %$rL   c                 R     | j         di |}|g S |                     |          }|S )NrK   )r!  r4  )rd   r  r,  vision_embeddingss       rM   embed_multimodalz0Step3VLForConditionalGeneration.embed_multimodal3  s?    :d:DDVDDI 55kBB  rL   )is_multimodalhandle_oov_mm_token	input_idsmultimodal_embeddingsr8  r9  c                    ||!t                                          |          S t                                          ||||          S )N)r;  r8  r9  )r   embed_input_ids)rd   r:  r;  r8  r9  r   s        rM   r=  z/Step3VLForConditionalGeneration.embed_input_ids:  sU     !(M,A77**9555ww&&"7' 3	 ' 
 
 	
rL   	positionsintermediate_tensorsr  c                 @    |d }|                      ||||          }|S r  )r	  )rd   r:  r>  r?  r  r  r  s          rM   r  z'Step3VLForConditionalGeneration.forwardN  s;      + M++y"6m , 
 
 rL   r  c                 6    | j                             |          S rv   )r	  compute_logitsr  s     rM   rB  z.Step3VLForConditionalGeneration.compute_logits_  s     "11-@@@rL   weightsc                 X    t          |           }|                    || j                  S )N)mapper)r2   load_weightshf_to_vllm_mapper)rd   rC  loaders      rM   rF  z,Step3VLForConditionalGeneration.load_weightse  s+    "4((""743I"JJJrL   rv   )NN)"rD   rE   rF   r3   rG  supports_encoder_tp_dataclassmethodr:  r   r  r   rj   r8  r  r   r}  rS   r!  rI   rJ   r(  r+  r   r4  r/   r7  r   r=  r)   r  rB  r   rF  r;  r<  s   @rM   r  r    s#        &-1
 
    $=3 =3 =3: = = = [= BD -
 -
 -
z -
3 -
 -
 -
 -
 -
 -
 -
^ . . X. - - X-AA	d	"A A A A:	el 	u| 	 	 	 	6U\ 6el 6 6 6 6#%-#%	u|S 	!#% #% #% #%J!,@ ! ! ! ! >B

 .2$(
 
 
<
  4d:

 |d*
 "
 

 
 
 
 
 
0 <@-1 < < 2D8	
 |d*  
+	+   "A|A 
	A A A AKHU33D-E$F K K K K K K K KrL   r  )or  collections.abcr   r   r   	itertoolsr   r   r   typingr	   r
   r   r   numpyr   rI   torch.nnr  torch.nn.functional
functionalr  PILr   torchvisionr   !torchvision.transforms.functionalr   transformersr   r   r   vllm.configr   vllm.config.multimodalr   vllm.distributedr   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr   r   r    vllm.multimodal.parser!   r"   vllm.multimodal.processingr#   r$   r%   r&   r'   r(   vllm.sequencer)   vllm.tokenizersr*   vllm.transformers_utils.configsr+   vllm.utils.tensor_schemar,   r-   
interfacesr/   r0   r1   utilsr2   r3   r4   r5   visionr6   r8   rO   rS   rH   r   r   r   r9  rU   rW   rr   r   r>  rW  re  r  Moduler  r  r  r  r  r  register_processorr  rK   rL   rM   <module>rk     s    7 7 7 7 7 7 7 7 7 7               5 5 5 5 5 5 5 5 5 5 5 5                           " " " " " " ? ? ? ? ? ? C C C C C C C C C C " " " " " " 3 3 3 3 3 3 A A A A A A < < < < < < X X X X X X 7 7 7 7 7 7         
 G F F F F F / / / / / /         
 A @ @ @ @ @ @ @                . - - - - - ) ) ) ) ) ) D D D D D D > > > > > > > > L L L L L L L L L L            0 / / / / /< < < < <l < < <(	? 	? 	? 	? 	?, 	? 	? 	? !8:U U I U U Ud5;&7cT9IIJ    *H *H *H *H *H *H *H *HZp p p p p p p pf}
 }
 }
 }
 }
 }
 }
 }
@&
 &
 &
 &
 &
. &
 &
 &
R
 
 
 
 
 67L M 
 
 
41
 1
 1
 1
 1
!89N!O 1
 1
 1
h  @2 2 2 2 2BI 2 2 2j= = = = =29 = = =@         RY      F         bi      F       B    RY   @ ('	*  
LK LK LK LK LKbi1CZ LK LK 
LK LK LKrL   