
    .`i9                        U d dl Z d dlmZmZmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZm Z  d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZA dZB G d de,          ZC G d de,          ZD G d de,          ZEeDeEz  ZFe
eGd<   eFeCz  ZHe
eGd<    G d d e9e	          ZI G d! d"e:          ZJ G d# d$e5eJ                   ZK G d% d&e8eJ                   ZL G d' d(ejM                  ZN ejO        eLeJeK)           G d* d+ejM        e2e3                      ZPdS ),    N)IterableMappingSequence)	AnnotatedFinalLiteralProtocol	TypeAlias)BatchFeatureLlavaOnevisionConfigLlavaOnevisionProcessor)get_anyres_image_grid_shapeunpad_image)
VllmConfig)BaseDummyOptions)
get_act_fn)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)	ImageSizeMultiModalDataItemsVideoEmbeddingItemsVideoProcessorItems)PromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )CLIPVisionModel)MultiModalEmbeddingsSupportsMultiModal
SupportsPP)LlavaDummyInputsBuilderinit_vision_tower_for_llava) BaseLlavaNextMultiModalProcessorLlavaNextLikeConfigLlavaNextProcessingInfo)SiglipVisionModel)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix   c            
           e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   eddddddh	          f         ed<   d
S )LlavaOnevisionVideoPixelInputsal  
    Dimensions:
        - bn: Batch size * number of videos
        - f: Number of frames
        - c: Number of channels (3)
        - h: Height
        - w: Width

        Note that `f` may be different for each batch, and 'num_frames'
        may be different for each video, in which case the data is passed as a
        list instead of a batched tensor.
    pixel_values_videostypebnf   hwdynamic_dimsN__name__
__module____qualname____doc__r3   r   __annotations__r   torchTensorlistr        ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/llava_onevision.pyr1   r1   7   s           ,AD''
(@@@"tEL))D#q#s#???	A     rE   r1   c            
           e Zd ZU dZdZed         ed<   eej	        e
ej	                 z   eddddddh	          f         ed<   eej	        d
z   edd          f         ed<   d
S )LlavaOnevisionImagePixelInputsaU  
    Dimensions:
        - bn: Batch size * number of images
        - np: Number of patches (1 + num_patches)
        - c: Number of channels (3)
        - h: Height
        - w: Width

        Note that `num_patches` may be different per batch and image,
        in which case the data is passed as a list instead of a batched tensor.
    pixel_valuesr3   r4   npr6   r7   r8   r9   N   image_sizesr;   rD   rE   rF   rH   rH   M   s         
 
 %3D'.
!222tEL))D$34&AAA	C   
 5<$.D!0D0DDEEEEEErE   rH   c                   h    e Zd ZU dZdZed         ed<   eej	         e
ddd          f         ed<   dS )	"LlavaOnevisionImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size
        - hs: Hidden size (must match language model backbone)
    image_embedsr3   r4   ifshsdataN)r<   r=   r>   r?   r3   r   r@   r   rA   rB   r   rD   rE   rF   rN   rN   d   si           %3D'.
!222
D%&&	(     rE   rN   LlavaOnevisionImageInputsLlavaOnevisionMultiInputsc                   &    e Zd ZU ee         ed<   dS )LlavaOnevisionLikeConfigvideo_token_indexN)r<   r=   r>   r   intr@   rD   rE   rF   rV   rV   }   s"         Sz!!!!!rE   rV   c                      e Zd ZdefdZdefdZdeee	dz  f         fdZ
de	de	d	e	d
e	de	dee	e	f         fdZdefdZde	de	de	fdZde	de	de	de	fdZde	de	fdZde	deee	f         de	fdZde	deee	f         de	fdZdS )LlavaOnevisionProcessingInforeturnc                 @    | j                             t                    S N)ctxget_hf_configr   selfs    rF   r_   z*LlavaOnevisionProcessingInfo.get_hf_config   s    x%%&:;;;rE   kwargsc                 2     | j         j        t          fi |S r]   )r^   get_hf_processorr   )ra   rb   s     rF   rd   z-LlavaOnevisionProcessingInfo.get_hf_processor   s     (tx()@KKFKKKrE   Nc                     d d dS )NimagevideorD   r`   s    rF   get_supported_mm_limitsz4LlavaOnevisionProcessingInfo.get_supported_mm_limits   s    ---rE   original_heightoriginal_widthnpatchesnum_patch_heightnum_patch_widthc                   ||z  }||z  }||z  }||z  }	||	k    r4t          t          |||z  z  d                    }
||
z
  dz  }|d|z  z
  }n3t          t          |||z  z  d                    }||z
  dz  }|d|z  z
  }||z  }|}t          j        ||z  d|dz  z  z            }|dk    r+t          ||z            }t          ||z            }||z  }|}||fS )N   rK   	   皙?)rX   roundmathsqrt)ra   rj   rk   rl   rm   rn   current_heightcurrent_widthaspect_ratiocurrent_aspect_ratio
new_heightpadding	new_widthunpadded_featuresnewline_featuresratioheight_factorwidth_factors                     rF   _get_num_unpadded_featuresz7LlavaOnevisionProcessingInfo._get_num_unpadded_features   sB    "$44 ?2%7,~=...o)GH!LL J &
2q8G+q7{;NNn(HI1MM I %y0Q6G)Q[9M*]:)	.=8A!OLMM3;;% 788M}566L - <,!#344rE   c                 $    t          dd          S )Ni  i  )widthheight)r   r`   s    rF   !get_image_size_with_most_featuresz>LlavaOnevisionProcessingInfo.get_image_size_with_most_features   s    tC0000rE   image_widthimage_heightc                    |                                  }t          |dd          }|                                 }|                                }t	          j        ||z            }||z  S )Nspatial_pool_striderK   )r_   getattrget_vision_encoder_infoget_patch_grid_lengthrt   ceil)ra   r   r   	hf_configr   vision_encoder_infopatch_grid_lengthpooled_grid_lengths           rF   _get_num_frame_tokensz2LlavaOnevisionProcessingInfo._get_num_frame_tokens   so     &&((	%i1FJJ"::<</EEGG!Y'8;N'NOO!$666rE   
num_framesc                @    |                      ||          }||z  dz   S )N)r   r   r    )r   )ra   r   r   r   num_frame_tokenss        rF   get_num_video_tokensz1LlavaOnevisionProcessingInfo.get_num_video_tokens   s7      55#% 6 
 

  *,q00rE   
max_tokensc                     |                                  \  }}d}	 |dz   }|                     |||          }||k    rn|}'|S )Nr   Tr    r   r   r   )r   r   )ra   r   target_widthtarget_heightr   next_num_framesnext_max_tokenss          rF   _get_max_video_framesz2LlavaOnevisionProcessingInfo._get_max_video_frames   sp    &*&L&L&N&N#m
	)(1nO"77(** 8  O ++(J	) rE   seq_len	mm_countsc                     |                     dd          }|                     |          }t          |t          |d          z  t                    }t          |d          S )Nrh   r   r    )getr   minmax_MAX_FRAMES_PER_VIDEO)ra   r   r   
max_videosmax_total_framesmax_frames_per_videos         rF   !get_num_frames_with_most_featuresz>LlavaOnevisionProcessingInfo.get_num_frames_with_most_features   sc    
 ]]7A..
55g>>"J 2 224I 
  
 '+++rE   c                     |                                  \  }}|                     |||                     ||                    S )Nr   )r   r   r   )ra   r   r   r   r   s        rF   get_max_video_tokensz1LlavaOnevisionProcessingInfo.get_max_video_tokens   sP    
 '+&L&L&N&N#m(($&==gyQQ ) 
 
 	
rE   )r<   r=   r>   rV   r_   objectrd   r   strrX   ri   tupler   r   r   r   r   r   r   r   rD   rE   rF   rZ   rZ      s       <7 < < < <L L L L L.cDj)A . . . .
&5 &5 	&5
 &5 &5 &5 
sCx&5 &5 &5 &5P19 1 1 1 17 7 	7
 
7 7 7 71 1 	1
 1 
1 1 1 1     (,, 38$, 
	, , , ,

 38$
 
	
 
 
 
 
 
rE   rZ   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	 LlavaOnevisionDummyInputsBuilderr   r[   c                     |                     dd          }|                     dd          }| j                                        }|j        }|j        }||z  ||z  z   S )Nrg   r   rh   )r   inford   image_tokenvideo_token)ra   r   
num_images
num_videos	processorr   r   s          rF   get_dummy_textz/LlavaOnevisionDummyInputsBuilder.get_dummy_text	  s`    ]]7A..
]]7A..
I..00	++Z'+
*BBBrE   Nr   
mm_optionsc                    |                     dd          }|                     dd          }| j                                        \  }}| j                            ||          }|r|                     d          nd }	|r|                     d          nd }
|                     ||||	          |                     |||||
          dS )Nrg   r   rh   )r   r   r   	overrides)r   r   r   r   r   rf   )r   r   r   r   _get_dummy_images_get_dummy_videos)ra   r   r   r   r   r   r   r   target_num_framesimage_overridesvideo_overridess              rF   get_dummy_mm_dataz2LlavaOnevisionDummyInputsBuilder.get_dummy_mm_data  s     ]]7A..
]]7A..
&*i&Q&Q&S&S#m IGGY
 
 6@I*..111T5?I*..111T ++"$%)	 ,   ++"$,%) ,  
 
 	
rE   r]   )
r<   r=   r>   r   r   rX   r   r   r   r   rD   rE   rF   r   r     s        CS(9 Cc C C C C =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
rE   r   c            
           e Zd Zdedeeef         deeef         fdZdedeeef         deeef         deeef         def
 fd	Z	d
ede
deeef         deeef         def
 fdZde
deeef         dedee         f fdZ xZS )!LlavaOnevisionMultiModalProcessor	hf_inputshf_processor_mm_kwargsr[   c                     t          t          j        d          t          j        d          t          j        d          t          j        d                    S )Nrg   rh   )rI   rL   rO   r2   )dictr   batched)ra   r   r   s      rF   _get_mm_fields_configz7LlavaOnevisionMultiModalProcessor._get_mm_fields_config8  sS    
 .6w??-5g>>.6w?? 5 =g F F	
 
 
 	
rE   promptmm_data	mm_kwargs
tok_kwargsc                 R   t          |          }|                    dg           }t          |t                    sJ |s%t	                                          ||||          S | j                                        }|j        }|j	        }t	                                          |i ||          }	|                    dg           }
t          |
t                    sJ |
rVt	                                          |t          |
          z  d|
i||          }d |                                D             }ni }g }|D ]J}t	                                          |d|i||          }|                    |d         d                    Kd|i}t          |	fi ||}t          |          S )Nvideos)r   r   r   r   imagesc                 "    i | ]\  }}|d v 	||S ))rI   rL   rD   ).0kvs      rF   
<dictcomp>zHLlavaOnevisionMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>o  s4       Aq777 1777rE   r2   r   )r   pop
isinstancerC   super_call_hf_processorr   rd   r   r   lenitemsappendr   )ra   r   r   r   r   r   r   r   r   text_outputsr   processor_outputsimage_outputsr2   rh   item_outputsvideo_outputscombined_outputs	__class__s                     rF   r   z4LlavaOnevisionMultiModalProcessor._call_hf_processorD  s    w--Xr**&$''''' 	77--#%	 .    I..00	++ww11!	 2 
 
 Xr**&$''''' 	 % : :"S[[0!6*#%	 !; ! ! -3355  MM M  	O 	OE 7755"!5)#%	 6  L  &&|4I'J1'MNNNN.0CD
 

 
 

 ,---rE   prompt_textmm_itemstokenization_kwargsc                     t                                          ||||          }|o|                    dd          dk    S )N)r   r   r   r   rh   F)strictr   )r   _hf_processor_applies_updates	get_count)ra   r   r   r   r   base_resultr   s         rF   r   z?LlavaOnevisionMultiModalProcessor._hf_processor_applies_updates  sT     gg;;##9 3	 < 
 
 Mx11'%1HHAMMrE   out_mm_kwargsc                      t                                          ||          } j                                        }|j        dt
          f fd}g |t          dg|          S )N)r   r   r   item_idxc                 H                        dt          t          f          }t          |t                    r|                    |           }nO|                    |           }j                            |j        |j	        |
                    |                     }g|z  S )Nrh   r   )	get_itemsr   r   r   get_feature_sizeget_frame_sizer   r   r   r   get_num_frames)r   r   num_video_tokens
image_sizer   ra   video_token_ids       rF   get_video_replacementzTLlavaOnevisionMultiModalProcessor._get_prompt_updates.<locals>.get_video_replacement  s    ''-/BC F &"566 #)#:#:8#D#D  #228<<
#'9#A#A * 0!+!2%44X>> $B $ $  ##&666rE   rh   )modalitytargetreplacement)r   _get_prompt_updatesr   r_   rW   rX   r   )	ra   r   r   r   image_replsr   r   r   r   s	   ``     @rF   r   z5LlavaOnevisionMultiModalProcessor._get_prompt_updates  s     gg11#9' 2 
 
 I++--	"4	7C 	7 	7 	7 	7 	7 	7 	7 	7"

 &'1  
 	
rE   )r<   r=   r>   r   r   r   r   r   r   r   r   boolr   r   r   r   r   __classcell__r   s   @rF   r   r   5  s       



 !(V 4

 
++	,	

 

 

 

E.E. f%E. 3;'	E.
 CK(E. 
E. E. E. E. E. E.NNN &N !(V 4	N
 %S&[1N 
N N N N N N '
%'
 !(V 4'
 -	'

 
,	'
 '
 '
 '
 '
 '
 '
 '
 '
 '
rE   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )!LlavaOnevisionMultiModalProjectorconfigc                 N   t                                                       t          j        |j        j        |j        j        |j                  | _        t          |j
                  | _        t          j        |j        j        |j        j        |j                  | _        d S )N)bias)r   __init__nnLinearvision_confighidden_sizetext_configmultimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)ra   r  r   s     rF   r  z*LlavaOnevisionMultiModalProjector.__init__  s    	 ,*1
 
 

 f9::	**1
 
 
rE   image_featuresr[   c                     |                      |          }|                     |          }|                     |          }|S r]   )r  r  r  )ra   r  hidden_statess      rF   forwardz)LlavaOnevisionMultiModalProjector.forward  s;    n55//m44rE   )	r<   r=   r>   r   r  rA   rB   r  r   r   s   @rF   r  r    sk        
3 
 
 
 
 
 
el u|        rE   r  )r   dummy_inputsc                       e Zd Z edddddd          Zeded	ed
edz  fd            Zddde	ded
df fdZ
ded
edz  fdZded
edz  fdZded
efdZdeez  dej        d
ej        fdZddddej        dej        ded
ej        fdZded
ej        eej                 z  fd Zd!ed
ej        eej                 z  fd"Zdeez  dej        d
ej        fd#Zdefd$Zd3d&ej        d'efd(Zded
efd)Z 	 	 d4d*ej        d+ej        d,e!dz  d-ej        dz  ded
ej        e!z  fd.Z"d/ej        d
ej        dz  fd0Z#d1e$e%eej        f                  d
e&e         fd2Z' xZ(S )5&LlavaOnevisionForConditionalGenerationzlanguage_model.model.zvision_tower.zmulti_modal_projector.image_newlinezlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zmodel.image_newlinezlm_head.)orig_to_new_prefixr   ir[   Nc                 |    |                     d          rdS |                     d          rdS t          d          )Nrg   z<image>rh   z<video>z)Only image or video modality is supported)
startswith
ValueError)clsr   r  s      rF   get_placeholder_strz:LlavaOnevisionForConditionalGeneration.get_placeholder_str  sG    w'' 	9w'' 	9DEEErE    )prefixvllm_configr!  c          
         t                                                       |j        j        }|j        }|j        j        }|| _        || _        |                     |ddh          5  t          ||dt          |d                    | _
        t          j        t          j        |j        j                            | _        t%          |          | _        d d d            n# 1 swxY w Y   |                     |          5  t+          ||j        t          |d                    | _        d d d            n# 1 swxY w Y   | j        j        j        | _        d S )Nrg   rh   Fvision_tower)quant_configrequire_post_normr!  language_model)r"  r   r!  )r   r  model_configr   r%  multimodal_configr  _mark_tower_modelr&   r.   r$  r  	ParameterrA   emptyr  r
  r  r  multi_modal_projector_mark_language_modelr-   r'  modelmake_empty_intermediate_tensors)ra   r"  r!  r  r%  r)  r   s         rF   r  z/LlavaOnevisionForConditionalGeneration.__init__  s   )3"/'4F!2##K'71CDD 	S 	S ;)"'#FN;;	! ! !D "$F.:;;" "D *K6)R)RD&	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S &&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 %E 	,,,s%   'A0C##C'*C'+D::D>D>rb   c                 B   |                     dd           }|                     dd           }|                     dd           }||d S |4t          d||| j        j        j        | j        j        j        d          S |t          d|          S t          d          )NrI   rL   rO   r7   r8   )r3   rI   rL   resolve_bindings)r3   rR   z This line should be unreachable.)r   rH   r  r	  r   rN   AssertionError)ra   rb   rI   rL   rO   s        rF   _parse_and_validate_image_inputzFLlavaOnevisionForConditionalGeneration._parse_and_validate_image_input  s     zz.$77jj55zz.$77L$84#1#)'2=2=" "	    #5#!   
 ?@@@rE   c                     |                     dd          }|dS t          d|| j        j        j        | j        j        j        d          S )z
        A legal video input should have the following dimensions:
        {
            "pixel_values_videos" :
                list[b, Tensor(nb_frames, nb_channels, height, width)]
        }
        r2   Nr2  )r3   r2   r3  )r   r1   r  r	  r   )ra   rb   r2   s      rF   _parse_and_validate_video_inputzFLlavaOnevisionForConditionalGeneration._parse_and_validate_video_input4  sa     %jj)>EE&4-& 3[.9[.9 
 
 
 	
rE   c                 t    i }|D ]2}|dv rd|vr | j         di ||d<   |dv rd|vr | j        di ||d<   3|S )N)rI   rO   rg   )r2   video_embedsrh   rD   )r5  r7  )ra   rb   mm_input_by_modality	input_keys       rF   %_parse_and_validate_multimodal_inputszLLlavaOnevisionForConditionalGeneration._parse_and_validate_multimodal_inputsK  s    !   	 	I===#7770T0T 1 11 1$W- DDD#7770T0T 1 11 1$W- $#rE   r$  rI   c                 0     ||| j         j                  S N)feature_select_strategy)r  vision_feature_select_strategy)ra   r$  rI   s      rF   _image_pixels_to_featuresz@LlavaOnevisionForConditionalGeneration._image_pixels_to_featuresb  s(     |$(K$N
 
 
 	
rE   anyres_max_9)r  vision_aspect_ratior   patch_embeddingsstrategyc          	         |dk    r|                     dd          S |                    d          r| j        j        j        | j        j        j        z  x}}|d         }||z  |j        d         k    rt          d          |j        d         dk    r_|dd          }	|                                \  }
}t          |
|f| j        j
        | j        j        j                  \  }}||z  }|	d |                             ||||d          }	d|v r|	                    ddd	dd
                                                               dd	                               d	d
          }	t          |	|
|f          }	t          |                    d                    }|	j        \  }}}t#          j        ||z  ||d	z  z  z            }|dk    rR|	d          }	t&          j                            |	t          ||z            t          ||z            gd          d         }	|Ut-          j        |	 |d d d d f         j        g |	j        d d         dR                      |	j                  fd          }	|	                     dd	                              dd          }	n?|	                    dd	dd
d                                                               dd
          }	t-          j        ||	fd          }nBd|v r<t-          j        || j        d                              |j                  fd          }n|}|S t          d|           )Nflatr   r    spatialz<The number of patches is not consistent with the image size.unpad   rK   r6   anyres_max_rr   bilinear)modedimz!Unexpected patch merge strategy: )flattenr  r  r	  r   
patch_sizeshaper  tolistr   image_grid_pinpointsviewpermute
contiguousr   rX   removeprefixrt   ru   r  
functionalinterpolaterA   catexpandtodevice	transposer  )ra   r   rD  r  rC  rE  r   r   base_patch_embedsother_patch_embedsorig_height
orig_widthrm   rn   num_patchesmax_num_patcheschannelscurr_height
curr_widthr   merged_patch_embeddingss                        rF   _merge_image_patch_embeddingszDLlavaOnevisionForConditionalGeneration._merge_image_patch_embeddingso  s    v#++Aq111y)) Z	+)4;,78FU
 !1 3~!2!8!;;; R    %a(1,,%5abb%9" +5*;*;*=*='Z 5P *-K4K-85 51 /
 /@ &8%E%J%J$ovub& &" h&&*221aAqAA# A A	 ' *5*[*,E* *& '*+88GG' 'O 9K8P5Hk: I#j0Ofai4OP E s{{-?-E*-/]-F-F. !566J%<O8P8PQ!+ .G . . 	.*
 %0-2Y 2!(aaatm <!'!K);)A#2#)F!KHI!K !K !K!#$6$=!>!>	 !#. . .* *<)C)CAq)I)I)S)S1* *&&
 +221aAqAA# A ' +0)&(:;+ + +'' h&&.3i- .t4778I8PQQ / / /++ /@+**GXGGHHHrE   inputsc                     |d         }t          |t          j                  rt|j        \  }}}}}|                    ||z  |||          }                      j        |          }	                     |	          }
 |
j        ||g|
j        dd          R  S d |D             }t          j        |          }                      j        |          }	 fdt          j	        |	|          D             S )NrI   r    c                 (    g | ]}|j         d          S )r   )rS  )r   r   s     rF   
<listcomp>zPLlavaOnevisionForConditionalGeneration._process_image_pixels.<locals>.<listcomp>  s     B B B B B BrE   c                 :    g | ]}                     |          S rD   )r-  )r   r  ra   s     rF   ro  zPLlavaOnevisionForConditionalGeneration._process_image_pixels.<locals>.<listcomp>  s7     
 
 
 &&~66
 
 
rE   )
r   rA   rB   rS  rV  rA  r$  r-  r\  split)ra   rl  rI   bre  cr7   r8   stacked_pixel_valuesstacked_image_featuresstacked_patch_embeddingsnum_patches_per_batchs   `           rF   _process_image_pixelsz<LlavaOnevisionForConditionalGeneration._process_image_pixels  sI    n-lEL11 	&2&8#A{Aq!#/#4#4Q_aA#N#N %)%C%C!#7& &" (,'A'A&( ($ 1+0;!9!?!C    !C B\ B B B$y66!%!?!?3"
 "

 
 
 
"'+&(=# #
 
 
 	
rE   image_inputc                 r    |d         dk    r|d         S                       |          }|                    d          Xt          |d                   } j        j        }|j        xt          j        fdt          |          D                        fdt          |          D             S )Nr3   rO   rR   rL   rI   c                     g | ]}gS rD   rD   )r   _default_heightdefault_widths     rF   ro  zOLlavaOnevisionForConditionalGeneration._process_image_input.<locals>.<listcomp>	  s    LLLQ.-0LLLrE   c                 ^    g | ])\  }}                     |         |j        d           *S )spatial_unpad)r  rE  )rk  r  )r   r  patch_features_batchrL   ra   s      rF   ro  zOLlavaOnevisionForConditionalGeneration._process_image_input.<locals>.<listcomp>  sX     
 
 
 (' ..A$"0(	 /  
 
 
rE   )
rx  r   r   r  r	  r   rA   	as_tensorrange	enumerate)ra   ry  rD  
batch_sizer	  r}  r~  rL   s   `    @@@rF   _process_image_inputz;LlavaOnevisionForConditionalGeneration._process_image_input  s     v.00v&&55kBB!oom44[899J K5M-:-EEN]/LLLLL%
:K:KLLL K
 
 
 
 
 ,55E+F+F
 
 
 	
rE   c                      ||| j         j                  }|                     |          }|                     |          }|S r>  )r  r@  r-  apply_pooling)ra   r$  rI   video_featuress       rF   _video_pixels_to_featuresz@LlavaOnevisionForConditionalGeneration._video_pixels_to_features  sS     &$(K$N
 
 
 33NCC++N;;rE   c                   
 |d         }t          |t          j                  r|j        \  }}}}}|                    ||z  |||          }|                     | j        |          

                    ||
j        d         z  d          
| j        d d d d f         	                    |dd          t          j
        
fd          S d |D             }	t          j
        |          }|                     | j        |          
| j        d d d d f         
fdt          |	t          j        
|	                    D             S )Nr2   r    rI  rO  c                 ,    g | ]}t          |          S rD   )r   )r   rh   s     rF   ro  zPLlavaOnevisionForConditionalGeneration._process_video_pixels.<locals>.<listcomp>9  s    AAA5CJJAAArE   c           
          g | ]@\  }}t          j        |                    d |j        d          z  d          fd           AS )r    rI  rO  )rA   r\  reshaperS  )r   	num_frameembedsembeddings_flatr  s      rF   ro  zPLlavaOnevisionForConditionalGeneration._process_video_pixels.<locals>.<listcomp>B  sm     
 
 
 "	6 INN1i/2G2J&JBOO!   
 
 
rE   )r   rA   rB   rS  rV  r  r$  r  r  r]  r\  ziprq  )ra   rl  video_pixelstotal_videosframesrs  r7   r8   video_pixels_flatframes_per_videor  r  s             @@rF   _process_video_pixelsz<LlavaOnevisionForConditionalGeneration._process_video_pixels%  s   34lEL11 	F,8,>)L&!Q , 1 1,2GAq Q Q"<<!#4 O .55f'<Q'?? O !.tT111}=DDb" M 9o}=1EEEEAALAAA!Il33880
 
 *4qqq=9
 
 
 
 
 &) O-=>>& &
 
 
 	
rE   rK   r  stridec                    | j         j        }|j        |j        z  x}}|j        \  }}}|                    |||d          }|                    dddd          }|j        dd          \  }}t          j        ||z            t          j        ||z            g}	t          j
                            ||	d          }
|
                    dddd          }
|
                    |d|          }
|
S )NrI  r   r6   r    rK   rM  )sizerN  )r  r	  r   rR  rS  rV  rW  rt   r   r  rZ  r[  )ra   r  r  r	  r   r   batch_framesr|  rP  scaled_shapeimage_features              rF   r  z4LlavaOnevisionForConditionalGeneration.apply_poolingP  s    1&1]5MMM-3a',,\65"MM'//1a;; ',QRR0	&6/22DIefn4M4MN11J 2 
 
 &--aAq99%**<SAArE   c                      | j         di |}|sg S d}|D ]d}||         }|dk    r'|                     |          }|t          |          z  }|dk    r'|                     |          }|t          |          z  }e|S )NrD   rg   rh   )r<  r  r   r  )ra   rb   r:  multimodal_embeddingsr   multimodal_inputimage_embeddingsvideo_embeddingss           rF   embed_multimodalz7LlavaOnevisionForConditionalGeneration.embed_multimodala  s    ItISSFSS# 	I
 ;= - 	A 	AH3H=7""#'#<#<=M#N#N %/?)@)@@%7""#'#=#=>N#O#O %/?)@)@@%$$rE   	input_ids	positionsintermediate_tensorsinputs_embedsc                 J    |d}| j                             ||||          }|S )zRun forward pass for LlaVA-Onevision.
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            pixel_values_videos: Pixels in each frames for each input videos.
        N)r  )r'  r/  )ra   r  r  r  r  rb   r  s          rF   r  z.LlavaOnevisionForConditionalGeneration.forwardx  s>      + M+11y"6m 2 
 
 rE   r  c                 6    | j                             |          S r]   )r'  compute_logits)ra   r  s     rF   r  z5LlavaOnevisionForConditionalGeneration.compute_logits  s     "11-@@@rE   weightsc                 X    t          |           }|                    || j                  S )N)mapper)r+   load_weightshf_to_vllm_mapper)ra   r  loaders      rF   r  z3LlavaOnevisionForConditionalGeneration.load_weights  s+    "4((""743I"JJJrE   )rK   )NN))r<   r=   r>   r,   r  classmethodr   rX   r  r   r  r   rS   r5  r1   r7  r   r<  r!   r*   rA   rB   rA  rk  rH   rC   rx  r  r  r  r  r"   r  r   r  r  r   r   setr  r   r   s   @rF   r  r    s        & &=#2,D#21
 
	 	 	 F3 F3 F3: F F F [F BD 
 
 
z 
3 
 
 
 
 
 
 
BAA	"T	)A A A A:

	'$	.
 
 
 
.$f $ $ $ $ $.

%(99

 l

 
	

 

 

 

$ *hI hI hILhI  ,hI hI 
hI hI hI hIT
.
 
U\*	*
 
 
 
B
.
 
U\*	*
 
 
 
8%(99 l 
	   )
,J )
 )
 )
 )
V EL #    "% %4H % % % %6 <@-1 < < 2D8	
 |d*  
+	+   .A|A 
	A A A AKHU33D-E$F K3s8 K K K K K K K KrE   r  )Qrt   collections.abcr   r   r   typingr   r   r   r	   r
   rA   torch.nnr  transformersr   r   r   <transformers.models.llava_onevision.modeling_llava_onevisionr   r   vllm.configr   vllm.config.multimodalr   %vllm.model_executor.layers.activationr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   r   vllm.multimodal.processingr   r   vllm.sequencer   vllm.utils.tensor_schemar   r   clipr!   
interfacesr"   r#   r$   llavar%   r&   
llava_nextr'   r(   r)   siglipr*   utilsr+   r,   r-   r.   r   r1   rH   rN   rS   r@   rT   rV   rZ   r   r   Moduler  register_processorr  rD   rE   rF   <module>r     s    7 7 7 7 7 7 7 7 7 7 A A A A A A A A A A A A A A        T T T T T T T T T T       
 # " " " " " 3 3 3 3 3 3 < < < < < < / / / / / /         
            G F F F F F F F - - - - - - > > > > > > > > ! ! ! ! ! ! L L L L L L L L L L G G G G G G G G         
 & % % % % %                 \   ,F F F F F\ F F F.       " #%GG 9   
  >> 9   
" " " " "2H " " "B
 B
 B
 B
 B
#: B
 B
 B
J,
 ,
 ,
 ,
 ,
89,
 ,
 ,
^M
 M
 M
 M
 M
$%ABM
 M
 M
`    	   . ('%	%1  
vK vK vK vK vKRY8JJ vK vK 
vK vK vKrE   