
    .`iE                     x   d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZmZ d d	lmZmZmZmZ d d
lmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' de(de(de)de)de*e(e(f         f
dZ+de(de(de*e(e(f         dz  de,e*e(e(f                  fdZ-de(de(de,e*e(e(f                  de(de)de*e(e(e(e*e(e(f         f         fdZ.dej        de,e*e(e(f                  de(de)de*e,ej                 e*e(e(f         f         f
dZ/dej        de(de(de(de)de*e(e(f         dz  de*ej0        e*e(e(f         f         fd Z1dej        de(de(de(de)d!e)dej0        fd"Z2 G d# d$e#          Z3 G d% d&e"          Z4 G d' d(e!e4                   Z5 ej6        e5e4e )           G d* d+e$                      Z7dS ),    )MappingSequenceN)Image)PretrainedConfig)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalKwargsItemsMultiModalUUIDDict)ImageEmbeddingItemsImageProcessorItemsMultiModalDataItems)MultiModalProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)TokenizerLike   )InternVisionModel)IMG_CONTEXTIMG_END	IMG_STARTBaseInternVLDummyInputsBuilderBaseInternVLMultiModalProcessorBaseInternVLProcessingInfoBaseInternVLProcessorInternVLChatModelbuild_transformfind_closest_aspect_ratioget_internvl_target_ratiosmin_dynamic_patchmax_dynamic_patchdynamic_image_sizeuse_thumbnailreturnc                 <    |r| nd} |r|nd}|r|dk    r|dz  }| |fS )Nr    r    r!   r"   r#   s       t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/h2ovl.pyresolve_h2ovl_min_max_numr)   1   sQ     .@F))Q-?F))Q *a//Q///    min_nummax_numprior_aspect_ratioc                H    t          | |          }fd|D             }|S )Nc                 h    g | ].}d          |d          z  d k    rd         |d         z  d k    ,|/S )r   r   r&   ).0ratior-   s     r(   
<listcomp>z+get_h2ovl_target_ratios.<locals>.<listcomp>K   sW     
 
 
!!$uQx/144"1%a0A55  655r*   )r   )r+   r,   r-   target_ratioss     ` r(   get_h2ovl_target_ratiosr4   A   sL     /w@@M %
 
 
 
&
 
 
 r*   
orig_widthorig_heightr3   
image_sizec                     | |z  }t          ||| ||          }||d         z  }||d         z  }|d         |d         z  }	|r|	dk    r|	dz  }	|	|||fS )N)widthheightr7   r   r   )r   )
r5   r6   r3   r7   r#   aspect_ratiotarget_aspect_ratiotarget_widthtarget_heightblockss
             r(   calculate_h2ovl_targetsr@   V   s     +L 4    3A 66L!4Q!77M #&9!&<<F  1!<0CCCr*   imagec                   | j         \  }}t          ||||d          \  }}}}	|                     ||f          }
g }t          |          D ]\}|||z  z  |z  |||z  z  |z  |||z  z  dz   |z  |||z  z  dz   |z  f}|
                    |          }|                    |           ]t          |          |k    sJ |r?t          |          dk    r,|                     ||f          }|                    |           ||	fS )NF)r5   r6   r3   r7   r#   r   )sizer@   resizerangecropappendlen)rA   r3   r7   r#   r5   r6   r?   r=   r>   r<   resized_imgprocessed_imagesibox	split_imgthumbnail_imgs                   r(   dynamic_preprocess_h2ovlrO   w   sg    $jJ 	 #	 	 	 ,,m<==K6]] 	+ 	+,*,-;<:-.*<<:-.!3zALJ./14
B	
  $$S))		****  F**** /-..!33j*%=>>...000r*   
input_sizec                   
 t          |||          }t          |          
t          | |||          \  }}t          j        
fd|D                       }	|	|fS )Nr-   )rP   )r7   r#   r3   c                 &    g | ]} |          S r&   r&   )r0   rA   	transforms     r(   r2   z%_preprocess_image.<locals>.<listcomp>   s#    EEEU		% 0 0EEEr*   )r4   r   rO   torchstack)rA   rP   r+   r,   r#   r-   r3   imagesr<   pixel_valuesrT   s             @r(   _preprocess_imagerY      s     ,-  M  :666I":##	# # #F ;EEEEfEEEFFL,,,r*   use_msacc                    |rat          | |d|dd           \  }}t          | |d|d|          \  }}	t          j        |d d         |d d         |dd          gd          }
nt          | ||||d           \  }
}	|
S )Nr   T)rP   r+   r,   r#   r-      r   )rY   rU   cat)rA   rP   r+   r,   r#   rZ   pixel_values1aspect_ratio1pixel_values2_rX   s              r(   image_to_pixel_values_h2ovlrc      s       
'8!#(
 (
 (
$} -!,
 
 
q y3B3ss!3]2335GH!
 

 ,!'#
 
 
a r*   c                       e Zd Zddddddedededz  dedz  dedz  dedz  d	df fd
Zed	efd            Z	dededz  d	e
e         fdZddddddedz  dedz  dedz  dedz  d	eeef         f
dZddddddddedz  dedz  dedz  dedz  deeef         dz  dedz  d	eeeef                  fdZdddedededz  d	efdZ	 	 	 ddeej                 dedz  dedz  dedz  d	eej                 f
dZ xZS )H2OVLProcessorN)r    r!   r"   rZ   config	tokenizerr    r!   r"   rZ   r$   c                    t                                          |||||           ||j        }t          |t                    sJ || _        d S )N)r    r!   r"   )super__init__rZ   
isinstancebool)selfrf   rg   r    r!   r"   rZ   	__class__s          r(   rj   zH2OVLProcessor.__init__   se     	//1 	 	
 	
 	
 H(D))))) r*   c                 J    | j                                         t                   S N)rg   	get_vocabr   )rm   s    r(   image_token_idzH2OVLProcessor.image_token_id	  s    ~''))+66r*   feature_sizenum_patchesc                 n    t           |z  }t          |z   t          z   }t          j        |t                     S rp   )r   r   r   r   select_text)rm   rs   rt   repl_features	repl_fulls        r(   get_image_replzH2OVLProcessor.get_image_repl  s0    
 $l2-7	".y+FFFr*   r'   r#   c                    || j         n|}|| j        n|}|| j        n|}|| j        n|}t	          ||||          S )Nr'   )r    r!   r"   r#   r)   )rm   r    r!   r"   r#   s        r(   resolve_min_max_numz"H2OVLProcessor.resolve_min_max_num  s     '8&?D""EV 	 '8&?D""EV 	
 ") ### 	
 /<.C**(//1'	
 
 
 	
r*   )r    r!   r"   r#   r-   override_min_numr-   r|   c                f    |                      ||||          \  }}||}t          |||          S )Nr'   rR   )r{   r4   )	rm   r    r!   r"   r#   r-   r|   r+   r,   s	            r(   resolve_target_ratiosz$H2OVLProcessor.resolve_target_ratios3  s\      33//1'	 4 
 
 '&G&1
 
 
 	
r*   rZ   image_widthimage_heightc                   || j         n|}| j        }|rt|                     dd          }t          ||| j        |d          \  }}}}|                     d|d          }	t          ||| j        |	d          \  }
}}}||
z   dz
  }n4|                     d          }t          ||| j        ||          \  }}}}|| j        z  S )	NFr   )r#   r|   T)r5   r6   r7   r3   r#   r\   )r#   r-   r|   )r#   )rZ   r#   r~   r@   r7   num_image_token)rm   r   r   rZ   r#   target_ratios_1num_patches_1rb   aspect_ratio_1target_ratios_2num_patches_2rt   r3   s                r(   get_num_image_tokensz#H2OVLProcessor.get_num_image_tokensL  s:    %-$44==(* %	"88#!" 9  O 3J&(?-"3 3 3/M1a #88##1!" 9  O
 &=&(?-"& & &"M1a (-7!;KK 66# 7  M $;&(?++$ $ $ KAq T111r*   rW   c                      t          |          dk    r j        nd                     |||d          \   fd|D             S )Nr   Fr'   c           
      N    g | ]!}t          |j        j                   "S ))rP   r+   r,   r#   rZ   )rc   r7   r#   )r0   rA   r,   r+   rm   rZ   s     r(   r2   z>H2OVLProcessor._images_to_pixel_values_lst.<locals>.<listcomp>  sQ     

 

 

  (?"0!  

 

 

r*   )rH   rZ   r{   )rm   rW   r    r!   r"   r,   r+   rZ   s   `    @@@r(   _images_to_pixel_values_lstz*H2OVLProcessor._images_to_pixel_values_lst  s     %(KK1$4$44==%33//1	 4 
 


 

 

 

 

 

 

  

 

 

 
	
r*   )NNN)__name__
__module____qualname__r   r   intrl   rj   propertyrr   r   strry   tupler{   listr~   r   r   rU   Tensorr   __classcell__rn   s   @r(   re   re      s        )-(,*. $! ! ! ! !!
 :! :! !4K! +! 
! ! ! ! ! !0 7 7 7 7 X7GG 4ZG 
S	!	G G G G )-(,*.%)
 
 
 :
 :	

 !4K
 d{
 
sCx
 
 
 
> )-(,*.%)59'+
 
 
 :
 :	

 !4K
 d{
 "#s(Od2
 *
 
eCHo	
 
 
 
< !%22 22 22 22 	22
 +22 
22 22 22 22n )-(,*.
 
U[!
 :
 :	

 !4K
 
el	
 
 
 
 
 
 
 
r*   re   c                   L    e Zd ZdedefdZdddedededz  d	edz  def
d
ZdS )H2OVLProcessingInfokwargsr$   c                      | j         j        t          f|                                 |                                 d|S )N)rf   rg   )ctxinit_processorre   get_hf_configget_tokenizer)rm   r   s     r(   get_hf_processorz$H2OVLProcessingInfo.get_hf_processor  sP    &tx&
%%''((**
 
 	
 
 	
r*   Nr   r   r   	processorrZ   c                ^    ||                                  }|                    |||          S )N)r   r   rZ   )r   r   )rm   r   r   r   rZ   s        r(   r   z(H2OVLProcessingInfo.get_num_image_tokens  sA     --//I--#% . 
 
 	
r*   )	r   r   r   objectre   r   r   rl   r   r&   r*   r(   r   r     s        
 
N 
 
 
 
 !%
 
 
 
 	

 "D(
 +
 

 
 
 
 
 
r*   r   c                        e Zd Zdedeeef         dedee	         fdZ
	 ddeee         z  dedeeef         d	eeef         d
edz  deee         eef         f fdZ xZS )H2OVLMultiModalProcessormm_itemshf_processor_mm_kwargsout_mm_kwargsr$   c                 x      j         j        di ||                                }d|v r9|d         t          t          j                  sJ                                 n d|v rd gt          |d                   z  ng t                    dt          f fd}t          dd|          gS )	Nimage_num_patchesimage_embedsitem_idxc                                         dt          t          f          }t          |t                    r|                    |           }nE|                    |           }	j                            |j        |j	        dk    rd nd          }|          }|t          |t                    sJ                     ||          S )NrA   r   F)r   r   r   rZ   )	get_itemsr   r   rk   get_feature_sizeget_image_sizeinfor   r9   r:   r   ry   )
r   rW   rs   r7   rt   hf_processorr   r   
num_imagesrm   s
        r(   get_replacement_internvlzNH2OVLMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_internvl  s    ''-/BC F &"566 	%66x@@#228<<
#y== * 0!+!2*%/1__TT%	  >     ,H5K&!+s33333..|[IIIr*   rA   z<image>)modalitytargetreplacementr&   )
r   r   get_datark   rU   r   tolistrH   r   r   )	rm   r   r   r   out_mm_datar   r   r   r   s	   ``    @@@r(   _get_prompt_updatesz,H2OVLMultiModalProcessor._get_prompt_updates  s#    2ty1KK4JKK#,,..+-- +,? @/>>>>> 1 8 8 : :{** "&[-H)I)I I "*++
	Js 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J.   4  
 	
r*   Npromptmm_data_itemstokenization_kwargsmm_uuidsc                     |                     dd          dk    r|                     |||||          S t                                          |||||          S )NrA   F)strictr   )r   r   r   r   r   )	get_count_apply_hf_processorri   _cached_apply_hf_processor)rm   r   r   r   r   r   rn   s         r(   r   z3H2OVLMultiModalProcessor._cached_apply_hf_processor  s     ""75"99A==+++'=$7! ,    ww11'#9 3 2 
 
 	
r*   rp   )r   r   r   r   r   r   r   r	   r   r   r   r   r   r
   r   r   rl   r   r   r   s   @r(   r   r     s        2
%2
 !(V 42
 -	2

 
,	2
 2
 2
 2
t /3
 
d3i
 +
 !(V 4	

 %S&[1
 %t+
 
tCy2D8	9
 
 
 
 
 
 
 
 
 
r*   r   )r   dummy_inputsc                   ,    e Zd Zdededz  dedefdZdS )H2OVLChatModelrf   quant_configNis_monoprefixc                    |s=|j         }|dk     r|j        j        |z   dz   }n|dz   }t          |j        |||          S d}t	          |          )Nr   r   )r   num_hidden_layers_overrider   z(Monolith mode is not applicable to H2OVL)select_layervision_confignum_hidden_layersr   NotImplementedError)rm   rf   r   r   r   vision_feature_layerr   msgs           r(   _init_vision_modelz!H2OVLChatModel._init_vision_model  s      	+#)#6 #a''(:=QQTUU "! %91$<!$$)+<	    =C%c***r*   )r   r   r   r   r   rl   r   r   r&   r*   r(   r   r     sS        + + )4/+
 + + + + + + +r*   r   )8collections.abcr   r   rU   PILr   transformersr   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr	   r
   vllm.multimodal.parser   r   r   $vllm.multimodal.processing.processorr   r   r   r   vllm.tokenizersr   
intern_vitr   internvlr   r   r   r   r   r   r   r   r   r   r   r   rl   r   r)   r   r4   r@   rO   r   rY   rc   re   r   r   register_processorr   r&   r*   r(   <module>r      s   . - - - - - - -        ) ) ) ) ) ) F F F F F F / / / / / / L L L L L L L L         
            * ) ) ) ) ) ) ) ) ) ) )                         00 0 	0
 0 38_0 0 0 0  c3h$.	
 
%S/   *DD D c3h(	D
 D D 3S%S/)*D D D DB+1;+1 c3h(+1 	+1
 +1 4eCHo-.+1 +1 +1 +1\-;- - 	-
 - - c3h$.- 5<sCx()- - - -8,;, , 	,
 , , , \, , , ,^j
 j
 j
 j
 j
* j
 j
 j
Z
 
 
 
 
4 
 
 
6P
 P
 P
 P
 P
>?RS P
 P
 P
f ('	/  
+ + + + +& + + 
+ + +r*   