
    .`i~W                     J   d dl mZ d dlmZ d dlZd dlmZ d dlmZ	 d dl
mZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3m4Z4m5Z5 dZ6dZ7dZ8de9fdZ:de;de<e=e9e9f                  de9de9de9de=e9e9f         fd Z>d!e9d"e9de<e=e9e9f                  de9d#e?de=e9e9e9f         fd$Z@d%ej        de<e=e9e9f                  de9d#e?de<ej                 f
d&ZAd'e9d(e9de<e=e9e9f                  fd)ZBd%ej        de9d'e9d(e9d#e?dejC        fd*ZD G d+ d,e          ZE G d- d.e          ZF e"jG        eeF         eFeeF         /           G d0 d1ejH        e0e1e/                      ZIdS )2    )ABC)IterableN)Image)	AutoModelPretrainedConfig)BaseImageProcessorFast)
VllmConfig)QuantizationConfig)	AWQConfig)BaseInternVLDummyInputsBuilderBaseInternVLMultiModalProcessorBaseInternVLProcessingInfoInternVLImageEmbeddingInputsInternVLImageInputsInternVLImagePixelInputsInternVLProcessor)MultiModelKeys)MULTIMODAL_REGISTRYconvert_image_mode)PromptUpdateDetails)IntermediateTensors)TokenizerLike)"cached_image_processor_from_config   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixz<img>z</img><image>
input_sizec                     t          j        t          j        d           t          j        | | ft           j        j                  t          j                    g          S )Nc                 "    t          | d          S )NRGBr   )imgs    z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/nemotron_vl.py<lambda>z!build_transform.<locals>.<lambda>8   s    !3C!?!?     )interpolation)TComposeLambdaResizeInterpolationModeBICUBICToTensorr$   s    r)   build_transformr5   5   s[    9H??@@HZ(8K8S   JLL	
  r+   aspect_ratiotarget_ratioswidthheight
image_sizereturnc                    t          d          }d}||z  }|D ]M\  }}	||	z  }
t          ||	z  |z  |z  |z  d          }t          |
| z  | |
z            }||z  }||k    r|}||	f}N|S )Nz-inf)r   r   g333333?)floatmin)r6   r7   r8   r9   r:   best_factor
best_ratioarearwrhtarget_aspect_ratiosize_factorratio_closenessfactors                 r)   find_closest_aspect_ratiorH   B   s     --KJ6>D 
" 
"B 2g27Z/*<DcJJ,.?R0R
 
 .K KbJr+   
orig_widthorig_heightuse_thumbnailc                     | |z  }t          ||| ||          }||d         z  }||d         z  }|d         |d         z  }	|r|	dk    r|	dz  }	|	||fS )N)r8   r9   r:   r   r   )rH   )
rI   rJ   r7   r:   rK   r6   rD   target_widthtarget_heightblockss
             r)   calculate_nemotron_vl_targetsrP   ]   s     +L 4    3A 66L!4Q!77M #&9!&<<F  1!<..r+   imagec                    | j         \  }}t          ||||d          \  }}}|                     ||f          }	g }
t          |          D ]\}|||z  z  |z  |||z  z  |z  |||z  z  dz   |z  |||z  z  dz   |z  f}|	                    |          }|
                    |           ]t          |
          |k    sJ |r?t          |
          dk    r,|                     ||f          }|
                    |           |
S )NF)rI   rJ   r7   r:   rK   r   )sizerP   resizerangecropappendlen)rQ   r7   r:   rK   rI   rJ   rO   rM   rN   resized_imgprocessed_imagesibox	split_imgthumbnail_imgs                  r)   dynamic_preprocess_nemotron_vlr_   |   sZ    $jJ +H#+ + +'FL- ,,m<==K6]] 	+ 	+,*,-;<:-.*<<:-.!3zALJ./14
B	
  $$S))		****  F**** /-..!33j*%=>>...r+   min_nummax_numc                 j      fdt           dz             D             }t          |d           S )Nc                     h | ]E}t          d |d z             D ]/}t          d |d z             D ]}||z  cxk    rk    n n||f0FS )r   )rU   ).0nr[   jra   r`   s       r)   	<setcomp>z0get_nemotron_vl_target_ratios.<locals>.<setcomp>   s       q!a%  q!a%	  a!e&&&&w&&&&&	 
A '&&&&r+   r   c                 $    | d         | d         z  S )Nr   r    xs    r)   r*   z/get_nemotron_vl_target_ratios.<locals>.<lambda>   s    qtad{ r+   )key)rU   sorted)r`   ra   r7   s   `` r)   get_nemotron_vl_target_ratiosrn      sX        w!,,  M -%:%:;;;;r+   c                    t          ||          }t          |          t          | |||          }t          j        fd|D                       }|S )Nr4   )r7   r:   rK   c                 &    g | ]} |          S ri   ri   )rd   rQ   	transforms     r)   
<listcomp>z5image_to_pixel_values_nemotron_vl.<locals>.<listcomp>   s#    EEEU		% 0 0EEEr+   )rn   r5   r_   torchstack)	rQ   r$   r`   ra   rK   r7   imagespixel_valuesrq   s	           @r)   !image_to_pixel_values_nemotron_vlrw      sp     2'7CCM:666I+##	  F ;EEEEfEEEFFLr+   c                      e Zd Zdddddededededz  dedz  dedz  d	dfd
Ze	d	efd            Z
deded	efdZ	 	 	 ddeej                 dedz  dedz  dedz  d	eej                 f
dZ	 	 	 ddee         deej                 dedz  dedz  dedz  d	eee         eeej        f         f         fdZdededz  d	ee         fdZdS )NemotronVLProcessorNmin_dynamic_patchmax_dynamic_patchdynamic_image_sizeconfig	tokenizerimage_processorr{   r|   r}   r;   c                   t          j        |            || _        || _        || _        |j        }|j        }|d}t          |t                    sJ || j        j	        }t          |t                    sJ |d}t          |t                    sJ t          ||z  dz  |j        dz  z            | _        || _        || _        || _        || _        | j        j        | _        d S )Nr   T   )r   __init__r~   r   r   force_image_size
patch_size
isinstanceintmax_num_tilesbooldownsample_rationum_image_tokenr:   r{   r|   r}   rK   )	selfr~   r   r   r{   r|   r}   r:   r   s	            r)   r   zNemotronVLProcessor.__init__   s
    	T". 1
 +
$ !+S11111$ $ 4 B+S11111%!%,d33333":%!+v/F/IJ 
  
 %!2!2"4#'#7#Er+   c                 J    | j                                         t                   S N)r   	get_vocabIMG_CONTEXTr   s    r)   image_token_idz"NemotronVLProcessor.image_token_id   s    ~''))+66r+   image_widthimage_heightc                    |                      d          }t          ||| j        || j                  \  }}}|| j        z  S )NF)rK   )rI   rJ   r:   r7   rK   )resolve_target_ratiosrP   r:   rK   r   )r   r   r   r7   num_patches_s         r)   get_num_image_tokensz(NemotronVLProcessor.get_num_image_tokens   sb     22 3 
 
 :"$',
 
 
Q T111r+   ru   c                 `                           |||d          \   fd|D             S )NF)r{   r|   r}   rK   c           	      L    g | ] }t          |j        j                   !S ))r$   r`   ra   rK   )rw   r:   rK   )rd   rQ   ra   r`   r   s     r)   rr   zCNemotronVLProcessor._images_to_pixel_values_lst.<locals>.<listcomp>  sN     	
 	
 	
  .?"0  	
 	
 	
r+   )resolve_min_max_num)r   ru   r{   r|   r}   ra   r`   s   `    @@r)   _images_to_pixel_values_lstz/NemotronVLProcessor._images_to_pixel_values_lst	  si      33//1	 4 
 
	
 	
 	
 	
 	
 	
  	
 	
 	
 		
r+   textc                    t          |          dk    ri }n|                     ||||          }t          j        |          t          j        d |D                       d}|D ]X}|j        d         }	|	| j        z  }
|                     |
|	          }|j        	                    dd          fd|D             }Yd |D             }||fS )	Nr   rz   c                 ,    g | ]}t          |          S ri   )rX   )rd   items     r)   rr   z9NemotronVLProcessor._preprocess_image.<locals>.<listcomp>6  s    <<<4SYY<<<r+   )pixel_values_flatimage_num_patchesr#   <NVL_IMG_CONTEXT>c                 >    g | ]}|                     d d          S )r#   r   )replace)rd   tNVL_IMAGE_CONTEXTs     r)   rr   z9NemotronVLProcessor._preprocess_image.<locals>.<listcomp>A  s*    QQQq		)->BBQQQr+   c                 D    g | ]}|                     d t                    S )r   )r   r   )rd   r   s     r)   rr   z9NemotronVLProcessor._preprocess_image.<locals>.<listcomp>B  s'    NNNAAII1;??NNNr+   )
rX   r   rs   cattensorshaper   get_image_replfullr   )r   r   ru   r{   r|   r}   image_inputspixel_values_lstrv   r   feature_size
image_replr   s               @r)   _preprocess_imagez%NemotronVLProcessor._preprocess_image"  s    v;;!LL#??"3"3#5	  @     &+Y/?%@%@%*\<<+;<<<& & L !1 R R*03*T-AA!00{KK
$.O$;$;2% %! RQQQDQQQNNNNND\!!r+   r   r   c                 n    t           |z  }t          |z   t          z   }t          j        |t                     S r   )r   	IMG_STARTIMG_ENDr   select_text)r   r   r   repl_features	repl_fulls        r)   r   z"NemotronVLProcessor.get_image_replE  s0    
 $l2-7	".y+FFFr+   )NNN)__name__
__module____qualname__r   r   r   r   r   r   propertyr   r   listr   rs   Tensorr   strtupledictr   r   r   ri   r+   r)   ry   ry      s4        )-(,*.$F $F $F $F !$F 0	$F :$F :$F !4K$F 
$F $F $F $FL 7 7 7 7 X72 2 	2
 
2 2 2 2. )-(,*.
 
U[!
 :
 :	

 !4K
 
el	
 
 
 
: )-(,*.!" !"3i!" U[!!" :	!"
 :!" !4K!" 
tCy$sEL011	2!" !" !" !"FGG 4ZG 
S	!	G G G G G Gr+   ry   c                   .    e Zd ZdZdedefdZdefdZdS )NemotronVLProcessingInfoz'Processing info for Nemotron VL models.kwargsr;   c                      | j         j        t          f|                                 |                                 |                                 d|S )N)r~   r   r   )ctxinit_processorry   get_hf_configget_tokenizerget_image_processorr   r   s     r)   get_hf_processorz)NemotronVLProcessingInfo.get_hf_processorS  s_    &tx&
%%''((** 4466	
 

 
 
 	
r+   c                 0    t          | j        j        fi |S r   )r   r   model_configr   s     r)   r   z,NemotronVLProcessingInfo.get_image_processor\  s)    1H!
 

 
 	
r+   N)r   r   r   __doc__objectry   r   r   ri   r+   r)   r   r   P  sX        11
 
4G 
 
 
 

F 
 
 
 
 
 
r+   r   )infodummy_inputsc                   d    e Zd Zededededz  fd            Zddded	eddf fd
Zde	de
fdZde	de
dz  d	efdZde	dej        fdZd,dZdej        dej        fdZdededz  fdZdedeej        df         fdZdedefdZdej        ddfdZdedefdZ	 d-ddddej        dedz  d ej        dz  d!edej        f
 fd"Z	 	 d.dej        d#ej        d$e dz  d%ej        dz  dede fd&Z!d'ej        dej        dz  fd(Z"d)e#eeej        f                  de$e         fd*Z%de&fd+Z' xZ(S )/LlamaNemotronVLChatModelmodalityr[   r;   Nc                 N    |                     d          rdS t          d          )NrQ   r#   z Only image modality is supported)
startswith
ValueError)clsr   r[   s      r)   get_placeholder_strz,LlamaNemotronVLChatModel.get_placeholder_stri  s,    w'' 	9;<<<r+    )prefixvllm_configr   c          	      f   t                                                       |j        j        }|j        }|j        j        }|| _        || _        |                     ||           |j        p|j	        j
        }|j	        j        }|| _        t          ||z  dz  |j        dz  z            | _        |j        | _        |j        | _        |                     |d          5  |                     ||t%          |d                    | _        |                     |          | _        d d d            n# 1 swxY w Y   |                     |          5  t/          ||j        t%          |d                    | _        d d d            n# 1 swxY w Y   d | _        d | _        | j        j        | _        d S )Nr   rQ   vision_model)quant_configr   language_model)r   	hf_configr   )superr   r   r   r   multimodal_configr~   _patch_quant_configr   vision_configr:   r   r   r   r   
ps_version_mark_tower_model_init_vision_modelr"   r   
_init_mlp1mlp1_mark_language_modelr!   text_configr   img_context_token_idvisual_token_maskmake_empty_intermediate_tensors)	r   r   r   r~   r   r   r:   r   	__class__s	           r)   r   z!LlamaNemotronVLChatModel.__init__p  s7   )3"/'4F!2  666,O0D0O
)4
$":%!+v/F/IJ 
  
 !' 7 +##K99 	0 	0 $ 7 7)#FN;; !8 ! !D
 //DI	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 	0 &&{33 	 	"<' ,#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 %)!!%? 	,,,s%   AD00D47D4+FFFr~   r   c                     t          |t                    r=|j        }t          |dd           }|j        s | |j                            d           d S d S d S d S )Nquantization_configr   )r   r   r   getattrmodules_to_not_convertrW   )r   r~   r   r   llm_quant_configs        r)   r   z,LlamaNemotronVLChatModel._patch_quant_config  s    
 lI.. 	K ,K&{4I4PP 7 K ,3::>JJJJJ	K 	KK K,,r+   c                8    t          j        |j        d          S )NT)trust_remote_code)r   from_configr   )r   r~   r   r   s       r)   r   z+LlamaNemotronVLChatModel._init_vision_model  s     $V%9TRRRRr+   c           
      j   |j         }|j        }|j        j        }t	          j        t	          j        |t          d| j        z            dz  z  d          t	          j	        |t          d| j        z            dz  z  |d          t	          j
                    t	          j	        ||                    S )Nr   r   T)bias)vit_hidden_sizeprojector_hidden_sizer   hidden_sizenn
Sequential	LayerNormr   r   LinearGELU)r   r~   r   vision_projection_hidden_sizellm_hidden_sizes        r)   r   z#LlamaNemotronVLChatModel._init_mlp1  s     0(.(D% ,8}L#a$*?&?"@"@A"EED   I#a$*?&?"@"@A"EE-  
 GIII3_EE
 
 	
r+         ?c           
         |                                 \  }}}}|                    ||t          ||z            t          ||z                      }|                    dddd                                          }|                    |t          ||z            t          ||z            t          |||z  z                      }| j        dk    rn*|                    dddd                                          }|S )Nr   r   r      v1)rS   viewr   permute
contiguousr   )r   rk   scale_factorre   whcs          r)   pixel_shufflez&LlamaNemotronVLChatModel.pixel_shuffle  s    VVXX
1aFF1aQ-..A4D0E0EFFIIaAq!!,,..FFL !!L !!\L0122	
 
 ?d""		!Q1%%0022Ar+   rv   c                    |                      |          j        }|                    t          j                  }t          |j        d         dz            x}}|                    |j        d         ||d          }|                     || j	                  }|                    |j        d         d|j        d                   }| 
                    |          }|S )Nrj   )dtyper   r  r   )r  )r   featurestors   bfloat16r   r   reshaper  r   r   )r   rv   
vit_embedsr  r  s        r)   extract_featurez(LlamaNemotronVLChatModel.extract_feature  s    &&&66?
]]]88
J$Q'3.///A''
(8(;Q2FF
''
AV'WW
''
(8(;RAQRTAUVV
YYz**
r+   r   c                    |                     dd           }|                     dd           }|                     dd           }||d S |t          d|          S |d         }t          |t          j                  r8|                                                                                                }t          |t                    sJ || _	        |*t          d||| j        j        | j        j        d          S t          d	          )
Nr   r   image_embeds)typedatar   rv   )r  r  )r  r   r   resolve_bindingsz This line should be unreachable.)popr   r   rs   r   flattenuniquer   r   r   r   r~   r   AssertionError)r   r   r   r   r  r   s         r)   _parse_and_validate_image_inputz8LlamaNemotronVLChatModel._parse_and_validate_image_input  s.    #JJ':DAA"JJ':DAAzz.$77$)=4#/#!   
   01nel33 	F+3355<<>>CCEEN.#.....$2!(+#"3-55" "	    ?@@@r+   image_input.c                    |d         dk    r|d         S |                      |d                   }|d         }t          |          dk    r&|                    d| j        j        j                  fS |j        d         |                    d| j        j        j                  }fd|D             }|                    |          S )	Nr  r  r  r   r   r   r  c                     g | ]}|z  S ri   ri   )rd   r   r   s     r)   rr   zALlamaNemotronVLChatModel._process_image_input.<locals>.<listcomp>  s+     
 
 
+6K,&
 
 
r+   )r  rX   r  r~   r   r   r   split)r   r&  r  r   image_feature_sizesr   s        @r)   _process_image_inputz-LlamaNemotronVLChatModel._process_image_input  s     v.00v&&++K8K,LMM!-0 {q   %%b$+*A*MNNPP $)!,#((T[-D-PQQ
 
 
 
:E
 
 
 !!"5666r+   c                 D    i }|D ]}|dv rd|vr | j         di ||d<   |S )N)r   r  ru   ri   )r%  )r   r   
modalities	input_keys       r)   %_parse_and_validate_multimodal_inputsz>LlamaNemotronVLChatModel._parse_and_validate_multimodal_inputs  sU    
   	V 	VIBBBJ..'Kt'K'U'Uf'U'U
8$r+   	input_idsc                     d | _         d S r   )r   )r   r0  s     r)   _set_visual_token_maskz/LlamaNemotronVLChatModel._set_visual_token_mask*  s    !%r+   c                      | j         di |}|sg S d}|D ]7}|dk    r/|d         }|                     |          }|t          |          z  }8|S )Nri   ru   )r/  r+  r   )r   r   r-  multimodal_embeddingsr   r&  image_embeddingss          r)   embed_multimodalz)LlamaNemotronVLChatModel.embed_multimodal-  s    ?T?II&II
 	I ;= # 	A 	AH8##(2#'#<#<[#I#I %/?)@)@@%$$r+   F)is_multimodalhandle_oov_mm_tokenr4  r7  r8  c                    |(t          |          dk    r|                     |           ||!t                                          |          S t                                          ||||          S )Nr   )r4  r7  r8  )rX   r2  r   embed_input_ids)r   r0  r4  r7  r8  r   s        r)   r:  z(LlamaNemotronVLChatModel.embed_input_ids@  s     !,5J1K1Ka1O1O''	222 !(M,A77**9555ww&&"7' 3	 ' 
 
 	
r+   	positionsintermediate_tensorsinputs_embedsc                     |d }||||d}| j         #|                    d| j         i           d | _          | j        j        di |}|S )N)r0  r;  r<  r=  r   ri   )r   updater   model)r   r0  r;  r<  r=  r   forward_kwargshidden_statess           r)   forwardz LlamaNemotronVLChatModel.forwardV  sz      + M #"$8*	
 
 !-!!#68N"OPPP%)D"1+1CCNCCr+   rB  c                 6    | j                             |          S r   )r   compute_logits)r   rB  s     r)   rE  z'LlamaNemotronVLChatModel.compute_logitsp  s     "11-@@@r+   weightsc                 V    ddg}t          | |          }|                    |          S )N	norm_meannorm_std)skip_substrs)r    load_weights)r   rF  rJ  loaders       r)   rK  z%LlamaNemotronVLChatModel.load_weightsv  s4     $Z0"4lCCC""7+++r+   c                 0    t          j        ddd          S )z<
        Get the module prefix in multimodal models
        r   r   r   )r   	connectortower_model)r   from_string_fieldr   s    r)   get_mm_mappingz'LlamaNemotronVLChatModel.get_mm_mapping}  s'     /+&
 
 
 	
r+   )r  r   )NN))r   r   r   classmethodr   r   r   r	   r   r   r
   r   r   r   Moduler   r  rs   r   r  r   r   r%  r   r+  r   r/  r2  r   r6  r   r:  r   rC  rE  r   setrK  r   rQ  __classcell__)r   s   @r)   r   r   c  s        =3 =3 =3: = = = [= BD (
 (
 (
z (
3 (
 (
 (
 (
 (
 (
 (
TK&K6HK K K KS S )4/S
 S S S S
!1 
bi 
 
 
 
$   $
EL 
U\ 
 
 
 
"A"A	t	#"A "A "A "AH7(7 
u|S 	!7 7 7 70f     & & & & & &% %4H % % % %, >B

 .2$)
 
 
<
  4d:

 |d*
 "
 

 
 
 
 
 
4 <@-1 < < 2D8	
 |d*  
   4A|A 
	A A A A,HU33D-E$F ,3s8 , , , ,
 
 
 
 
 
 
 
 
r+   r   )Jabcr   collections.abcr   rs   torch.nnr   torchvision.transforms
transformsr-   PILr   transformersr   r   (transformers.image_processing_utils_fastr   vllm.configr	   'vllm.model_executor.layers.quantizationr
   +vllm.model_executor.layers.quantization.awqr   #vllm.model_executor.models.internvlr   r   r   r   r   r   r   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.imager   vllm.multimodal.processingr   vllm.sequencer   vllm.tokenizersr   !vllm.transformers_utils.processorr   
interfacesr   r   r   r   utilsr    r!   r"   r   r   r   r   r5   r=   r   r   rH   r   rP   r_   rn   r   rw   ry   r   register_processorrS  r   ri   r+   r)   <module>rl     s         $ $ $ $ $ $        " " " " " "       4 4 4 4 4 4 4 4 K K K K K K " " " " " " F F F F F F A A A A A A                  E D D D D D / / / / / / 4 4 4 4 4 4 : : : : : : - - - - - - ) ) ) ) ) ) P P P P P P            O N N N N N N N N N	
	 	 	 	 	c3h( 	
   38_   6// / c3h(	/
 / / 3S=/ / / />&;& c3h(& 	&
 & 
%+& & & &R<<< 
%S/< < < <;  	
   \   .CG CG CG CG CG+ CG CG CGL
 
 
 
 
9 
 
 
& ('#$<=	!/0HI  
]
 ]
 ]
 ]
 ]
ry*<j, ]
 ]
 
]
 ]
 ]
r+   