
    .`i                        d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d dlmZmZmZ d d	lmZmZmZ d
dlmZ d
dlmZmZmZmZm Z  dZ! G d de          Z" G d de          Z# G d dee#                   Z$ G d dee#                   Z% ej&        e%e#e$           G d de                       Z'dS )    )MappingSequenceN)PretrainedConfig)BaseDummyOptions)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalKwargsItems)ImageEmbeddingItemsImageProcessorItemsMultiModalDataItems)PromptReplacementPromptUpdatePromptUpdateDetails   )InternVisionModel)BaseInternVLDummyInputsBuilderBaseInternVLMultiModalProcessorBaseInternVLProcessingInfoBaseInternVLProcessorInternVLChatModelz<|vision_pad|>c                   P    e Zd Zedefd            Zdededz  dee         fdZdS )NVLMProcessorreturnc                 J    | j                                         t                   S N)	tokenizer	get_vocabIMG_PAD)selfs    u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/nvlm_d.pyimage_token_idzNVLMProcessor.image_token_id,   s    ~''))'22    feature_sizenum_patchesNc                    |t          d          d t          d|          D             }| j        r|dgz  }||z  d                    fd|D                       }d|z   dz   }t	          j        |t                    S )	Nz"Embedding inputs are not supportedc                     g | ]}d | d	S )z<tile_> ).0is     r!   
<listcomp>z0NVLMProcessor.get_image_repl.<locals>.<listcomp>8   s     MMM!MMMr#   r   z<tile_global_thumbnail> c              3   2   K   | ]}|t           z  z   V  d S r   )r   )r*   
identifiercontext_sizes     r!   	<genexpr>z/NVLMProcessor.get_image_repl.<locals>.<genexpr>=   s>       
 
4>J<//
 
 
 
 
 
r#   z<Image>z</Image>)NotImplementedErrorrangeuse_thumbnailjoinr   select_textr   )r    r$   r%   tile_pos_identifiersfeaturesreplr0   s         @r!   get_image_replzNVLMProcessor.get_image_repl0   s    
 %&JKKKMMuQ7L7LMMM 	@ %>$?? #{277 
 
 
 
BV
 
 
 
 
 8#j0".tW===r#   )	__name__
__module____qualname__propertyintr"   r   strr:   r)   r#   r!   r   r   +   sr        3 3 3 3 X3>> 4Z> 
S	!	> > > > > >r#   r   c                       e Zd ZdedefdZdS )NVLMProcessingInfokwargsr   c                      | j         j        t          f|                                 |                                 d|S )N)configr   )ctxinit_processorr   get_hf_configget_tokenizer)r    rC   s     r!   get_hf_processorz#NVLMProcessingInfo.get_hf_processorJ   sP    &tx&
%%''((**
 
 	
 
 	
r#   N)r;   r<   r=   objectr   rJ   r)   r#   r!   rB   rB   I   s6        
 
M 
 
 
 
 
 
r#   rB   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	NVLMDummyInputsBuilder	mm_countsr   c                 8    |                     dd          }d|z  S )Nimager   <image>
)get)r    rN   
num_imagess      r!   get_dummy_textz%NVLMDummyInputsBuilder.get_dummy_textT   s"    ]]7A..
 Z''r#   Nseq_len
mm_optionsc                     | j                                         \  }}|                    dd          }|r|                    d          nd }d|                     ||||          iS )NrP   r   )widthheightrS   	overrides)info!get_image_size_with_most_featuresrR   _get_dummy_images)r    rU   rN   rV   target_widthtarget_heightrS   image_overridess           r!   get_dummy_mm_dataz(NVLMDummyInputsBuilder.get_dummy_mm_data[   s|     '+i&Q&Q&S&S#m]]7A..
5?I*..111T T++"$%)	 ,  
 	
r#   r   )
r;   r<   r=   r   r@   r?   rT   r   r	   ra   r)   r#   r!   rM   rM   S   s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r#   rM   c            	       B    e Zd Zdedeeef         dedee	         fdZ
dS )NVLMMultiModalProcessormm_itemshf_processor_mm_kwargsout_mm_kwargsr   c                 V      j         j        di ||                                }d|v r9|d         t          t          j                  sJ                                 n d|v rd gt          |d                   z  ng dt          f fd}t          dd|          gS )	Nimage_num_patchesimage_embedsitem_idxc                                         dt          t          f          }t          |t                    r|                    |           }n<|                    |           }	j                            |j        |j	                  }|          }|t          |t                    sJ                     ||          }t          j        |j        dz   t                    S )NrP   )image_widthimage_height	processor
)	get_itemsr   r   
isinstanceget_feature_sizeget_image_sizer[   get_num_image_tokensrX   rY   r?   r:   r   r6   fullr   )
rj   imagesr$   
image_sizer%   r9   hf_processorrh   rd   r    s
         r!   get_replacement_nvlmzINVLMMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_nvlm   s    ''-/BC F &"566 %66x@@#228<<
#y== * 0!+!2*  >     ,H5K&!+s33333..|[IID&249t3CWMMMr#   rP   rQ   )modalitytargetreplacementr)   )
r[   rJ   get_datarq   torchTensortolistlenr?   r   )r    rd   re   rf   out_mm_datary   rx   rh   s   ``    @@r!   _get_prompt_updatesz+NVLMMultiModalProcessor._get_prompt_updatesq   s    2ty1KK4JKK#,,..+-- +,? @/>>>>> 1 8 8 : :{** "&[-H)I)I I "	N3 	N 	N 	N 	N 	N 	N 	N 	N 	N2  "0  
 	
r#   N)r;   r<   r=   r   r   r@   rK   r
   r   r   r   r)   r#   r!   rc   rc   p   s\        2
%2
 !(V 42
 -	2

 
,	2
 2
 2
 2
 2
 2
r#   rc   )r[   dummy_inputsc                   F    e Zd Zdedej        fdZdededz  dede	fdZ
dS )	NVLM_D_ModelrE   r   c           
      ~   |j         j        }|j        j        }|j        j        }t	          j        t	          j        |t          d| j        z            dz  z            t	          j	        |t          d| j        z            dz  z  |d          t	          j
                    t	          j	        ||d                    S )Nr      F)bias)vision_confighidden_sizetext_configintermediate_sizenn
Sequential	LayerNormr?   downsample_ratioLinearGELU)r    rE   vit_hidden_sizellm_intermediate_sizellm_hidden_sizes        r!   
_init_mlp1zNVLM_D_Model._init_mlp1   s     .: & 2 D ,8}L3q43H/H+I+IQ+NNOOI#a$*?&?"@"@A"EE%  
 GIII+_5III	
 	
 		
r#   quant_configNis_monoprefixc                    |s>|j         }|dk     r|j        j        |z   dz   }n|dz   }t          |j        ||d|          S d}t	          |          )Nr   r      )r   num_hidden_layers_overridenum_dummy_headsr   z)Monolith mode is not applicable to NVLM_D)select_layerr   num_hidden_layersr   r2   )r    rE   r   r   r   vision_feature_layerr   msgs           r!   _init_vision_modelzNVLM_D_Model._init_vision_model   s      	+#)#6 #a''(:=QQTUU "! %91$<! %$)+< !    >C%c***r#   )r;   r<   r=   r   r   Moduler   r   boolr@   r   r)   r#   r!   r   r      sx        
!1 
bi 
 
 
 
 + + )4/+
 + + + + + + +r#   r   )(collections.abcr   r   r~   torch.nnr   transformersr   vllm.config.multimodalr   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr	   r
   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   
intern_vitr   internvlr   r   r   r   r   r   r   rB   rM   rc   register_processorr   r)   r#   r!   <module>r      sz   . - - - - - - -        ) ) ) ) ) ) 3 3 3 3 3 3 F F F F F F / / / / / / L L L L L L L L         
          * ) ) ) ) )              > > > > >) > > ><
 
 
 
 
3 
 
 

 
 
 
 
;<NO 
 
 
:3
 3
 3
 3
 3
=>PQ 3
 3
 3
l ('	'  
-+ -+ -+ -+ -+$ -+ -+ 
-+ -+ -+r#   