
    .`i                     F   d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ dd	lmZmZmZ dd
lmZ ddlmZ  G d de          Z G d dee                   Z G d dej                  Z ej        eee           G d de                      ZdS )    )MappingN)GELUActivation)
VllmConfig)BaseDummyOptions)MULTIMODAL_REGISTRY)MultiModalDataDict   )LlavaDummyInputsBuilderLlavaNextMultiModalProcessorLlavaNextProcessingInfo)&LlavaOnevisionForConditionalGeneration)WeightsMapperc                        e Zd Zd ZdefdZdS )RVLProcessingInfoc                 4    | j                                         S N)ctxget_hf_config)selfs    r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/rvl.pyr   zRVLProcessingInfo.get_hf_config   s    x%%'''    kwargsc                 &     | j         j        di |S )N )r   get_hf_processor)r   r   s     r   r   z"RVLProcessingInfo.get_hf_processor   s    (tx(226222r   N)__name__
__module____qualname__r   objectr   r   r   r   r   r      s>        ( ( (3 3 3 3 3 3 3r   r   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	RVLDummyInputsBuilder	mm_countsreturnc                 <    |                     dd          }d}||z  S )Nimager   z<image>)get)r   r"   
num_imagesimage_tokens       r   get_dummy_textz$RVLDummyInputsBuilder.get_dummy_text!   s%    ]]7A..
Z''r   Nseq_len
mm_optionsc                     |                     dd          }| j                                        \  }}|r|                     d          nd }d|                     ||||          iS )Nr%   r   )widthheightr'   	overrides)r&   info!get_image_size_with_most_features_get_dummy_images)r   r*   r"   r+   r'   target_widthtarget_heightimage_overridess           r   get_dummy_mm_dataz'RVLDummyInputsBuilder.get_dummy_mm_data'   s|     ]]7A..
&*i&Q&Q&S&S#m5?I*..111T T++"$%)	 ,  
 	
r   r   )
r   r   r   r   strintr)   r   r   r6   r   r   r   r!   r!       s        (S(9 (c ( ( ( ( =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r   r!   c                   B     e Zd Z fdZdej        dej        fdZ xZS )RVLMultiModalProjectorc                 x   t                                                       t          j        |j        j        d          | _        t          j        |j        j        |j        j        d          | _	        t                      | _        t          j        |j        j        |j        j        d          | _        d S )Ngư>)epsT)bias)super__init__nn	LayerNormvision_confighidden_sizepre_normLineartext_configlinear_1r   actlinear_2)r   config	__class__s     r   r?   zRVLMultiModalProjector.__init__>   s    V%9%E5QQQ	 ,*
 
 

 "##	**
 
 
r   image_featurer#   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S r   )rD   rG   rH   rI   )r   rL   hidden_statess      r   forwardzRVLMultiModalProjector.forwardM   sL    m44m44//m44r   )r   r   r   r?   torchTensorrO   __classcell__rK   s   @r   r:   r:   =   s^        
 
 
 
 
U\ el        r   r:   )r0   dummy_inputsc                   V     e Zd Z edddddd          Zdd	d
ededdf fdZ xZS )RForConditionalGenerationzlanguage_model.model.zvision_tower.zmulti_modal_projector.image_newlinezlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zmodel.image_newlinezlm_head.)orig_to_new_prefix )prefixvllm_configrZ   r#   Nc                    t                                          ||           |j        j        }t	          |          | _        d S )N)r[   rZ   )r>   r?   model_config	hf_configr:   multi_modal_projector)r   r[   rZ   rJ   rK   s       r   r?   z"RForConditionalGeneration.__init__h   sA    [@@@)3%;F%C%C"""r   )	r   r   r   r   hf_to_vllm_mapperr   r7   r?   rR   rS   s   @r   rV   rV   V   s         & &=#2,D#21
 

 
 
 BD D D Dz D3 D D D D D D D D D D Dr   rV   )collections.abcr   rP   torch.nnr@   transformers.activationsr   vllm.configr   vllm.config.multimodalr   vllm.multimodalr   vllm.multimodal.inputsr   
llava_nextr
   r   r   llava_onevisionr   utilsr   r   r!   Moduler:   register_processorrV   r   r   r   <module>rm      s   $ # # # # #        3 3 3 3 3 3 " " " " " " 3 3 3 3 3 3 / / / / / / 5 5 5 5 5 5         
 D C C C C C            3 3 3 3 3/ 3 3 3
 
 
 
 
34EF 
 
 
:    RY   2 (' 	&  
D D D D D F D D 
D D Dr   