
    .`i!                        d Z ddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZmZ ddlmZ ddlm Z! ddlm"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+  G d de&          Z, G d de          Z- G d dee,                   Z. G d de$          Z/ ej0        e.e,e/           G d de"                      Z1dS )zDInference-only OpenCUA-7B model compatible with HuggingFace weights.    )MappingSequence)AnyN)BatchFeature)Qwen2VLImageProcessorQwen2VLProcessorQwen2VLVideoProcessor)
VllmConfig)MULTIMODAL_REGISTRY)MultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItemsMultiModalDataParser)BaseMultiModalProcessorPromptReplacementPromptUpdate)TokenizerLike   )Qwen2_5_VisionTransformer)"Qwen2_5_VLForConditionalGeneration)Qwen2VLDummyInputsBuilderQwen2VLMultiModalDataParserQwen2VLProcessingInfo_create_qwen2vl_field_factory)WeightsMapperinit_vllm_registered_modelmaybe_prefixc                   B    e Zd Zd Zdeeedz  f         fdZdefdZ	dS )OpenCUAProcessingInfoc                 4    | j                                         S N)ctxget_hf_configselfs    v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/opencua.pyr#   z#OpenCUAProcessingInfo.get_hf_config8   s    x%%'''    returnNc                 
    dd iS )Nimage r$   s    r&   get_supported_mm_limitsz-OpenCUAProcessingInfo.get_supported_mm_limits;   s    r'   kwargsc                 z    |                                  }| j                                        }t          d||d|S )zLoad OpenCUA processor.)vision_config	tokenizerr+   )get_tokenizerr"   get_hf_image_processor_configOpenCUAProcessor)r%   r-   r0   r/   s       r&   get_hf_processorz&OpenCUAProcessingInfo.get_hf_processor>   sR    &&((	>>@@ 
'
 
 
 
 	
r'   )
__name__
__module____qualname__r#   r   strintr,   objectr4   r+   r'   r&   r   r   7   sf        ( ( (cDj)A    
 
 
 
 
 
 
r'   r   c                   N     e Zd Zdededdf fdZdedef fdZ	 	 	 d
d	Z	 xZ
S )r3   attribute_nameargr(   Nc                 X    |dk    rd S t                                          ||          S )Nr0   )supercheck_argument_for_proper_class)r%   r<   r=   	__class__s      r&   r@   z0OpenCUAProcessor.check_argument_for_proper_classJ   s-    [((Fww66~sKKKr'   r/   r0   c                     t          di |}t          di |}|                    dd           } t                      j        d||||d| d| _        d S )Nchat_template)image_processorr0   video_processorrC   <|media_placeholder|>r+   )r   r	   popr?   __init__image_token)r%   r/   r0   r-   rD   rE   rC   rA   s          r&   rH   zOpenCUAProcessor.__init__O   s     0@@-@@/@@-@@

?D99 	
++'		
 	

 	
 	
 	
 3r'   c                    |'t          |t                    s|g} | j        |fi |}ni }i }|Dt          |t                    s|g}t          |          dk    r|                     ||pd          }i ||}t          ||          S )Nr   pt)return_tensors)tensor_type)
isinstancelistr0   lenrD   r   )r%   textimagesrL   r-   text_inputsimage_inputscombined_inputss           r&   __call__zOpenCUAProcessor.__call__c   s     dD)) v($.8888KKKfd++ " 6{{Q#33>+AT  4     :[9L9OHHHHr'   )NNN)r5   r6   r7   r8   r:   r@   dictr   rH   rV   __classcell__)rA   s   @r&   r3   r3   I   s        Lc L LSW L L L L L L
33 !3 3 3 3 3 3, 	I I I I I I I Ir'   r3   c            
           e Zd ZdefdZdedeeef         deee	f         fdZ
dededeeef         deeef         def
d	Zdedeeef         d
edee         fdZdS )OpenCUAMultiModalProcessorr(   c                 b    t          | j                                        j        j                  S r!   )r   infor#   r/   spatial_merge_sizer$   s    r&   _get_data_parserz+OpenCUAMultiModalProcessor._get_data_parser   s+    *I##%%3F
 
 	
r'   	hf_inputshf_processor_mm_kwargsc                 t     t          | j                                        j        j                  |          S r!   )r   r\   r#   r/   r]   )r%   r_   r`   s      r&   _get_mm_fields_configz0OpenCUAMultiModalProcessor._get_mm_fields_config   s=    

,I##%%3F
 

  	r'   prompt_textmm_itemstokenization_kwargsc                     dS )u<   vLLM이 prompt 업데이트를 처리하도록 False 반환.Fr+   )r%   rc   rd   r`   re   s        r&   _hf_processor_applies_updatesz8OpenCUAMultiModalProcessor._hf_processor_applies_updates   s	     ur'   out_mm_kwargsc                     | j         j        d
i |} | j         j        d
i |}| j                                         }|                                }| j                                         }t          |dd          }	|                    |	t          |dd                    |j        dz  dt          ffd}
t          dg|
	          gS )NrI   rF   media_placeholder_token_idipP    item_idxc                     d         |          }|d         j         }t          |t          j                  sJ t	          |                                          z  }g|z  S )Nr*   image_grid_thw)datarN   torchTensorr9   prod)rl   out_itemgrid_thw
num_tokensimage_token_idmerge_lengthrh   s       r&   get_replacement_opencuazOOpenCUAMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_opencua   s`    $W-h7H 016Hh55555X]]__--=J"#j00r'   r*   )modalitytargetreplacementr+   )r\   r4   get_image_processorr1   	get_vocabr#   getattrget
merge_sizer9   r   )r%   rd   r`   rh   hf_processorrD   r0   vocab	hf_configimage_token_strrx   rv   rw   s      `       @@r&   _get_prompt_updatesz.OpenCUAMultiModalProcessor._get_prompt_updates   s    2ty1KK4JKK7$)7QQ:PQQI++--	##%%I++--	!,?VWWI;VDD
 

 '114	1c 	1 	1 	1 	1 	1 	1 	1 	1  &'3  
 	
r'   N)r5   r6   r7   r   r^   r   r   r8   r:   r   rb   r   boolrg   r   r   r   r   r   r+   r'   r&   rZ   rZ      s       
"6 
 
 
 

 !(V 4 
++	,	    & !(V 4	
 %S&[1 
   "
%"
 !(S 1"
 -	"

 
,	"
 "
 "
 "
 "
 "
r'   rZ   c                   .    e Zd Zdeeef         defdZdS )OpenCUADummyInputsBuilder	mm_countsr(   c                 <    |                     dd          }d}||z  S )Nr*   r   rF   )r   )r%   r   
num_imagesrI   s       r&   get_dummy_textz(OpenCUADummyInputsBuilder.get_dummy_text   s%    ]]7A..
-Z''r'   N)r5   r6   r7   r   r8   r9   r   r+   r'   r&   r   r      s?        (S(9 (c ( ( ( ( ( (r'   r   )r\   dummy_inputsc                       e Zd Zg dddgdZ edddddd	          Zd
Zedede	dedz  fd            Z
dddedefdZdS )OpenCUAForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.model.zvisual.zlanguage_model.lm_head.)zmodel.language_model.zmodel.visual.zvision_tower.zlm_head.zmodel.)orig_to_new_prefixTry   ir(   Nc                 N    |                     d          rdS t          d          )Nr*   rF   z Only image modality is supported)
startswith
ValueError)clsry   r   s      r&   get_placeholder_strz3OpenCUAForConditionalGeneration.get_placeholder_str   s-    w'' 	+**;<<<r'    )prefixvllm_configr   c          
         t           j                            |            |j        j        }|j        }|j        j        }|j        dk    | _        || _	        || _
        || _        || _        |                                | _        |                     |d          5  t          |j        t          |dd          | j        t!          |d                    | _        d d d            n# 1 swxY w Y   |                     |          5  t'          ||j        t!          |d          dg	          | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )
Nro   r*   rms_norm_epsgư>visual)r/   norm_epsquant_configr   language_modelQwen2ForCausalLM)r   r   r   architectures)nnModulerH   model_configr   r   multimodal_configmm_encoder_tp_modeuse_data_parallelconfigr   is_multimodal_pruning_enabled_mark_tower_modelOpenCUAVisionTransformerr/   r~   r   r   _mark_language_modelr   text_configr   make_empty_intermediate_tensors)r%   r   r   r   r   r   s         r&   rH   z(OpenCUAForConditionalGeneration.__init__   s   
	4   )3"/'4F!2!E!O&!2(;;== 	* ##K99 	 	2$2 >>!.#FH55	  DK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	"<' ,#F,<==12	# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s%   A C&&C*-C*-D??EE)r5   r6   r7   packed_modules_mappingr   hf_to_vllm_mappersupports_encoder_tp_dataclassmethodr8   r9   r   r
   rH   r+   r'   r&   r   r      s         322$i0 
 &%<&&1-
 
    $=3 =3 =3: = = = [=
 BD !
 !
 !
z !
3 !
 !
 !
 !
 !
 !
r'   r   )2__doc__collections.abcr   r   typingr   rp   torch.nnr   transformersr   transformers.models.qwen2_vlr   r   r	   vllm.configr
   vllm.multimodalr   vllm.multimodal.inputsr   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   vllm.tokenizersr   
qwen2_5_vlr   r   r   qwen2_vlr   r   r   r   utilsr   r   r   r   r3   rZ   r   register_processorr   r+   r'   r&   <module>r      s   K J - - - - - - - -              % % % % % %          # " " " " " / / / / / /        L K K K K K K K         
 * ) ) ) ) )                              
 
 
 
 
1 
 
 
$3I 3I 3I 3I 3I' 3I 3I 3Il;
 ;
 ;
 ;
 ;
!89N!O ;
 ;
 ;
|( ( ( ( ( 9 ( ( ( ('	*  
9
 9
 9
 9
 9
&H 9
 9
 
9
 9
 9
r'   