
    .`i6              	          U d dl mZmZmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZ d dlmZmZmZ d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;  ee<          Z= G d de*          Z> G d de*          Z?e>e?z  Z@eeAd<    G d de	jB                  ZC G d de"          ZD G d de eD                   ZE G d  d!e!eD                   ZF ejG        eFeDeE"           G d# d$e	jB        e.e/e0                      ZHdS )%    )IterableMappingSequence)	AnnotatedLiteral	TypeAliasN)nn)BatchFeaturePaliGemmaConfig)
VllmConfig)BaseDummyOptions)init_logger)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalInputsMultiModalKwargsItemsMultiModalUUIDDict)ImageEmbeddingItemsImageProcessorItemsMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptIndexTargetsPromptInsertionPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPP)MultiModelKeys)SiglipVisionModel)AutoWeightsLoaderWeightsMapperinit_vllm_registered_modelmaybe_prefix)get_vision_encoder_infoc                   j    e Zd ZU dZdZed         ed<   eej	         e
dddd          f         ed<   d	S )
PaliGemmaImagePixelInputsz
    Dimensions:
        - bn: Batch size * number of images
        - c: Number of channels (3)
        - h: Height
        - w: Width
    pixel_valuestypebn   hwdataN__name__
__module____qualname____doc__r1   r   __annotations__r   torchTensorr!        x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/paligemma.pyr/   r/   9   s[           %3D'.
!222
EL++dAsC"@"@@
AAAAAAr@   r/   c                   h    e Zd ZU dZdZed         ed<   eej	         e
ddd          f         ed<   dS )	PaliGemmaImageEmbeddingInputsz
    Dimensions:
        - bn: Batch size * number of images
        - ifs: Image feature size
        - hs: Hidden size (must match language model backbone)
    image_embedsr1   r2   ifshsr6   Nr7   r?   r@   rA   rC   rC   F   sY           %3D'.
!222
EL++dE4"@"@@
AAAAAAr@   rC   PaliGemmaImageInputsc                   L     e Zd Zdedef fdZdej        dej        fdZ xZS )PaliGemmaMultiModalProjectorvision_hidden_sizeprojection_dimc                     t                                                       t          j        ||d          | _        d S )NT)bias)super__init__r	   Linearlinear)selfrJ   rK   	__class__s      rA   rO   z%PaliGemmaMultiModalProjector.__init__X   s6    i 2NNNNr@   image_featuresreturnc                 0    |                      |          }|S N)rQ   )rR   rT   hidden_statess      rA   forwardz$PaliGemmaMultiModalProjector.forward]   s    N33r@   )	r8   r9   r:   intrO   r=   r>   rY   __classcell__rS   s   @rA   rI   rI   W   sy        O3 O O O O O O O
el u|        r@   rI   c                   P    e Zd Zd Zd Zdeeedz  f         fdZdededefdZ	dS )	PaliGemmaProcessingInfoc                 @    | j                             t                    S rW   )ctxget_hf_configr   rR   s    rA   ra   z%PaliGemmaProcessingInfo.get_hf_configc   s    x%%o666r@   c                 D    t          |                                           S rW   )r-   ra   rb   s    rA   r-   z/PaliGemmaProcessingInfo.get_vision_encoder_infof   s    &t'9'9';';<<<r@   rU   Nc                 
    ddiS )Nimager"   r?   rb   s    rA   get_supported_mm_limitsz/PaliGemmaProcessingInfo.get_supported_mm_limitsi   s    |r@   image_widthimage_heightc                X    |                                  }|                    ||          S )Nrg   rh   )r-   get_num_image_tokens)rR   rg   rh   vision_encoder_infos       rA   rk   z,PaliGemmaProcessingInfo.get_num_image_tokensl   s9     #::<<"77#% 8 
 
 	
r@   )
r8   r9   r:   ra   r-   r   strrZ   rf   rk   r?   r@   rA   r^   r^   b   s        7 7 7= = =cDj)A    
 
 	

 

 
 
 
 
 
r@   r^   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	PaliGemmaDummyInputsBuilder	mm_countsrU   c                     dS )N r?   )rR   rp   s     rA   get_dummy_textz*PaliGemmaDummyInputsBuilder.get_dummy_text{   s    rr@   Nseq_len
mm_optionsc                     | j                                         }|j        }|j        }|                    dd          }|r|                    d          nd }d|                     ||||          iS )Nre   r   )widthheight
num_images	overrides)infora   vision_config
image_sizeget_get_dummy_images)	rR   rt   rp   ru   	hf_configr|   max_image_sizery   image_overridess	            rA   get_dummy_mm_dataz-PaliGemmaDummyInputsBuilder.get_dummy_mm_data~   s     I++--	!/&1]]7A..
5?I*..111T T++$%%)	 ,  
 	
r@   rW   )
r8   r9   r:   r   rm   rZ   rs   r   r   r   r?   r@   rA   ro   ro   z   s        S(9 c     =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r@   ro   c                   4    e Zd Zdedeeef         deeef         deeef         def
 fdZdedeeef         deeef         fd	Z	d
e
deeef         dedee         fdZ	 	 ddeee         z  dedeeef         deeef         dz  dedz  def fdZ xZS )PaliGemmaMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrU   c                     | j                                         }|s7|                    |d          }t          t	          |g          d          S t                                          ||||          S )NF)add_special_tokens)	input_idspt)tensor_type)r   r   r   r   )r{   get_tokenizerencoder
   dictrN   _call_hf_processor)rR   r   r   r   r   	tokenizer
prompt_idsrS   s          rA   r   z/PaliGemmaMultiModalProcessor._call_hf_processor   s     I++--	 	P"))&U)KKJ
| < < <$OOOOww))!	 * 
 
 	
r@   	hf_inputshf_processor_mm_kwargsc                 F    t          t          j        d                    S )Nre   )r0   )r   r   batched)rR   r   r   s      rA   _get_mm_fields_configz2PaliGemmaMultiModalProcessor._get_mm_fields_config   s!    
 !6!>w!G!GHHHHr@   mm_itemsout_mm_kwargsc                 6     j                                         }|j         j                                         }|j        t          t                    sJ dt          f fd}t          dt          j	        |j
        rgng           |          gS )Nitem_idxc                 T                        dt          t          f          }t          |t                    r|                    |           }n;|                    |           }j                            |j        |j	                  }g|z  }t          j        |gz             S )Nre   rj   )embed_token_id)	get_itemsr   r   
isinstanceget_feature_sizeget_image_sizer{   rk   rw   rx   r   select_token_id)	r   imagesnum_image_tokensr}   image_tokensbos_token_idimage_token_idr   rR   s	        rA   get_insertionzGPaliGemmaMultiModalProcessor._get_prompt_updates.<locals>.get_insertion   s    ''-/BC F &"566 #)#:#:8#D#D  #228<<
#'9#A#A * 0!+!2 $B $ $ 
 ++.>>L&6~--   r@   re   )modalitytarget	insertion)r{   ra   image_token_indexr   r   r   rZ   r   r   prefixadd_bos_token)	rR   r   r   r   r   r   r   r   r   s	   ``     @@rA   _get_prompt_updatesz0PaliGemmaMultiModalProcessor._get_prompt_updates   s     I++--	"4I++--	 -,,,,,,	C 	 	 	 	 	 	 	 	 	2  )0&/&=E\NN2  (  
 	
r@   Ntokenization_kwargsmm_uuidsc                 :   t                                          |||||          }|d         }| j                                        }d}	|                    |	          d         }
t          |          r&|d         |
k    r|                    |
           ||d<   |S )N)r   prompt_token_ids
)rN   applyr{   r   r   lenappend)rR   r   r   r   r   r   	mm_inputsr   r   newline_promptnewline_token_idrS   s              rA   r   z"PaliGemmaMultiModalProcessor.apply   s     GGMM" " 
 
	 %%78I++--	$++N;;B?    	=%5b%9=M%M%M##$4555,<I()r@   NN)r8   r9   r:   rm   r   objectr
   r   r   r   r   r   r   r   r   listrZ   r   r   r   r   r[   r\   s   @rA   r   r      s       

 f%
 3;'	

 CK(
 

 
 
 
 
 
&II !(V 4I 
++	,	I I I I.
%.
 !(V 4.
 -	.

 
,	.
 .
 .
 .
j <@.2 d3i $ !(V 4	
 %S&[1D8 %t+ 
         r@   r   )r{   dummy_inputsc                       e Zd Zg dddgdZ eddddd	
          Zededededz  fd            Z	ddde
def fdZdededz  fdZdedej        dej        fdZdedej        fdZdedefdZ	 	 d+dej        dej        dedz  d ej        dz  dedefd!Zd"ej        dej        dz  fd#Zd$eeeej        f                  dee         fd%Zdefd&Zd'edefd(Zd)edefd*Z  xZ!S ),!PaliGemmaForConditionalGeneration)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projzlanguage_model.model.zvision_tower.zmulti_modal_projector.zlanguage_model.lm_head.)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.zlm_head.)orig_to_new_prefixr   irU   Nc                 N    |                     d          rd S t          d          )Nre   z Only image modality is supported)
startswith
ValueError)clsr   r   s      rA   get_placeholder_strz5PaliGemmaForConditionalGeneration.get_placeholder_str  s,    w'' 	4;<<<r@   rr   r   vllm_configr   c          	      0   t                                                       |j        j        }|j        }|j        j        }|| _        || _        || _        |                     |d          5  t          |j	        |t          |d                    | _        t          |j	        j        |j	        j                  | _        d d d            n# 1 swxY w Y   |j        j        dk    rdg|j        _        ndg|j        _        |                     |          5  t)          ||j        t          |d          	          | _        t-          |d
d          }| j        j        xj        |z  c_        d d d            n# 1 swxY w Y   | j        j        | _        d S )Nre   vision_towerr   )rJ   rK   gemmaGemmaForCausalLMGemma2ForCausalLMlanguage_model)r   r   r   logit_scaleg      ?)rN   rO   model_configr   quant_configmultimodal_configconfig_mark_tower_modelr(   r|   r,   r   rI   hidden_sizerK   multi_modal_projectortext_config
model_typearchitectures_mark_language_modelr+   r   getattrlogits_processorscalemake_empty_intermediate_tensors)rR   r   r   r   r   r   r   rS   s          rA   rO   z*PaliGemmaForConditionalGeneration.__init__#  s3   )3"/'4F!2(##K99 		 		 1$#FN;;! ! !D
 *F#)#7#C%3B* * *D&		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 (G330B/CF,,0C/DF,&&{33 	F 	F"<' ,#F,<==# # #D "&-==K066+E66	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F ? 	,,,s&   ,ACCCAE::E>E>kwargsc                     |                     dd           }|                     dd           }||d S |(| j        j        j        x}}t	          d|||d          S |t          d|          S t          d          )Nr0   rD   )r4   r5   )r1   r6   resolve_bindings)r1   r6   z This line should be unreachable.)popr   r|   r}   r/   rC   AssertionError)rR   r   r0   rD   r4   r5   s         rA   _parse_and_validate_image_inputzAPaliGemmaForConditionalGeneration._parse_and_validate_image_inputJ  s     zz.$77zz.$77L$84#K-88A,#!'(q!1!1    #0#!   
 ?@@@r@   r   r0   c                     |                                 j        j        } ||                    |                    }|S )N)dtype)get_input_embeddingsweightr   to)rR   r   r0   target_dtyperT   s        rA   _image_pixels_to_featuresz;PaliGemmaForConditionalGeneration._image_pixels_to_featuresd  s>    
 $88::AG%looLo&I&IJJr@   image_inputc                     |d         dk    r|d         S |d         }|                      | j        |          }|                     |          S )Nr1   rD   r6   )r   r   r   )rR   r   r0   rT   s       rA   _process_image_inputz6PaliGemmaForConditionalGeneration._process_image_inputn  s]     v.00v&&"6*77
 

 )).999r@   c                 v     | j         di |}|g S |                     |          }|| j        j        dz  z  }|S )Ng      r?   )r   r   r   r   )rR   r   r   vision_embeddingss       rA   embed_multimodalz2PaliGemmaForConditionalGeneration.embed_multimodal}  sT    :d:DDVDDI 55kBB-1H$1NO  r@   r   	positionsintermediate_tensorsinputs_embedsc                 J    |d }| j                             ||||          }|S )N)r   )r   model)rR   r   r   r   r   r   rX   s          rA   rY   z)PaliGemmaForConditionalGeneration.forward  s>      + M+11y"6m 2 
 
 r@   rX   c                 6    | j                             |          S rW   )r   compute_logits)rR   rX   s     rA   r  z0PaliGemmaForConditionalGeneration.compute_logits  s     "11-@@@r@   weightsc                 X    t          |           }|                    || j                  S )N)mapper)r)   load_weightshf_to_vllm_mapper)rR   r  loaders      rA   r  z.PaliGemmaForConditionalGeneration.load_weights  s+    "4((""743I"JJJr@   c                 0    t          j        ddd          S )Nr   r   r   )r   	connectortower_model)r'   from_string_fieldrb   s    rA   get_mm_mappingz0PaliGemmaForConditionalGeneration.get_mm_mapping  s%    /+-&
 
 
 	
r@   r   c                     |S rW   r?   )rR   r   s     rA   get_num_mm_encoder_tokensz;PaliGemmaForConditionalGeneration.get_num_mm_encoder_tokens  s    r@   num_vision_tokensc                     |S rW   r?   )rR   r  s     rA   get_num_mm_connector_tokensz=PaliGemmaForConditionalGeneration.get_num_mm_connector_tokens  s      r@   r   )"r8   r9   r:   packed_modules_mappingr*   r  classmethodrm   rZ   r   r   rO   r   rG   r   r(   r=   r>   r   r   r#   r   r   rY   r  r   tuplesetr  r'   r  r  r  r[   r\   s   @rA   r   r      s       
 
 
 

 
 & &=#2,D1
 
   =3 =3 =3: = = = [= BD %
 %
 %
z %
3 %
 %
 %
 %
 %
 %
NAA		$A A A A4' l 
	   :): 
: : : :! !4H ! ! ! ! <@-1 < < 2D8	
 |d*  
   "A|A 
	A A A AKHU33D-E$F K3s8 K K K K
 
 
 
 
 #  #        !S !S ! ! ! ! ! ! ! !r@   r   )Icollections.abcr   r   r   typingr   r   r   r=   r	   transformersr
   r   vllm.configr   vllm.config.multimodalr   vllm.loggerr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   r   r   vllm.sequencer   vllm.utils.tensor_schemar    r!   
interfacesr#   r$   r%   r&   module_mappingr'   siglipr(   utilsr)   r*   r+   r,   visionr-   r8   loggerr/   rC   rG   r<   ModulerI   r^   ro   r   register_processorr   r?   r@   rA   <module>r,     s<   8 7 7 7 7 7 7 7 7 7 7 0 0 0 0 0 0 0 0 0 0        6 6 6 6 6 6 6 6 " " " " " " 3 3 3 3 3 3 # # # # # # / / / / / /                      
                  . - - - - - > > > > > > > >            + * * * * * % % % % % %            , + + + + +	X		
B 
B 
B 
B 
B 
B 
B 
B	B 	B 	B 	B 	BL 	B 	B 	B  == i   
    29   
 
 
 
 
0 
 
 
0
 
 
 
 
"89P"Q 
 
 
8e e e e e#:;R#S e e eP (' 	 ,  
i! i! i! i! i!I|/i! i! 
i! i! i!r@   