
    .`i2                        d Z ddlZddlmZmZmZ ddlmZmZ ddl	Z	ddl
mZ ddlmZmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7 dZ8dZ9 G d de-          Z: G d de&          Z; G d de$e;                   Z< G d de%e;                   Z= ej>        e=e;e<           G d dej?        e1e2                      Z@dS ) zPyTorch Fuyu model.    N)IterableMappingSequence)	AnnotatedLiteral)BatchFeature
FuyuConfigFuyuImageProcessorFuyuProcessor)
VllmConfig)BaseDummyOptions)ColumnParallelLinear)PersimmonForCausalLM)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)ImageProcessorItems	ImageSizeMultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderWeightsMapper
flatten_bnmaybe_prefixic ik c                       e Zd ZU dZdZed         ed<   eej	         e
dd          f         ed<   eee          e
d          f         ed<   d	S )
FuyuImagePatchInputsz
    Dimensions:
        - bn: Batch size * number of images
        - bnp: Batch size * number of images * number of patches
        - fn: patch_size_x * patch_size_y * num_channels
    image_patchestypebnpfnimage_patches_flatbnpatches_per_imageN)__name__
__module____qualname____doc__r+   r   __annotations__r   torchTensorr   listint     s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/vllm/model_executor/models/fuyu.pyr)   r)   ;   s           &5D'/
"444!%,E40H0H"HIIII cKK,=,=!=>>>> r;   r)   c                       e Zd Zd ZdefdZdedefdZdee	e
dz  f         fdZde
d	e
dee
e
f         fd
Zde
d	e
de
fdZdefdZdS )FuyuProcessingInfoc                 @    | j                             t                    S N)ctxget_hf_configr	   selfs    r<   rB   z FuyuProcessingInfo.get_hf_configQ   s    x%%j111r;   kwargsc                 2     | j         j        t          fi |S r@   )rA   get_hf_processorr   rD   rE   s     r<   rG   z#FuyuProcessingInfo.get_hf_processorT   s    (tx(AA&AAAr;   returnc                 &     | j         di |j        S Nr:   )rG   image_processorrH   s     r<   get_image_processorz&FuyuProcessingInfo.get_image_processorW   s    $t$..v..>>r;   Nc                 
    ddiS )Nimager    r:   rC   s    r<   get_supported_mm_limitsz*FuyuProcessingInfo.get_supported_mm_limitsZ   s    |r;   image_widthimage_heightc                   |                                  }|j        d         }|j        d         }|j        d         }|j        d         }||k    r||k    s>||z  }||z  }	t          ||	          }
t	          ||
z            }t	          ||
z            }t          j        ||z            }t          j        ||z            }||fS )Nwidthheight)rM   size
patch_sizeminr9   mathceil)rD   rQ   rR   rL   target_widthtarget_heightpatch_widthpatch_heightheight_scale_factorwidth_scale_factoroptimal_scale_factorncolsnrowss                r<   get_image_feature_grid_sizez.FuyuProcessingInfo.get_image_feature_grid_size]   s     2244&+G4',X6%09&1(;|++0M0M"/,">!-!;#&':<N#O#O |.BBCCLk,@@AAK	+344	,566e|r;   c                @    |                      ||          \  }}||z  S )NrQ   rR   )rd   )rD   rQ   rR   rb   rc   s        r<   get_num_image_tokensz'FuyuProcessingInfo.get_num_image_tokensu   s4     77#% 8 
 
u
 u}r;   c                 x    |                                  }t          |j        d         |j        d                   S )NrT   rU   )rT   rU   )rM   r   rV   )rD   rL   s     r<   !get_image_size_with_most_featuresz4FuyuProcessingInfo.get_image_size_with_most_features   s?    2244!&w/8LX8V
 
 
 	
r;   )r1   r2   r3   rB   objectrG   r
   rM   r   strr9   rP   tuplerd   rg   r   ri   r:   r;   r<   r>   r>   P   s       2 2 2B B B B B?F ?7I ? ? ? ?cDj)A      	
 
sCx   0  	
 
   
9 
 
 
 
 
 
r;   r>   c            	       p    e Zd Zdeeef         defdZ	 ddedeeef         deeef         dz  defdZ	dS )	FuyuDummyInputsBuilder	mm_countsrI   c                     dS )N r:   )rD   ro   s     r<   get_dummy_textz%FuyuDummyInputsBuilder.get_dummy_text   s    rr;   Nseq_len
mm_optionsc                     | j                                         \  }}|                    dd          }|r|                    d          nd }d|                     ||||          iS )NrO   r   )rT   rU   
num_images	overrides)infori   get_get_dummy_images)rD   rs   ro   rt   r[   r\   rv   image_overridess           r<   get_dummy_mm_dataz(FuyuDummyInputsBuilder.get_dummy_mm_data   s|     '+i&Q&Q&S&S#m]]7A..
5?I*..111T T++"$%)	 ,  
 	
r;   r@   )
r1   r2   r3   r   rk   r9   rr   r   r   r|   r:   r;   r<   rn   rn      s        S(9 c     =A	
 

 38$
 C!112T9	

 

 
 
 
 
 
r;   rn   c            
            e Zd Zdedeeef         deeef         deeef         def
 fdZdee	         dee	         fdZ
d	ed
eeef         deeef         fdZded
eeef         dedee         fdZ xZS )FuyuMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsrI   c                    |sa| j                                                             |          }|                     |          }t	          t          |g          d          S t                                          ||||          }|d         }t          |          |d<   t          j
        d |D                       |d<   |S )N)	input_idspt)tensor_type)r   r   r   r   r*   c                 ,    g | ]}t          |          S r:   )len).0ps     r<   
<listcomp>z>FuyuMultiModalProcessor._call_hf_processor.<locals>.<listcomp>   s    +++SVV+++r;   r0   )rx   get_tokenizerencode_apply_hf_processor_tokens_onlyr   dictsuper_call_hf_processorr&   r6   tensor)	rD   r   r   r   r   
prompt_idsprocessed_outputsr*   	__class__s	           r<   r   z*FuyuMultiModalProcessor._call_hf_processor   s      	P002299&AAJ==jIIJ
| < < <$OOOO!GG66!	 7 
 
 */:-7-F-F/*16++]+++2
 2
-. ! r;   prompt_tokensc                     | j                                         }|                                }|d         }|d         |k    r|                    |           |S )Nz<0x04>)rx   r   	get_vocabappend)rD   r   	tokenizervocabboa_token_ids        r<   r   z7FuyuMultiModalProcessor._apply_hf_processor_tokens_only   s[    
 I++--	##%%X,,  ...r;   	hf_inputshf_processor_mm_kwargsc                     |                     dt          j        d                    }t          t	          j        d|          t	          j        d                    S )Nr0   r   rO   )r*   r0   )ry   r6   emptyr   r   flat_from_sizesbatched)rD   r   r   r0   s       r<   _get_mm_fields_configz-FuyuMultiModalProcessor._get_mm_fields_config   s^    
 &MM*=u{1~~NN/?*  4;GDD	
 
 
 	
r;   mm_itemsout_mm_kwargsc                 *     j                                         }|j        t          t                    sJ  j                                         }|j        }t          |t                    sJ dt          f fd}t          d|g|          gS )Nitem_idxc                 "                        dt                    }|                    |           }j                            |j        |j                  \  }}t          g|z  t          gz   |z  }t          j
        |gz   t                    S )NrO   rf   )embed_token_id)	get_itemsr   get_image_sizerx   rd   rT   rU   _IMAGE_TOKEN_ID_NEWLINE_TOKEN_IDr   select_token_id)	r   images
image_sizerb   rc   image_tokensbos_token_idr   rD   s	         r<   get_replacement_fuyuzIFuyuMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_fuyu   s    ''1DEEF..x88J9@@&,'. A  LE5 --59J8KKuTL&6~-.   r;   rO   )modalitytargetreplacement)rx   rB   r   
isinstancer9   r   r   )	rD   r   r   r   	hf_configr   eot_token_idr   r   s	   ``      @r<   _get_prompt_updatesz+FuyuMultiModalProcessor._get_prompt_updates   s     I++--	 -,,,,,,I++--	 -,,,,,,	3 	 	 	 	 	 	 	 	   $~0  
 	
r;   )r1   r2   r3   rk   r   rj   r   r   r8   r9   r   r   r   r   r   r   r   r   __classcell__r   s   @r<   r~   r~      sB       !! f%! 3;'	!
 CK(! 
! ! ! ! ! !8Cy 
c   

 !(V 4
 
++	,	
 
 
 
#
%#
 !(V 4#
 -	#

 
,	#
 #
 #
 #
 #
 #
 #
 #
r;   r~   )rx   dummy_inputsc                   f    e Zd Z edddd          Zedededed	z  fd
            Zddde	def fdZ
deded	z  fdZdedefdZdedefdZ	 	 ddej        dej        ded	z  dej        d	z  def
dZdej        dej        d	z  fdZdeeeej        f                  dee         fdZ xZS )FuyuForCausalLMzvision_embed_tokens.zlanguage_model.model.zlanguage_model.lm_head.)zmodel.vision_embed_tokens.zmodel.language_model.zlm_head.)orig_to_new_prefixr   irI   Nc                 N    |                     d          rd S t          d          )NrO   z Only image modality is supported)
startswith
ValueError)clsr   r   s      r<   get_placeholder_strz#FuyuForCausalLM.get_placeholder_str  s,    w'' 	4;<<<r;   rq   )prefixvllm_configr   c                   t                                                       |j        j        }|j        }|j        j        }|| _        || _        |j        j        | _        t          | _
        |j        dz  |j        z  | _        |                     |d          5  t          | j        |j        |d          | _        d d d            n# 1 swxY w Y   |                     |          5  t'          |                    |j                  t+          |d                    | _        d d d            n# 1 swxY w Y   | j        j        | _        d S )N   rO   T)quant_configgather_outputlanguage_model)r   r   )r   __init__model_configr   r   multimodal_configconfigtext_config
vocab_sizer   image_token_idrW   num_channelsimage_feature_size_mark_tower_modelr   hidden_sizevision_embed_tokens_mark_language_modelr   with_hf_configr'   r   make_empty_intermediate_tensors)rD   r   r   r   r   r   r   s         r<   r   zFuyuForCausalLM.__init__  s   )3"/'4F!2 ,7-"("3Q"69L"L##K99 	 	';'")"	( ( (D$	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 &&{33 	 	"6'66v7IJJ#F,<==# # #D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ? 	,,,s$   #CCC(=D11D58D5rE   c                     |                     dd           }|                     dd           }|d S t          d||d| j        i          S )Nr*   r0   r-   )r+   r.   r0   resolve_bindings)popr)   r   )rD   rE   r*   r0   s       r<   _parse_and_validate_image_inputz/FuyuForCausalLM._parse_and_validate_image_input4  sb     

?D99"JJ':DAA 4# ,/"D$;<	
 
 
 	
r;   image_inputc                     |d         }|d         }|                      |          \  }}|                    |                                d          S )Nr.   r0   r   )dim)r   splittolist)rD   r   r.   r0   vision_embeddings_flat_s         r<   _process_image_inputz$FuyuForCausalLM._process_image_inputD  s\     ))=>'(;<$($<$<=O$P$P!%++,=,D,D,F,FA+NNNr;   c                 N     | j         di |}|g S |                     |          S rK   )r   r   )rD   rE   r   s      r<   embed_multimodalz FuyuForCausalLM.embed_multimodalN  s9    :d:DDVDDI((555r;   r   	positionsintermediate_tensorsinputs_embedsc                 @    |d }|                      ||||          }|S )N)r   r   r   r   )r   )rD   r   r   r   r   rE   hidden_statess          r<   forwardzFuyuForCausalLM.forwardU  s=      + M++!5'	 , 
 
 r;   r   c                 6    | j                             |          S r@   )r   compute_logits)rD   r   s     r<   r   zFuyuForCausalLM.compute_logitsh  s     "11-@@@r;   weightsc                 J    t          |           }|                    |          S r@   )r$   load_weights)rD   r   loaders      r<   r   zFuyuForCausalLM.load_weightsn  s#    "4((""7+++r;   )NN)r1   r2   r3   r%   hf_to_vllm_mapperclassmethodrk   r9   r   r   r   rj   r)   r   r!   r   r   r6   r7   r   r   r   r   rl   setr   r   r   s   @r<   r   r     s        &*@%<1
 
   =3 =3 =3: = = = [= BD 
 
 
z 
3 
 
 
 
 
 
<

		$
 
 
 
 O/O	O O O O6 64H 6 6 6 6 <@-1 < < 2D8	
 |d*    &A|A 
	A A A A,HU33D-E$F ,3s8 , , , , , , , ,r;   r   )Ar4   rY   collections.abcr   r   r   typingr   r   r6   torch.nnnntransformersr   r	   r
   r   vllm.configr   vllm.config.multimodalr   !vllm.model_executor.layers.linearr   $vllm.model_executor.models.persimmonr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   r   vllm.sequencer   vllm.utils.tensor_schemar   r   
interfacesr!   r"   r#   utilsr$   r%   r&   r'   r   r   r)   r>   rn   r~   register_processorModuler   r:   r;   r<   <module>r     s6  &    7 7 7 7 7 7 7 7 7 7 % % % % % % % %        T T T T T T T T T T T T " " " " " " 3 3 3 3 3 3 B B B B B B E E E E E E / / / / / /         
 V U U U U U U U U U                . - - - - - > > > > > > > > L L L L L L L L L L M M M M M M M M M M M M      <   *6
 6
 6
 6
 6
+ 6
 6
 6
r
 
 
 
 
34FG 
 
 
2\
 \
 \
 \
 \
56HI \
 \
 \
~ ('	'  
j, j, j, j, j,bi!3Z j, j, 
j, j, j,r;   