
     `iF                     `   d Z ddlmZmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ  ej        e          Ze G d de                      Z ed           G d de                      Z ed           G d dee	                      Zg dZdS )zPyTorch Fuyu model.    )OptionalUnionN)nn   )Cache)GenerationMixin)CausalLMOutputWithPast)PreTrainedModel)	AutoModel)auto_docstringcan_return_tuplelogging   )
FuyuConfigc                   @    e Zd ZU eed<   dZdZdZdZdZ	dZ
g ZdZd ZdS )FuyuPreTrainedModelconfigfuyuTpast_key_valuesc                    | j         j        }t          |t          j                  rJ|j        j                            d|           |j         |j        j        	                                 d S d S t          |t          j
                  rS|j        j                            d|           |j        -|j        j        |j                 	                                 d S d S d S )Ng        )meanstd)r   initializer_range
isinstancer   Linearweightdatanormal_biaszero_	Embeddingpadding_idx)selfmoduler   s      z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/fuyu/modeling_fuyu.py_init_weightsz!FuyuPreTrainedModel._init_weights.   s    k+fbi(( 	?M&&CS&999{& &&((((( '&-- 	?M&&CS&999!-"6#56<<>>>>>	? 	?--    N)__name__
__module____qualname__r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placementr&    r'   r%   r   r   "   s`         &*#"&N"3	? 	? 	? 	? 	?r'   r   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                       e Zd ZddiZdef fdZd Zd Zd Zd Z	d	e
j        d
ee
j                 de
j        de
j        fdZde
j        fdZde
j        de
j        de
j        fdZe	 	 	 	 	 	 	 	 	 	 	 ddee
j                 dee
j                 dee
j                 dee
j                 dee
j                 dee         dee
j                 dee         dee         dee         dee         deeef         fd            Z xZS ) 	FuyuModelzlanguage_model.modellanguage_modelr   c                 ^   t                                          |           |j        | _        |j        j        | _        t          j        |j                  | _        t          j
        |j        |j        z  |j        z  |j                  | _        d| _        |                                  d S )NF)super__init__pad_token_idr"   text_config
vocab_sizer   from_configr8   r   r   
patch_sizenum_channelshidden_sizevision_embed_tokensgradient_checkpointing	post_initr#   r   	__class__s     r%   r;   zFuyuModel.__init__B   s       !. ,7'3F4FGG#%9 11F4GGI[$
 $
  ',#r'   c                 4    | j                                         S N)r8   get_input_embeddingsr#   s    r%   rJ   zFuyuModel.get_input_embeddingsO   s    "77999r'   c                 :    | j                             |           d S rI   )r8   set_input_embeddingsr#   values     r%   rM   zFuyuModel.set_input_embeddingsR   s    0077777r'   c                     || _         d S rI   r8   r#   decoders     r%   set_decoderzFuyuModel.set_decoderU   s    %r'   c                     | j         S rI   rQ   rK   s    r%   get_decoderzFuyuModel.get_decoderX   s    ""r'   word_embeddingscontinuous_embeddingsimage_patch_input_indicesreturnc           
      R   |j         d         t          |          k    s-t          dt          |          d|j         d                   |                                }t	          |j         d                   D ]}t          j        ||         dk    d          d         }||         |         }|j         d         ||         j         d         k    r)t          d||         j         d|j         d| d	          ||         |                             |j                  |||f<   |S )
a  This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
                indices in image_patch_input_indices for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchnonzerotodevice)r#   rW   rX   rY   output_embeddings	batch_idxdst_indicessrc_indicess           r%   gather_continuous_embeddingsz&FuyuModel.gather_continuous_embeddings[   s{   (  %a(C0E,F,FFFls3H/I/IllQ`QfghQill   ,11334Q788 	 	I  -(A)(LPQ(Q\`aaabcdK 4I>{KK #&;I&F&LQ&OOO i7LY7W7] i i6A6Gi i\ei i i   9Ni8XYd8e8h8h!(9 9i455 ! r'   pixel_valuesc                 $      fd|D             }|S )a$  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        c                     g | ]L}                     |                    j         j        j                                                d           MS )r   )rC   re   r   dtypesqueeze).0patchr#   s     r%   
<listcomp>z0FuyuModel.get_image_features.<locals>.<listcomp>   s[     
 
 
 $$UXXd.F.M.S%T%TUU]]^_``
 
 
r'   r4   )r#   rl   kwargspatch_embeddingss   `   r%   get_image_featureszFuyuModel.get_image_features   s4    
 
 
 
%
 
 
  r'   	input_idsinputs_embedsimage_featuresc                 \   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                                }|	                    d          
                    |                              |j                  }|j        d         |j        d         z  }||                                         |                                k    rt          d| d|           |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nro   rf   r   r   z6Image features and image tokens do not match: tokens: z, features )rJ   rc   tensorr   image_token_idlongrf   allsum	unsqueeze	expand_asre   r^   numelr`   )r#   rw   rx   ry   special_image_maskn_image_tokensn_image_featuress          r%   get_placeholder_maskzFuyuModel.get_placeholder_mask   s/    !.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;!*dk.H!H+//11/99"==GGVVYYZgZnoo)/2^5I!5LL+,22448L8L8N8NNNvvvdtvv   "!r'   Nimage_patchesimage_patches_indicesattention_maskposition_idsr   	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictc                 .   |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }||t          d          ||j        \  }}n||j        \  }}}nt          d          |b||j        n|j        }||                                nd}t          j
        |||z   t          j        |          }|                    d          }|" | j                                        |          }|w|                     |          }t          j        |d                              |j        |j                  }|                     |||          }|                    ||          } | j        d	|||||	|
||d|}|S )
a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez4You have to specify either input_is or inputs_embedsr   r{   )dim)rx   ry   )rx   r   r   r   r   r   r   r   r4   )r   r   r   r   use_return_dictr`   r^   rf   get_seq_lengthrc   aranger   r   r8   rJ   rv   catre   ro   r   masked_scatter)r#   rw   r   r   r   r   r   rx   r   r   r   r   rt   
batch_size
seq_length_rf   past_key_values_lengthru   r   outputss                        r%   forwardzFuyuModel.forward   s   0 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B] ]%>cddd"%._"J

&(5(;%J
AASTTT)2)>Y%%MDXFIXId_%C%C%E%E%Ejk" <&
5K(KSXS]fl  L (11!44L FD/DDFFyQQM$#66}EE$y)9qAAADD]EY[h[noo!%!:!:GW "; " " *889KM]^^M%$% 

')%+/!5#

 

 

 

 r'   )NNNNNNNNNNN)r(   r)   r*   _checkpoint_conversion_mappingr   r;   rJ   rM   rT   rV   rc   Tensorlistrk   FloatTensorrv   
LongTensorr   r   r   r   boolr   tupler	   r   __classcell__rG   s   @r%   r7   r7   :   sH        '=>N%O"z      : : :8 8 8& & &# # #*!*!  $EL1*! $)<	*!
 
*! *! *! *!X u/@        ")":?:K"]b]n" " " "0  15048<1537+/59$(,0/3&*G GE,-G  -	G
  (5G !.G u/0G "%G   12G D>G $D>G 'tnG d^G 
u,,	-G G G ^G G G G Gr'   r7   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c            !           e Zd ZddddZdgZdef fdZd Zd	 Zd
 Z	d Z
ee	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 dee         deej                 dee         deej                 dee         dee         dee         dee         deeef         fd                        Z	 	 	 	 	 	 d fd	Z xZS ) FuyuForCausalLMzmodel.language_modelzmodel.vision_embed_tokenslm_head)z^language_model.modelz^vision_embed_tokensz^language_model.lm_headzlm_head.weightr   c                     t                                          |           t          |          | _        t	          j        |j        j        |j        j        d          | _	        | 
                                 d S )NF)r   )r:   r;   r7   modelr   r   r=   rB   r>   r   rE   rF   s     r%   r;   zFuyuForCausalLM.__init__  se       v&&
y!3!?ASA^ejkkkr'   c                 4    | j                                         S rI   )r   rJ   rK   s    r%   rJ   z$FuyuForCausalLM.get_input_embeddings  s    z..000r'   c                 :    | j                             |           d S rI   )r   rM   rN   s     r%   rM   z$FuyuForCausalLM.set_input_embeddings  s    
''.....r'   c                 :    | j                             |           d S rI   )r   rT   rR   s     r%   rT   zFuyuForCausalLM.set_decoder  s    
w'''''r'   c                 4    | j                                         S rI   )r   rV   rK   s    r%   rV   zFuyuForCausalLM.get_decoder  s    z%%'''r'   Nr   rw   r   r   r   r   r   rx   r   labelsr   r   r   logits_to_keeprZ   c                    |
|
n| j         j        }
||n| j         j        }||n| j         j        }||n| j         j        }|                     ||||||||
||d          }|d         }t          |t                    rt          | d          n|}| 	                    |dd|ddf                   }d}|	  | j
        d||	| j         j        j        d|}t          |||j        |j        |j                  S )a@  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```NT)rw   r   r   rx   r   r   r   r   r   r   r   r   )logitsr   r>   )lossr   r   hidden_states
attentionsr4   )r   r   r   r   r   r   r   intslicer   loss_functionr=   r>   r	   r   r   r   )r#   rw   r   r   r   r   r   rx   r   r   r   r   r   r   rt   r   r   slice_indicesr   r   s                       r%   r   zFuyuForCausalLM.forward  s_   l 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]**'"7')%+/!5  
 
  
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%4% f9P9[ _e D &#3!/)
 
 
 	
r'   c           
      v     t                      j        |f||||||d|}	|d         dk    r
d |	d<   d |	d<   |	S )N)r   r   rx   r   r   cache_positionr   r   r   )r:   prepare_inputs_for_generation)r#   rw   r   r   rx   r   r   r   rt   model_inputsrG   s             r%   r   z-FuyuForCausalLM.prepare_inputs_for_generationw  ss     =uww<	
+)''"7)	
 	
 	
 	
 !!!48L01,0L)r'   )NNNNNNNNNNNNr   )NNNNNN)r(   r)   r*   r   _tied_weights_keysr   r;   rJ   rM   rT   rV   r   r   r   rc   r   r   r   r   r   r   r   r   r	   r   r   r   r   s   @r%   r   r      s#        "8 ;#,& &"
 ++z      1 1 1/ / /( ( (( ( (  15048<1537+/59$()-,0/3&*()\
 \
E,-\
  -	\

  (5\
 !.\
 u/0\
 "%\
   12\
 D>\
 &\
 $D>\
 'tn\
 d^\
 !\
" 
u,,	-#\
 \
 \
 ^ \
B "         r'   r   )r   r   r7   )__doc__typingr   r   rc   r   cache_utilsr   
generationr   modeling_outputsr	   modeling_utilsr
   models.auto.modeling_autor   utilsr   r   r   configuration_fuyur   
get_loggerr(   loggerr   r7   r   __all__r4   r'   r%   <module>r      s     " " " " " " " "                    ) ) ) ) ) ) 6 6 6 6 6 6 - - - - - - 2 2 2 2 2 2 > > > > > > > > > > * * * * * * 
	H	%	% ? ? ? ? ?/ ? ? ?.   
v v v v v# v v 
vr   
W W W W W)? W W 
Wt B
A
Ar'   