
from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, can_return_tuple
from ..auto import AutoModel
from .configuration_vipllava import VipLlavaConfig


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for VipLlava outputs, with hidden states and attentions.
    """
)
class VipLlavaModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/vipllava/modeling_vipllava.pyr   r   &   s7         	 	 8<%"34;;;;;r    r   zT
    Base class for VipLlava causal language model (or autoregressive) outputs.
    """
)
class VipLlavaCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
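
    Example (illustrative sketch; assumes `outputs` was returned by
    `VipLlavaForConditionalGeneration.forward` with `labels` passed):

    ```python
    >>> outputs.logits.shape  # (batch_size, sequence_length, vocab_size)
    >>> outputs.loss  # scalar language-modeling loss
    ```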
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r$   r   r   r   r   r%   r&   r	   r'   tupler(   r   r   r    r!   r#   r#   ;   s           )-D(5$
%,,,*.FHU&'...'+OXe_+++8<M8E%"345<<<59Ju0129997;%"34;;;;;r    r#   c                   *     e Zd Zdef fdZd Z xZS )VipLlavaMultiModalProjectorconfigc                    t                                                       t          |j        t                    rdnt          |j                  }t          j        ||j        j	        z  |j
                  | _        t          j        ||j        j	        z  |j        j	        d          | _        t          |j                 | _        t          j        |j        j	        |j        j	        d          | _        d S )Nr   )epsTbias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfr,   num_feature_layers	__class__s      r!   r2   z$VipLlavaMultiModalProjector.__init__Z   s    ",V-I3"O"OvQQUXY_YuUvUv#%<!5!AAvGe$
 $
 $
  	!5!AA*
 
 

 &56	&"4"@&BTB`gklllr    c                     |                      |          }|                     |          }|                     |          }|                     |          }|S N)r;   r>   r@   rA   )rB   r'   s     r!   forwardz#VipLlavaMultiModalProjector.forwardi   sN    00??m44//m44r    )r   r   r   r   r2   rG   __classcell__rD   s   @r!   r+   r+   Y   sZ        m~ m m m m m m      r    r+   c                   :    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZdS )VipLlavaPreTrainedModelr,    Tr&   N)r   r   r   r   r   base_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendr   r    r!   rK   rK   q   sM         &*#"3N!"&r    rK   zx
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class VipLlavaModel(VipLlavaPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: VipLlavaConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)
        self.multi_modal_projector = VipLlavaMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def get_image_features(
        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
    ):
        """
        Obtains image last hidden states from the vision tower and applies multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, list[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
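
        Example (illustrative; assumes `model` is a loaded `VipLlavaModel` and `pixel_values`
        was produced by the checkpoint's image processor):

        ```python
        >>> image_features = model.get_image_features(pixel_values)
        >>> image_features.shape  # (num_images, image_length, embed_dim)
        ```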
        """
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )
        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

        # With a single feature layer we index it directly; with several we concatenate them
        # along the hidden dimension. The CLS token (position 0) is dropped in both cases.
        if isinstance(vision_feature_layers, int):
            image_features = image_outputs.hidden_states[vision_feature_layers][:, 1:]
        else:
            image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
            image_features = torch.cat(image_features, dim=-1)
        image_features = self.multi_modal_projector(image_features)
        return image_features

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaModelOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
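
        Example (illustrative; assumes `model` is a loaded `VipLlavaModel` and `inputs` was
        built with the matching `AutoProcessor`):

        ```python
        >>> outputs = model(**inputs, vision_feature_layers=[-2, -5, -8, -11, 6])
        >>> outputs.last_hidden_state.shape  # (batch_size, sequence_length, hidden_size)
        ```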
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
            )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()


@auto_docstring(
    custom_intro="""
    The VIPLLAVA model which consists of a vision backbone and a language model.
    """
)
class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: VipLlavaConfig):
        super().__init__(config)
        self.model = VipLlavaModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_image_features(
        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
    ):
        return self.model.get_image_features(pixel_values=pixel_values, vision_feature_layers=vision_feature_layers)

    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaCausalLMOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layers=vision_feature_layers,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute logits for the requested trailing positions; this saves memory during generation.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- image inputs should only be forwarded on the first (prefill) step.
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # In cached decoding steps the input ids no longer contain image tokens, so pixel
            # values only need to be passed on the first step.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs


__all__ = ["VipLlavaModel", "VipLlavaForConditionalGeneration", "VipLlavaPreTrainedModel"]