
     `i30                     T   d dl mZmZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZ d	d
lmZ  ej        e          Z G d de	          Z G d de          Z G d dej                  Z G d de
          Z G d de          Z G d de          Zg dZdS )    )OptionalUnionN)nn)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )ACT2FN)Cache)auto_docstringlogging   )VipLlavaConfigc                       e Zd ZdS )VipLlavaModelOutputWithPastN__name__
__module____qualname__     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/vipllava/modular_vipllava.pyr   r   &           Dr   r   c                       e Zd ZdS )VipLlavaCausalLMOutputWithPastNr   r   r   r   r   r   *   r   r   r   c                   *     e Zd Zdef fdZd Z xZS )VipLlavaMultiModalProjectorconfigc                    t                                                       t          |j        t                    rdnt          |j                  }t          j        ||j        j	        z  |j
                  | _        t          j        ||j        j	        z  |j        j	        d          | _        t          |j                 | _        t          j        |j        j	        |j        j	        d          | _        d S )Nr   )epsT)bias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfr    num_feature_layers	__class__s      r   r%   z$VipLlavaMultiModalProjector.__init__/   s    ",V-I3"O"OvQQUXY_YuUvUv#%<!5!AAvGe$
 $
 $
  	!5!AA*
 
 

 &56	&"4"@&BTB`gklllr   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S N)r.   r1   r3   r4   )r5   hidden_statess     r   forwardz#VipLlavaMultiModalProjector.forward>   sN    00??m44//m44r   )r   r   r   r   r%   r;   __classcell__)r7   s   @r   r   r   .   sZ        m~ m m m m m m      r   r   c                       e Zd ZdS )VipLlavaPreTrainedModelNr   r   r   r   r>   r>   F   r   r   r>   c                      e Zd Z	 ddej        deeeee         f                  fdZ	e
	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 dee         d	eej                 deeeee         f                  d
ee         dee         dee         dee         deej                 deeef         fd            ZdS )VipLlavaModelNpixel_valuesr'   c                 &   ||n| j         j        }|                     |d          t          |t                    rj        |         ddddf         }n$fd|D             }t          j        |d          }|                     |          }|S )aW  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, list[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        NT)output_hidden_statesr   c                 B    g | ]}j         |         d d dd f         S )Nr   )r:   ).0indeximage_outputss     r   
<listcomp>z4VipLlavaModel.get_image_features.<locals>.<listcomp>e   s2    kkkEm9%@ABBGkkkr   )dim)	r    r'   vision_towerr&   r(   r:   torchcatmulti_modal_projector)r5   rA   r'   image_featuresrG   s       @r   get_image_featuresz VipLlavaModel.get_image_featuresK   s      &;%F!!DKLm 	 )),T)RR +S11 	?*89NOPQPQPQSTSUSUPUVNN lkkkUjkkkN"Y~2>>>N33NCCr   	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsrC   return_dictcache_positionreturnc                 t   |	|	n| j         j        }	|
|
n| j         j        }
||n| j         j        }||n| j         j        }|du |duz  rt          d          | |                                 |          }|e|                     ||          }|                    |j	        |j
                  }|                     |||          }|                    ||          } | j        d||||||	|
d|d	|}t          |j        |j        |j        |j        ||nd          }|r|n|                                S )	z
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embedsrA   r'   )rU   rO   T)	rR   rS   rT   rU   rV   rW   rC   rX   rY   )last_hidden_staterT   r:   
attentionsimage_hidden_statesr   )r    rW   rC   use_return_dictr'   
ValueErrorget_input_embeddingsrP   todevicedtypeget_placeholder_maskmasked_scatterlanguage_modelr   r]   rT   r:   r^   to_tuple)r5   rQ   rA   rR   rS   rT   rU   r'   rV   rW   rC   rX   rY   	lm_kwargsrO   special_image_maskoutputsoutputs                     r   r;   zVipLlavaModel.forwardj   s   , 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]%:%F!!DKLm 	 -t";< 	[YZZZ 7D5577	BBM#!44)AV 5  N ,..}/C]EXYYN!%!:!:~ "; " " *889K^\\M%$% 
)%+'/!5)
 
 
 
 -%7#3!/)2>2JPT
 
 
 %;vv&//*;*;;r   r9   )NNNNNNNNNNNN)r   r   r   rL   FloatTensorr   r   r(   listrP   r   
LongTensorTensorr   booltupler   r;   r   r   r   r@   r@   J   s       hl !-FNuUXZ^_bZcUcOdFe   >  15481537+/59AE$(,0/3&*59B< B<E,-B< u01B< !.	B<
 u/0B< "%B<   12B<  (c49n(=>B< D>B< $D>B< 'tnB< d^B< !!12B< 
u11	2B< B< B< ^B< B< B<r   r@   c            !          e Zd Z	 ddej        deeeee         f                  fdZ		 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej
                 deej                 deej                 deej
                 d	ee         d
eej                 deeeee         f                  deej
                 dee         dee         dee         dee         deej
                 deeej        f         deeef         fdZdS ) VipLlavaForConditionalGenerationNrA   r'   c                 :    | j                             ||          S )Nr\   )modelrP   )r5   rA   r'   s      r   rP   z3VipLlavaForConditionalGeneration.get_image_features   s      z,,,^s,tttr   r   rQ   rR   rS   rT   rU   labelsrV   rW   rC   rX   rY   logits_to_keeprZ   c                    |
|
n| j         j        }
||n| j         j        }||n| j         j        }||n| j         j        } | j        d|||||||	||
|d|d|}|d         }t          |t                    rt          | d          n|}| 	                    |dd|ddf                   }d}|'| 
                    ||| j         j        j                  }t          |||j        |j        |j        |j                  S )a  
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```NT)rQ   rA   rR   rS   rT   rU   rV   r'   rW   rC   rX   rY   r   )logitsrx   
vocab_size)lossr{   rT   r:   r^   r_   r   )r    rW   rC   r`   r'   rw   r&   r(   slicelm_headloss_functionr0   r|   r   rT   r:   r^   r_   )r5   rQ   rA   rR   rS   rT   rU   r'   rx   rV   rW   rC   rX   rY   ry   rj   rl   r:   slice_indicesr{   r}   s                        r   r;   z(VipLlavaForConditionalGeneration.forward   si   h 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]%:%F!!DKLm 	 $* 
%)%+'"7/!5)
 
 
 
   
8B>SV8W8Wk~ot444]kmAAA}aaa,?@AA%%VFt{OfOq%rrD-#3!/) ' ;
 
 
 	
r   r9   )NNNNNNNNNNNNNr   )r   r   r   rL   rn   r   r   r(   ro   rP   rp   rq   r   rr   rs   r   r;   r   r   r   ru   ru      s       hlu u!-uFNuUXZ^_bZcUcOdFeu u u u 15481537+/59AE-1$(,0/3&*5934]
 ]
E,-]
 u01]
 !.	]

 u/0]
 "%]
   12]
  (c49n(=>]
 )*]
 D>]
 $D>]
 'tn]
 d^]
 !!12]
 c5</0]
" 
u44	5#]
 ]
 ]
 ]
 ]
 ]
r   ru   )r@   ru   r>   )typingr   r   rL   r   (transformers.models.llava.modeling_llavar   r   r   r	   r
   activationsr   cache_utilsr   utilsr   r   configuration_vipllavar   
get_loggerr   loggerr   r   Moduler   r>   r@   ru   __all__r   r   r   <module>r      s
    # " " " " " " "                     " ! ! ! ! !             , , , , , , , , 2 2 2 2 2 2 
	H	%	%	 	 	 	 	": 	 	 		 	 	 	 	%@ 	 	 	    ")   0	 	 	 	 	2 	 	 	c< c< c< c< c<J c< c< c<Lc
 c
 c
 c
 c
'D c
 c
 c
L [
Z
Zr   