from typing import Optional, Union

import torch

from transformers.models.instructblip.configuration_instructblip import (
    InstructBlipQFormerConfig,
    InstructBlipVisionConfig,
)
from transformers.models.instructblip.modeling_instructblip import (
    InstructBlipForConditionalGeneration,
    InstructBlipForConditionalGenerationModelOutput,
    InstructBlipModel,
    InstructBlipPreTrainedModel,
    InstructBlipQFormerModel,
    InstructBlipVisionModel,
    TransformersKwargs,
)

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...processing_utils import Unpack
from ...utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class InstructBlipVideoVisionConfig(InstructBlipVisionConfig):
    pass


class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig):
    pass


class InstructBlipVideoConfig(PretrainedConfig):
    r"""
    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate an Instructblipvideo model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the Instructblipvideo
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        video_token_index (`int`, *optional*):
            Token index of special video token.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing an InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing an InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize an InstructBlipVideoConfig from an InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

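    >>> # Note (illustrative): composing the config below also ties the Q-Former's cross-attention width to the
    >>> # vision tower, i.e. `qformer_config.encoder_hidden_size` is set to `vision_config.hidden_size` in `__init__`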
    >>> config = InstructBlipVideoConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
    ```"""

    model_type = "instructblipvideo"
    attribute_map = {"video_token_id": "video_token_index"}
    sub_configs = {
        "text_config": AutoConfig,
        "qformer_config": InstructBlipVideoQFormerConfig,
        "vision_config": InstructBlipVideoVisionConfig,
    }

    def __init__(
        self,
        vision_config=None,
        qformer_config=None,
        text_config=None,
        num_query_tokens=32,
        video_token_index=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. initializing the InstructBlipVideoVisionConfig with default values.")

        if qformer_config is None:
            qformer_config = {}
            logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.")

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")

        self.vision_config = InstructBlipVideoVisionConfig(**vision_config)
        self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config)
        text_model_type = text_config.get("model_type", "opt")
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        self.num_query_tokens = num_query_tokens
        self.video_token_index = video_token_index
        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    @classmethod
    def from_vision_qformer_text_configs(
        cls,
        vision_config: InstructBlipVideoVisionConfig,
        qformer_config: InstructBlipVideoQFormerConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        r"""
        Instantiate an [`InstructBlipVideoConfig`] (or a derived class) from an InstructBlipVideo vision model, Q-Former and
        language model configurations.

        Returns:
            [`InstructBlipVideoConfig`]: An instance of a configuration object
        """
        return cls(
            vision_config=vision_config.to_dict(),
            qformer_config=qformer_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )


class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel):
    pass


class InstructBlipVideoVisionModel(InstructBlipVisionModel):
    pass


class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
    pass


class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput):
    pass


class InstructBlipVideoModel(InstructBlipModel):
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the frames through the vision encoder,
        # processed in a batched way by flattening the frame dimension into the batch dimension
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: project the Q-Former output and use it to condition the language model
        language_model_inputs = self.language_projection(query_output)

        # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.video_token_id

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )

        return InstructBlipVideoForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )


class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = False,
    ):
        r"""
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        """
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        image_embeds = vision_outputs[0]

        # forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=True,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # project the Q-Former output into the language model's embedding space
        language_model_inputs = self.language_projection(query_output)

        # unbatch inputs back, each video-frame gets `num_query_tokens` seq length
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if return_dict:
            return language_model_inputs, vision_outputs, query_outputs
        return language_model_inputs

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = False,
    ):
        # video inputs are encoded with `get_video_features` instead
        pass

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.video_token_id

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the video
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # replace the video placeholder tokens with the projected Q-Former embeddings
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
            logits = outputs.logits if return_dict else outputs[0]
            loss = None
            if labels is not None:
                loss = self.loss_function(
                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
                )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                labels=labels,
                use_cache=use_cache,
                **kwargs,
            )
            loss = outputs.loss if return_dict else outputs[0]
            logits = outputs.logits if return_dict else outputs[1]

        return InstructBlipVideoForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: Optional[torch.LongTensor] = None,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
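
        Example (illustrative sketch, reusing the `model`, `processor` and decoded `clip` prepared in the
        [`~InstructBlipVideoForConditionalGeneration.forward`] example above):

        ```python
        >>> inputs = processor(text="What is happening in the video?", images=clip, return_tensors="pt").to(model.device)
        >>> generated_ids = model.generate(**inputs, max_new_tokens=64)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        ```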
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        batch_size = pixel_values.shape[0]
        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )

        if input_ids is None:
            # build a prompt of video placeholder tokens (one per query token and frame) followed by BOS
            video_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4
            start_tokens = video_tokens + [self.config.text_config.bos_token_id]
            input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
            input_ids = input_ids.repeat(batch_size, 1)

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # replace the video placeholder tokens with the projected Q-Former embeddings
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
        if not self.language_model.config.is_encoder_decoder:
            inputs["input_ids"] = input_ids

        outputs = self.language_model.generate(**inputs, **generate_kwargs)

        return outputs


__all__ = [
    "InstructBlipVideoConfig",
    "InstructBlipVideoQFormerConfig",
    "InstructBlipVideoVisionConfig",
    "InstructBlipVideoVisionModel",
    "InstructBlipVideoPreTrainedModel",
    "InstructBlipVideoQFormerModel",
    "InstructBlipVideoModel",
    "InstructBlipVideoForConditionalGeneration",
]
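
# Illustrative shape walk-through (a sketch, not executed by the module; assumes the default
# `num_query_tokens=32` and the fixed 4-frame sampling used above):
#   pixel_values (batch, 4, 3, H, W) -> reshaped to (batch * 4, 3, H, W) for the vision tower,
#   Q-Former output (batch * 4, 32, qformer_hidden) -> projected and regrouped to
#   (batch, 4 * 32 = 128, lm_hidden), which overwrites the 128 video placeholder tokens in the prompt.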