
     `iw                       d dl Z d dlmZ d dlmZmZmZmZ d dlZd dlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1  e%j2        e3          Z4 G d de	j5                  Z6e# G d de                      Z7	 dHde	j5        dej8        dej8        dej8        deej8                 de9de9fd Z: G d! d"e	j5                  Z; G d# d$e	j5                  Z< G d% d&e          Z= G d' d(e	j5                  Z> G d) d*e7          Z? G d+ d,e	j5                  Z@ G d- d.e	j5                  ZA G d/ d0e	j5                  ZB G d1 d2e	j5                  ZC G d3 d4e	j5                  ZD G d5 d6e          ZE G d7 d8e	j5                  ZF G d9 d:e	j5                  ZG G d; d<e7          ZHe e#d=>           G d? d@e!                                  ZI e#dA>           G dB dCe7                      ZJ e#dD>           G dE dFe7e                      ZKg dGZLdS )I    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)OutputRecordercheck_model_inputs   )	AutoModelAutoModelForCausalLMAutoModelForSeq2SeqLM   )InstructBlipVideoConfigInstructBlipVideoQFormerConfigInstructBlipVideoVisionConfigc                   z     e Zd Zdef fdZdej        dededej        fdZdd	ej	        d
e
dej        fdZ xZS )!InstructBlipVideoVisionEmbeddingsconfigc                    t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        dd| j                            | _        t          j        d| j        | j        | j                  | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        t          j
        d| j        | j                            | _        d S )Nr$   r	   )in_channelsout_channelskernel_sizestrider    )super__init__r*   hidden_size	embed_dim
image_size
patch_sizer   	Parametertorchrandnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingselfr*   	__class__s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/instructblipvideo/modeling_instructblipvideo.pyr1   z*InstructBlipVideoVisionEmbeddings.__init__8   s    + + +!|EK1dn,M,MNN!yDO\`\k 
  
  
 !Ot>1D!-1"$,u{1d>PRVR`/a/a"b"b    
embeddingsheightwidthreturnc                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        z  }	|| j        z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r$   Ng      ?r   r	   r    bicubicF)sizemodealign_cornersdim)shaper>   r7   jit
is_tracingr5   r   reshapepermuter   
functionalinterpolateviewcat)r@   rD   rE   rF   r<   r=   class_pos_embedpatch_pos_embedrO   
new_height	new_widthsqrt_num_positionss               rB   interpolate_pos_encodingz:InstructBlipVideoVisionEmbeddings.interpolate_pos_encodingJ   sr    !&q)A-/5a81< y##%% 	++*F*F6UZ??**1!!!RaR%81!!!QRR%8r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCrC   Fpixel_valuesr^   c                 (   |j         \  }}}}| j        j        j        }|                     |                    |                    }|                    d                              dd          }| j                            |dd                              |          }	t          j
        |	|gd          }
|r|                     |
||          }n| j        }|
|d d d |
                    d          d d f                             |          z   }
|
S )Ndtyper    r$   rI   rN   )rP   r;   weightrb   toflatten	transposer9   expandr7   rX   r^   r>   rK   )r@   r_   r^   
batch_size_rE   rF   target_dtypepatch_embedsclass_embedsrD   r>   s               rB   forwardz)InstructBlipVideoVisionEmbeddings.forwardr   s   '3'9$
Avu+28++LOO,O,O,OPP#++A..88A>>+22:q"EEHHVVYl;CCC
# 	9!%!>!>z6SX!Y!Y!%!8"4QQQ8L*//!:L:L8Laaa5O"P"S"ST`"a"aa
rC   F)__name__
__module____qualname__r'   r1   r7   Tensorintr^   FloatTensorboolrm   __classcell__rA   s   @rB   r)   r)   7   s        c< c c c c c c$&D5< &D &DUX &D]b]i &D &D &D &DP E$5 QU bgbn        rC   r)   c                   D    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZg dZd ZdS ) InstructBlipVideoPreTrainedModelr*   blipT)"InstructBlipVideoQFormerEmbeddingsInstructBlipVideoAttention*InstructBlipVideoQFormerMultiHeadAttention"InstructBlipVideoQFormerSelfOutputc                 l   | j         j        }t          |t          j        t          j        f          rJ|j        j                            d|           |j	         |j	        j        
                                 dS dS t          |t          j                  r#|j        j                            d|           dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS t          |t                    rPt          j                            |j        d|           t          j                            |j        d|           dS t          |t&          t(          f          r |j        j        
                                 dS dS )zInitialize the weights        )meanstdN      ?)r*   initializer_range
isinstancer   Linearr:   rc   datanormal_biaszero_	Embedding	LayerNormfill_r)   inittrunc_normal_r>   r9   )InstructBlipVideoForConditionalGenerationInstructBlipVideoModelquery_tokens)r@   modulefactors      rB   _init_weightsz.InstructBlipVideoPreTrainedModel._init_weights   s   .fry")455 	-M&&CV&<<<{& &&((((( '&-- 		-M&&CV&<<<<<-- 	-K""$$$M$$S))))) ABB 	-G!!&";#6!RRRG!!&"8s!OOOOO!JLb cdd 	-$**,,,,,	- 	-rC   N)ro   rp   rq   r%   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_no_split_modulesr    rC   rB   ry   ry      sl         ####&*#"&N!  - - - - -rC   ry   r   r   querykeyvalueattention_maskscalingdropoutc                 z   t          j        ||                    dd                    |z  }|||z   }t          j                            |d          }t          j                            ||| j                  }t          j        ||          }	|	                    dd                                          }	|	|fS )NrI   rN   )ptrainingr$   r    )	r7   matmulrf   r   rU   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             rB   eager_attention_forwardr      s     <s}}R'<'<==GL!#n4=((2(>>L=((6?([[L,|U33K''1--88::K$$rC   c                        e Zd ZdZ fdZdej        dedefdZ	 ddej        d	e	ej                 d
e
ej        e	ej                 e	e
ej                          f         fdZ xZS )r|   z=Multi-headed attention from 'Attention Is All You Need' paperc                 V   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        d| _
        |j        | _        t          j        | j        d| j        z  d          | _        |j        rWt          j        t#          j        | j                            }t          j        t#          j        | j                            }nd }d }|It#          j        |t#          j        |d          |f          }t          j        |          | j        _        t          j        | j        | j                  | _        d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr	   )r   )requires_grad)r0   r1   r*   r2   r3   num_attention_heads	num_headshead_dim
ValueErrorscale	is_causalattention_dropoutr   r   qkvqkv_biasr6   r7   zerosrX   
zeros_liker   
projection)r@   r*   q_biasv_biasr   rA   s        rB   r1   z#InstructBlipVideoAttention.__init__   s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
!'!9 9T^Q-?eLLL? 	\%+dn"="=>>F\%+dn"="=>>FFFFy&%*:6QV*W*W*WY_!`aaHL22DHM)DNDNCCrC   tensorseq_lenbszc                     |                     ||| j        | j                                      dd                                          S )Nr$   r    )rW   r   r   rf   r   )r@   r   r   r   s       rB   _shapez!InstructBlipVideoAttention._shape   s<    {{3GGQQRSUVWWbbdddrC   Nhidden_states	head_maskrG   c                 2   |                                 \  }}}|                     |          }|                    ||d| j        || j        z                                ddddd          }|d         |d         |d         }
}	}t
          }| j        j        dk    rt          | j        j                 } || ||	|
fd| j	        sdn| j
        | j        d	|\  }}|                    ||d
                                          }|                     |          }||fS )z#Input shape: Batch x Time x Channelr	   r    r   r$      eagerNr   )r   r   r   rI   )rK   r   rS   r   rT   r   r*   _attn_implementationr   r   r   r   r   r   )r@   r   r   r   r   tgt_lenr3   	mixed_qkvquery_states
key_statesvalue_statesattention_interfacer   r   s                 rB   rm   z"InstructBlipVideoAttention.forward   sC    #0"4"4"6"6WiHH]++	%%c7At~yTXTbGbcckkq!Q
 
	 2;1y|YWX\,j(?;+w66"9$+:Z"[$7$7		%

  #}HCC$2HJ	%
 	%
 	%
 	%
!\ "))#w;;FFHHook22L((rC   N)ro   rp   rq   __doc__r1   r7   rr   rs   r   r   tuplerm   rv   rw   s   @rB   r|   r|      s        GGD D D D D>eU\ eC ec e e e e -1$) $)|$) EL)$)
 
u|Xel3XeEL>Q5RR	S$) $) $) $) $) $) $) $)rC   r|   c                   B     e Zd Z fdZdej        dej        fdZ xZS )InstructBlipVideoMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S r   )r0   r1   r*   r
   
hidden_actactivation_fnr   r   r2   intermediate_sizefc1fc2r?   s     rB   r1   zInstructBlipVideoMLP.__init__  sf    #F$569V/1IJJ9V5v7IJJrC   r   rG   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   r@   r   s     rB   rm   zInstructBlipVideoMLP.forward  s=    //**=99//rC   ro   rp   rq   r1   r7   rr   rm   rv   rw   s   @rB   r   r     sc        K K K K KU\ el        rC   r   c            	       v     e Zd Zdef fdZedej        dej        dee	         dej
        fd            Z xZS )InstructBlipVideoEncoderLayerr*   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S Neps)r0   r1   r2   r3   r|   	self_attnr   r   layer_norm_epslayer_norm1r   mlplayer_norm2r?   s     rB   r1   z&InstructBlipVideoEncoderLayer.__init__  s    +3F;;<F<QRRR'//<F<QRRRrC   r   r   r   rG   c                     |}|                      |          } | j        d||d|\  }}||z   }|}|                     |          }|                     |          }||z   }|S )N)r   r   r   )r   r   r   r   )r@   r   r   r   residualri   s         rB   rm   z%InstructBlipVideoEncoderLayer.forward$  s     !((77)4> 
'$
 
 
 
q
 &0 ((77//%0rC   )ro   rp   rq   r%   r1   r   r7   rr   r   r   rt   rm   rv   rw   s   @rB   r   r     s        S6 S S S S S S |  +,	
 
	   ^    rC   r   c            
            e Zd ZdZdef fdZe	 d	deej	                 de
e         deeef         fd            Z xZS )
InstructBlipVideoEncodera"  
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`InstructBlipVideoEncoderLayer`].

    Args:
        config (`InstructBlipVideoConfig`):
            The corresponding vision configuration for the `InstructBlipVideoEncoder`.
    r*   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r   )r   ).0ri   r*   s     rB   
<listcomp>z5InstructBlipVideoEncoder.__init__.<locals>.<listcomp>J  s"    $t$t$tq%B6%J%J$t$t$trC   F)	r0   r1   r*   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr?   s    `rB   r1   z!InstructBlipVideoEncoder.__init__G  sb    m$t$t$t$tTYZ`ZrTsTs$t$t$tuu&+###rC   Nr   r   rG   c                 P    |}| j         D ]} ||fd|i|}t          |          S )Nr   last_hidden_state)r   r   )r@   inputs_embedsr   r   r   encoder_layers         rB   rm   z InstructBlipVideoEncoder.forwardM  sZ     &![ 	 	M)M -  MM ????rC   r   )ro   rp   rq   r   r%   r1   r   r   r7   rr   r   r   r   r   r   rm   rv   rw   s   @rB   r   r   =  s         ,6 , , , , , ,  26@ @ !.@ +,	@
 
uo%	&@ @ @ ^@ @ @ @ @rC   r   c                        e Zd ZU dZeed<   eedZdef fdZ	 e
d          e	 	 ddeej                 ded	ee         d
eeef         fd                        Zd Z xZS )InstructBlipVideoVisionModelr_   r*   )r   
attentionsc                    t                                          |           || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        |                                  d S r   )r0   r1   r*   r2   r)   rD   r   encoderr   r   r   post_layernorm	post_init)r@   r*   r3   rA   s      rB   r1   z%InstructBlipVideoVisionModel.__init__g  sx       &	;FCC/77 l9&:OPPPrC   F)tie_last_hidden_statesNr^   r   rG   c                    |t          d          |                     ||          } | j        dd|i|}|j        }|                     |          }|d d dd d f         }|                     |          }t          ||          S )Nz You have to specify pixel_values)r^   r   r   r   pooler_outputr   )r   rD   r  r   r  r   )r@   r_   r^   r   r   encoder_outputsr   pooled_outputs           rB   rm   z$InstructBlipVideoVisionModel.forwardr  s     ?@@@Oghh+74< ,
 ,
',
,
 ,

 ,= //0ABB)!!!Q'2++M::)/'
 
 
 	
rC   c                     | j         S r   )rD   r@   s    rB   get_input_embeddingsz1InstructBlipVideoVisionModel.get_input_embeddings  s
    rC   )NF)ro   rp   rq   main_input_namer'   r   r   r|   _can_record_outputsr1   r   r   r   r7   rt   ru   r   r   r   r   r   rm   r  rv   rw   s   @rB   r   r   _  s         $O))))60 
	< 	 	 	 	 	 	 u555 59).
 
u01
 #'
 +,	

 
u00	1
 
 
 ^ 65
6      rC   r   c                   `     e Zd Zd fd	Zd Zd Zd Zd Zd Z	 	 	 	 dd	e	e
         fd
Z xZS )r}   Fc                    t                                                       || _        |j        |j        z  dk    r.t          |d          st          d|j        |j        fz            |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        t          j        |j        | j	                  | _        |rIt          j        |j        | j	                  | _        t          j        |j        | j	                  | _        nHt          j        |j        | j	                  | _        t          j        |j        | j	                  | _        t          j        |j                  | _        t'          |dd          | _        | j        dk    s| j        dk    r6|j        | _        t          j        d|j        z  d	z
  | j                  | _        d
| _        d S )Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)position_embedding_typeabsoluterelative_keyrelative_key_queryr    r$   F)r0   r1   r*   r2   r   hasattrr   rs   attention_head_sizeall_head_sizer   r   r   encoder_hidden_sizer   r   Dropoutattention_probs_dropout_probr   getattrr  max_position_embeddingsr   distance_embeddingsave_attentionr@   r*   is_cross_attentionrA   s      rB   r1   z3InstructBlipVideoQFormerMultiHeadAttention.__init__  s    ::a??PVXhHiHi?^%v'ABC  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
 	Ky!;T=OPPDH6#=t?QRRDJJy!3T5GHHDH6#5t7IJJDJz&"EFF'.v7PR\']']$'>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD##rC   c                     || _         d S r   attn_gradients)r@   r%  s     rB   save_attn_gradientsz>InstructBlipVideoQFormerMultiHeadAttention.save_attn_gradients  s    ,rC   c                     | j         S r   r$  r  s    rB   get_attn_gradientsz=InstructBlipVideoQFormerMultiHeadAttention.get_attn_gradients  s    ""rC   c                     || _         d S r   attention_map)r@   r+  s     rB   save_attention_mapz=InstructBlipVideoQFormerMultiHeadAttention.save_attention_map  s    *rC   c                     | j         S r   r*  r  s    rB   get_attention_mapz<InstructBlipVideoQFormerMultiHeadAttention.get_attention_map  s    !!rC   c                     |                                 d d         | j        | j        fz   } |j        | }|                    dddd          S )NrI   r   r    r$   r	   )rK   r   r  rW   rT   )r@   xnew_x_shapes      rB   transpose_for_scoresz?InstructBlipVideoQFormerMultiHeadAttention.transpose_for_scores  sM    ffhhssmt'?AY&ZZAFK yyAq!$$$rC   Nr   c                    |d u}|rS|                      |                     |                    }|                      |                     |                    }	|}nP|                      |                     |                    }|                      |                     |                    }	|                     |          }
|                      |
          }t	          j        ||                    dd                    }| j        dk    s| j        dk    r4|                                d         }t	          j	        |t          j
        |j                                      dd          }t	          j	        |t          j
        |j                                      dd          }||z
  }|                     || j        z   dz
            }|                    |j                  }| j        dk    rt	          j        d||          }||z   }n?| j        dk    r4t	          j        d||          }t	          j        d	||          }||z   |z   }|t%          j        | j                  z  }|j        }|||z   } t+          j        d
          |                              |          }|r6| j        r/|                     |           |                    | j                   |                     |          }|||z  }t	          j        ||	          }|                    dddd                                          }|                                d d         | j        fz   } |j        | }||fS )NrI   r   r  r  r$   rb   devicera   zbhld,lrd->bhlrzbhrd,lrd->bhlrrN   r   r    r	   )r2  r   r   r   r7   r   rf   r  rK   arangelongr5  rW   r  r  rd   rb   einsummathsqrtr  r   Softmaxr   r,  register_hookr&  r   rT   r   r  )r@   r   r   r   encoder_hidden_statesencoder_attention_maskr   r"  	key_layervalue_layermixed_query_layerquery_layerattention_scores
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_scores_dtypeattention_probsattention_probs_droppedcontext_layernew_context_layer_shapes                             rB   rm   z2InstructBlipVideoQFormerMultiHeadAttention.forward  sx    3$> 	O11$((;P2Q2QRRI33DJJ?T4U4UVVK3NN11$((=2I2IJJI33DJJ}4M4MNNK JJ}55//0ABB !<Y5H5HR5P5PQQ'>99T=Y]q=q=q&++--a0J"\*EJ}OcdddiijlnoppN"\*EJ}OcdddiijkmoppN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s +di8P.Q.QQ!1!7%/.@ -"*,,,-=>>AABXYY 	D$"5 	D##O444))$*BCCC #',,"?"?  &=	&I#%<kJJ%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CDo--rC   rn   NNNN)ro   rp   rq   r1   r&  r(  r,  r.  r2  r   r   rm   rv   rw   s   @rB   r}   r}     s        $ $ $ $ $ $8- - -# # #+ + +" " "% % % "#I. I. +,I. I. I. I. I. I. I. I.rC   r}   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )r~   c                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S r   )r0   r1   r   r   r2   denser   r   r  hidden_dropout_probr   r?   s     rB   r1   z+InstructBlipVideoQFormerSelfOutput.__init__  sf    Yv163EFF
f&8f>STTTz&"<==rC   r   input_tensorrG   c                     |                      |          }|                     |          }|                     ||z             }|S r   rT  r   r   r@   r   rV  s      rB   rm   z*InstructBlipVideoQFormerSelfOutput.forward  @    

=11]33}|'CDDrC   r   rw   s   @rB   r~   r~     i        > > > > >U\  RWR^        rC   r~   c                        e Zd Zd fd	Zd Z	 	 	 	 ddej        deej                 deej                 deej                 d	eej                 d
e	e
         dej        fdZ xZS )!InstructBlipVideoQFormerAttentionFc                     t                                                       t          ||          | _        t	          |          | _        t                      | _        d S r   )r0   r1   r}   	attentionr~   outputsetpruned_headsr!  s      rB   r1   z*InstructBlipVideoQFormerAttention.__init__  sL    CFL^__8@@EErC   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r$   rN   )lenr   r_  r   r  rb  r   r   r   r   r`  rT  r  union)r@   headsindexs      rB   prune_headsz-InstructBlipVideoQFormerAttention.prune_heads"  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::rC   Nr   r   r   r=  r>  r   rG   c           	      ^     | j         d|||||d|\  }}|                     ||          }	|	S )N)r   r   r   r=  r>  r   )r_  r`  )
r@   r   r   r   r=  r>  r   r   ri   attention_outputs
             rB   rm   z)InstructBlipVideoQFormerAttention.forward4  sZ     ( 
')"7#9
 
 
 
Q  ;;{MBBrC   rn   rQ  )ro   rp   rq   r1   rh  r7   rr   r   rt   r   r   rm   rv   rw   s   @rB   r]  r]    s        " " " " " "; ; ;* 7;15=A>B   |  !!23  E-.	 
  ((9:  !)): ;  +,  
               rC   r]  c                   B     e Zd Z fdZdej        dej        fdZ xZS )$InstructBlipVideoQFormerIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r0   r1   r   r   r2   r   rT  r   r   strr
   intermediate_act_fnr?   s     rB   r1   z-InstructBlipVideoQFormerIntermediate.__init__J  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$rC   r   rG   c                 Z    |                      |          }|                     |          }|S r   )rT  ro  r   s     rB   rm   z,InstructBlipVideoQFormerIntermediate.forwardR  s,    

=1100??rC   r   rw   s   @rB   rl  rl  I  s^        9 9 9 9 9U\ el        rC   rl  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )InstructBlipVideoQFormerOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )r0   r1   r   r   r   r2   rT  r   r   r  rU  r   r?   s     rB   r1   z'InstructBlipVideoQFormerOutput.__init__Y  sf    Yv79KLL
f&8f>STTTz&"<==rC   r   rV  rG   c                     |                      |          }|                     |          }|                     ||z             }|S r   rX  rY  s      rB   rm   z&InstructBlipVideoQFormerOutput.forward_  rZ  rC   r   rw   s   @rB   rr  rr  X  r[  rC   rr  c                   N     e Zd Z fdZ	 	 	 	 	 ddee         fdZd Zd Z xZ	S )	InstructBlipVideoQFormerLayerc                    t                                                       |j        | _        d| _        t	          |          | _        || _        ||j        z  dk    rt	          |d          | _        d| _	        nd| _	        t          |          | _        t          |          | _        t          |          | _        t          |          | _        d S )Nr$   r   T)r"  F)r0   r1   chunk_size_feed_forwardseq_len_dimr]  r_  	layer_idxcross_attention_frequencycrossattentionhas_cross_attentionrl  intermediaterr  r`  intermediate_queryoutput_query)r@   r*   rz  rA   s      rB   r1   z&InstructBlipVideoQFormerLayer.__init__g  s    '-'E$:6BB"v771<<"CF_c"d"d"dD'+D$$',D$@HH4V<<"Fv"N"N:6BBrC   Nr   r   c           
          | j         |f||d|}|dk    r|d d d |d d f         }	| j        r$|t          d           | j        |	f||||d|}	t	          | j        | j        | j        |	          }
|j        d         |k    r`t	          | j	        | j        | j        |d d |d d d f                   
                    |
j                  }t          j        |
|gd          }
n!t	          | j	        | j        | j        |          }
|
S )N)r   r   r   z>encoder_hidden_states must be given for cross-attention layers)r   r   r=  r>  r$   rN   )r_  r}  r   r|  r   feed_forward_chunk_queryrx  ry  rP   feed_forward_chunkrd   r5  r7   rX   )r@   r   r   r   r=  r>  query_lengthr   rj  query_attention_outputlayer_outputlayer_output_texts               rB   rm   z%InstructBlipVideoQFormerLayer.forward{  s    *4>
)
 
 	
 
 !%5aaa,6I%J"' 
(0$%efff)<)<**#1'*?+A* * * *& 5-, &	 L  %a(<77$=+0$$QQQqqq%89	% %
 "\()) "  %y,8I)JPQRRR4',  	 L rC   c                 \    |                      |          }|                     ||          }|S r   )r~  r`  r@   rj  intermediate_outputr  s       rB   r  z0InstructBlipVideoQFormerLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIrC   c                 \    |                      |          }|                     ||          }|S r   )r  r  r  s       rB   r  z6InstructBlipVideoQFormerLayer.feed_forward_chunk_query  s4    "556FGG(()<>NOOrC   NNNNr   )
ro   rp   rq   r1   r   r   rm   r  r  rv   rw   s   @rB   rv  rv  f  s        C C C C C. "#6 6 +,6 6 6 6p  
      rC   rv  c                   R     e Zd Z fdZe	 	 	 	 	 ddee         fd            Z xZS )InstructBlipVideoQFormerEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 0    g | ]}t          |          S r   )rv  )r   rz  r*   s     rB   r   z<InstructBlipVideoQFormerEncoder.__init__.<locals>.<listcomp>  s$    ooo)*69==ooorC   F)	r0   r1   r*   r   r   r   r   layerr   r?   s    `rB   r1   z(InstructBlipVideoQFormerEncoder.__init__  sg    ]oooouU[UmOnOnooo
 

 ',###rC   Nr   r   c                     t          | j        j                  D ]*}| j        |         }	|||         nd }
 |	|||
|f||d|}+t	          |          S )N)r>  r  r   )r   r*   r   r  r   )r@   r   r   r   r=  r>  r  r   ilayer_modulelayer_head_masks              rB   rm   z'InstructBlipVideoQFormerEncoder.forward  s     t{455 	 	A:a=L.7.CillO(L%	
 (>)   MM 9+
 
 
 	
rC   r  )	ro   rp   rq   r1   r   r   r   rm   rv   rw   s   @rB   r  r    s        , , , , ,  "#
 
 +,
 
 
 
 
 
 
 
rC   r  c                   2     e Zd ZdZ fdZ	 	 	 	 ddZ xZS )r{   z;Construct the embeddings from word and position embeddings.c                 *   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j
        |j        |j                  | _        t          j        |j                  | _        |                     dt#          j        |j                                      d          d           t)          |dd          | _        || _        d S )	N)padding_idxr   position_ids)r$   rI   F)
persistentr  r  )r0   r1   r   r   
vocab_sizer2   pad_token_idword_embeddingsr  position_embeddingsr   r   	layernormr  rU  r   register_bufferr7   r6  rg   r  r  r*   r?   s     rB   r1   z+InstructBlipVideoQFormerEmbeddings.__init__  s    !|F,=v?Q_e_rsss#%<0NPVPb#c#c f&8f>STTTz&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 (/v7PR\']']$rC   Nr   c                    ||                                 d         }nd}|(| j        d d |||z   f                                         }|m|                     |          }| j        dk    r2|                     |                    |j                            }||z   }|t          j	        ||fd          }n|}|                    | j
        j        j                  }| 
                    |          }|                     |          }|S )Nr$   r   r  rN   )rK   r  cloner  r  r  rd   r5  r7   rX   r  rc   rb   r   )r@   	input_idsr  query_embedspast_key_values_lengthrD  rD   r  s           rB   rm   z*InstructBlipVideoQFormerEmbeddings.forward  s     "))!,JJJ,QQQ0FVlIl0l-lmssuuL --i88J+z99&*&>&>|zO`?a?a&b&b#'*==
'"Yj'AqIII
%J]]4>#8#>??
^^J//
\\*--
rC   )NNNr   )ro   rp   rq   r   r1   rm   rv   rw   s   @rB   r{   r{     s`        EE    $         rC   r{   c                       e Zd ZdZdZdZdZdZe e	e
dd          g e	e
dd          gdZdef fd	Zd
 Zd Zd Z	 ddej        dee         dej        dedej        f
dZee	 	 	 	 	 	 ddej        deej                 deej                 deej                 deej                 deej                 deej                 dee         deeej                 e f         fd                        Z! xZ"S )InstructBlipVideoQFormerModelz
    Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
    instruction as input.
    Fr$   z
.attention)rg  
layer_namez.crossattention)r   r   cross_attentionsr*   c                     t                                          |           || _        t          |          | _        t          |          | _        |                                  d S r   )r0   r1   r*   r{   rD   r  r  r  r?   s     rB   r1   z&InstructBlipVideoQFormerModel.__init__-  sV       <VDD6v>>rC   c                     | j         j        S r   rD   r  r  s    rB   r  z2InstructBlipVideoQFormerModel.get_input_embeddings7  s    ..rC   c                     || j         _        d S r   r  r@   r   s     rB   set_input_embeddingsz2InstructBlipVideoQFormerModel.set_input_embeddings:  s    */'''rC   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r  r_  rh  )r@   heads_to_pruner  rf  s       rB   _prune_headsz*InstructBlipVideoQFormerModel._prune_heads=  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	CrC   r   input_shaper5  	has_queryrG   c                 *   |                                 dk    r|dddddddf         }nD|                                 dk    r|ddddddf         }nt          d| d|j         d          |                    | j                  }d|z
  d	z  }|S )
a>  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device: (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        r	   Nr    z!Wrong shape for input_ids (shape z) or attention_mask (shape )ra   r   g     )rO   r   rP   rd   rb   )r@   r   r  r5  r  extended_attention_masks         rB   get_extended_attention_maskz9InstructBlipVideoQFormerModel.get_extended_attention_maskE  s    . 1$$&4QQQaaa]&C##!!Q&& '5QQQdAAA5E&F##sKss\j\psss   #:"<"<4:"<"N"N#&)@#@H"L&&rC   Nr  r  r  r   r=  r>  r   c           	      \    ||t          d          ||j        d         nd}	                     |||          }
|
                                dd         }|\  }}|
j        }|t          j        ||f|          }                     |||          }|t          |t                    r|d                                         \  }}}n|                                \  }}}||f}t          |t                    r fd|D             }nF|,t          j        ||          } 
                    |          }n 
                    |          }nd}                     | j        j                  }  j        |
f|||||	d	|}|j        }|dddddf         }t!          ||
          S )a$  
        query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states to be used in the attention computation. If cross-attention,
            will be used for the query (i.e., key and value will use the encoder_hidden_states).
        Nz7You have to specify query_embeds when input_ids is Noner$   r   )r  r  r  rI   )r5  c                 :    g | ]}                     |          S r   )invert_attention_mask)r   maskr@   s     rB   r   z9InstructBlipVideoQFormerModel.forward.<locals>.<listcomp>  s(    2w2w2wX\43M3Md3S3S2w2w2wrC   )r   r   r=  r>  r  r  )r   rP   rD   rK   r5  r7   onesr  r   listr  get_head_maskr*   r   r  r   r   )r@   r  r   r  r  r   r=  r>  r   r  embedding_outputr  rh   rD  r5  r  encoder_batch_sizeencoder_sequence_lengthri   encoder_hidden_shapeencoder_extended_attention_maskr	  sequence_outputr
  s   `                       rB   rm   z%InstructBlipVideoQFormerModel.forwardp  s>   $ !5VWWW0<0H|)!,,a??%% + 
 
 '++--crc2!,
J!(!"Z*j)A6RRRN #'"B"B>S^`f"g"g !,/66 ^AVWXAYA^A^A`A`>"$;QQAVA[A[A]A]>"$;Q$68O#P 0$77 e2w2w2w2w`v2w2w2w//'/).4HQW)X)X)X&262L2LMc2d2d//262L2LMc2d2d//.2+ &&y$+2OPP	+74<,
2"7#B%,
 ,
 ,
 ,
 *;'1aaa0;-'
 
 
 	
rC   rn   )NNNNNN)#ro   rp   rq   r   r   r   r   r   rv  r   r}   r  r&   r1   r  r  r  r7   rr   r   rs   r5  ru   r  r   r   
LongTensorr   rt   r   r   r   r   rm   rv   rw   s   @rB   r  r    s4        
 #( N 7NEQ[ghhh
 NEQ[lmmm
 =      / / /0 0 0C C C  )' )')' 3Z)' 	)'
 )' 
)' )' )' )'V  7;37/315=A>BO
 O
#O
 !!23O
 u/0	O

 u|,O
 E-.O
  ((9:O
 !)): ;O
 +,O
 
uU&')UU	VO
 O
 O
 ^ O
 O
 O
 O
 O
rC   r  zV
    Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
    )custom_introc                      e Zd ZU dZdZeeej                          e	d<   dZ
eeej                          e	d<   dZeej                 e	d<   dZeeej                          e	d<   dZeeej                          e	d<   dee         fd	ZdS )
4InstructBlipVideoForConditionalGenerationModelOutputa  
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Language modeling loss from the language model.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head of the language model.
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
        Outputs of the language model.
    Nlosslogitsvision_outputsqformer_outputslanguage_model_outputsrG   c                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS )r  r  r  N)r  to_tuple)r   kr@   s     rB   	<genexpr>zPInstructBlipVideoForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>  sf       
 
  WWW GGq!!**,,
 
 
 
 
 
rC   )r   keysr  s   `rB   r  z=InstructBlipVideoForConditionalGenerationModelOutput.to_tuple  sE     
 
 
 
 YY[[	
 
 
 
 
 	
rC   )ro   rp   rq   r   r  r   r   r7   rt   r   r  r  r  r  r   r  r   rC   rB   r  r    s           04D(5*+
,33315FHU5,-.55526NHU./666:>OXeE$567>>>AEHU5+<%=>EEE
%* 
 
 
 
 
 
rC   r  z`
    InstructBlipVideo base Model consisting of language model, qformer and vision encoder.
    c            #           e Zd ZdZdgZdef fdZd Zd Zd Z	d Z
d	ej        d
ej        fdZee	 	 	 	 	 	 	 	 	 	 	 ddej        dej        deej                 d	eej                 deej                 deej                 deej                 d
eej                 dee         dee         dee         dedee         dee         deeef         fd                        Z xZS )r   r_   r   r*   c                    t                                          |           t          |j                  | _        t          j        t          j        d|j	        |j
        j                            | _        t          |j
                  | _        t          j        |j
        j        |j        j                  | _        t%          j        |j                  | _        | j        j        $| j                            | j        j                   | j        j        $| j                            | j        j                   |                                  d S Nr$   )r0   r1   r   vision_configvision_modelr   r6   r7   r   num_query_tokensqformer_configr2   r   r  qformerr   text_configlanguage_projectionr!   from_configlanguage_modelr   extend_keep_in_fp32_modulesr  r?   s     rB   r1   zInstructBlipVideoModel.__init__  s      89MNNLQ8OQWQfQr)s)stt4V5JKK#%9V-B-NPVPbPn#o#o '3F4FGG0<"))$*=*OPPP4@&--d.A.WXXX 	rC   c                 4    | j                                         S r   r  r  r  s    rB   r  z+InstructBlipVideoModel.get_input_embeddings      "77999rC   c                 :    | j                             |           d S r   r  r  r  s     rB   r  z+InstructBlipVideoModel.set_input_embeddings      0077777rC   c                     | j         j        s8| j        j        | j        j        _        | j        j        | j        j        _        d S d S r   r*   use_decoder_only_language_modelr  sharedr  embed_tokensdecoderr  s    rB   _tie_weightsz#InstructBlipVideoModel._tie_weights	  J    {: 	R7;7J7QD'47;7J7QD'444	R 	RrC   c                 
   | j         }t          |          dk    r@d|vr<t          j                                        dk    rt
                              d           t          | j        d          rd| j        j	        _
        dS dS z
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        r$   r  a  The `language_model` is not in the `hf_device_map` dictionary and you are running your script in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`. Please pass a `device_map` that contains `language_model` to remove this warning. Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for more details on creating a `device_map` for large models._hf_hookTNhf_device_maprd  r7   cudadevice_countloggerwarningr  r  r  io_same_devicer@   r  s     rB   _preprocess_acceleratez-InstructBlipVideoModel._preprocess_accelerate      
 *}!!&6m&K&KPUPZPgPgPiPilmPmPmNNM   4&
33 	?:>D(777	? 	?rC   r  r   c                 t   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                    d          	                    |          
                    |j                  }|S zZ
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        Nr4  rI   )r  r7   r   r*   image_token_idr7  r5  all	unsqueeze	expand_asrd   r@   r  r   special_image_masks       rB   get_placeholder_maskz+InstructBlipVideoModel.get_placeholder_mask"       !.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;!*dk.H!H/99"==GGVVYYZgZnoo!!rC   NFqformer_input_idsqformer_attention_maskr   decoder_input_idsdecoder_attention_maskoutput_attentionsoutput_hidden_statesreturn_dictr^   	use_cacher   rG   c                    ||n| j         j        }|j        \  }}}}}|                    ||z  |||          }|                     ||	|
||          }|d         }t          j        |                                dd         t
          j        |j	                  }| j
                            |j        d         dd          }t          j        |                                dd         t
          j        |j	                  }|t          j        |          }|                    |d          }|                    |d          }t          j        ||gd          }|                     ||||||	|
|          }|d         ddd|                    d          ddf         }|                     |          }|                    || j         j        |z  d          }|I | j                                        |          }|| j         j        k    }|t          j        |          }nd| |                                 t          j        | j         j        t
          j        |j	                            k    }|                    d          }|                    d                              |                              |j	                  }|                    |j	        |j                  }|                    ||          }| j         j        r | j        d|||	|
||d	|}n | j        d|||||	|
||d
|}t;          |||          S )a  
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
            to serve as text prompt, which the Q-Former model will encode.

            Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
        N)r_   r  r  r  r^   r   rI   r4  rN   r$   )r  r   r  r=  r>  r  r  r  r   r   r  r  r  r  )r   r   r  r  r  r  r  r  r  r   )r*   use_return_dictrP   rS   r  r7   r  rK   r7  r5  r   rg   	ones_likerepeat_interleaverX   r  r  r  r  r  video_token_idr   r  r  r  rd   rb   masked_scatterr  r  )r@   r_   r
  r  r  r   r  r  r   r  r  r  r^   r  r   rh   frameschannelrE   rF   r  image_embedsimage_attention_maskr   query_attention_maskquery_outputsquery_outputlanguage_model_inputsr  outputss                                 rB   rm   zInstructBlipVideoModel.forward1  s   R &1%<kk$+B] 6B5G2
FGVU#++J,?&RWXX**%/!5#%= + 
 
 &a(  %z,*;*;*=*=crc*B%*]i]pqqq (//0B10Er2NN$z,*;*;*=*=crc*B%*]i]pqqq!)%*_5F%G%G"-??A?NN!7!I!I&VW!I!X!X!&,@BX+Y_`!a!a!a'1%".#7/!5# % 	
 	
 %Q'+A\->->q-A-A+A111(DE !% 8 8 F F !6 = =j$+JfioJoqs t t FD/DDFFyQQM!*dk.H!H%!&!;!;!.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;/99"==GGVVYYZgZnoo 5 8 89M}Ob c c%445GI^__;6 	)d) +-"3%9'#   GG *d) 
+-"3'="3%9'#
 
 
 
G D))#*
 
 
 	
rC   )NNNNNNNNNFN)ro   rp   rq   r  r  r%   r1   r  r  r  r  r7   r  rt   r  r   r   r   rr   ru   r   r   r   r   r  rm   rv   rw   s   @rB   r   r     s        %O+,6      &: : :8 8 8R R R
? ? ?("e.> "uO` " " " " 
 >B15598<=A04,0/3&*).$(
 
'
 !,
 !))9 :	

 E-.
 !!12
 $E$45
 !))9 :
  -
 $D>
 'tn
 d^
 #'
 D>
 -.
  
uJJ	K!
 
 
 ^ 
 
 
 
 
rC   r   a  
    InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    c            %           e Zd ZU eed<   dZdZdgZdef fdZd Z	d Z
d Zd	ej        fd
Zd Zd Zd Zd Z	 	 	 d$dej        dej        deej                 dee         dee         f
dZdej        dej        fdZee	 	 	 	 	 	 	 	 	 	 	 	 d%dej        dej        deej                 deej                 deej                 deej                 deej                 deej                 dee         dee         deej                 dee         dedee         d ee         d	eee f         f d!                        Z! ej"                    	 	 	 	 	 	 d&dej        deej                 deej                 deej                 deej                 deej                 ded	ej        fd"            Z#	 	 	 d$dej        dej        deej                 dee         dee         f
d#Z$ xZ%S )'r   r*   r_   Tr   c                    t                                          |           t                              |j                  | _        t          j        t          j	        d|j
        |j        j                            | _        t                              |j                  | _        t          j        |j        j        |j        j                  | _        |j        rt)          j        |j                  }nt-          j        |j                  }|j        | j                            |j                   |j        | j                            |j                   || _        |                                  d S r  )r0   r1   r   _from_configr  r  r   r6   r7   r   r  r  r2   r   r  r  r   r  r  r  r"   r  r#   r   r  r  r  r  )r@   r*   r  rA   s      rB   r1   z2InstructBlipVideoForConditionalGeneration.__init__  s1      8EEfFZ[[LQ8OQWQfQr)s)stt4AA&BWXX#%9V-B-NPVPbPn#o#o 1 	S1=f>PQQNN2>v?QRRN+7")).*JKKK/;&--n.RSSS, 	rC   c                 4    | j                                         S r   r  r  s    rB   r  z>InstructBlipVideoForConditionalGeneration.get_input_embeddings  r  rC   c                 :    | j                             |           d S r   r  r  s     rB   r  z>InstructBlipVideoForConditionalGeneration.set_input_embeddings  r  rC   c                 :    | j                             |           d S r   )r  set_output_embeddings)r@   new_embeddingss     rB   r(  z?InstructBlipVideoForConditionalGeneration.set_output_embeddings  s    11.AAAAArC   rG   c                 4    | j                                         S r   )r  get_output_embeddingsr  s    rB   r+  z?InstructBlipVideoForConditionalGeneration.get_output_embeddings  s    "88:::rC   c                 4    | j                                         S r   )r  get_encoderr  s    rB   r-  z5InstructBlipVideoForConditionalGeneration.get_encoder      "..000rC   c                 4    | j                                         S r   )r  get_decoderr  s    rB   r0  z5InstructBlipVideoForConditionalGeneration.get_decoder  r.  rC   c                     | j         j        s8| j        j        | j        j        _        | j        j        | j        j        _        d S d S r   r  r  s    rB   r  z6InstructBlipVideoForConditionalGeneration._tie_weights  r  rC   c                 
   | j         }t          |          dk    r@d|vr<t          j                                        dk    rt
                              d           t          | j        d          rd| j        j	        _
        dS dS r  r  r  s     rB   r  z@InstructBlipVideoForConditionalGeneration._preprocess_accelerate  r  rC   NFr
  r  r^   r  c                     dS )$  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        Nr   )r@   r_   r
  r  r^   r  s         rB   get_image_featuresz<InstructBlipVideoForConditionalGeneration.get_image_features
  s	     	rC   r  r   c                 t   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                    d          	                    |          
                    |j                  }|S r  )r  r7   r   r*   r  r7  r5  r  r  r  rd   r  s       rB   r  z>InstructBlipVideoForConditionalGeneration.get_placeholder_mask  r	  rC   r   r  r  r  r  labelsr  r   c                    ||n| j         j        }|                     ||||d          \  }}}|s|                                n|}|s|                                n|}| |                                 |          }|t          j        |          }|                    |j        |j	                  }| 
                    ||          }|                    ||          }| j         j        rJ | j        d|||	|
||d|}|r|j        n|d         }d}|  | j        d||| j         j        j        d|}n9 | j        d|||||	|
|||d	|}|r|j        n|d         }|r|j        n|d	         }t'          |||||
          S )a  
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the videWhy is this video funny?o
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```NTr
  r  r^   r  r   r  r   )r  r7  r  )	r   r   r  r  r  r  r  r7  r  r$   )r  r  r  r  r  r   )r*   r  get_video_featuresr  r  r7   r  rd   r5  rb   r  r  r  r  r  loss_functionr  r  r  r  )r@   r_   r
  r  r  r   r  r  r   r  r  r7  r  r^   r  r   r   r  r  r  r!  r  r  s                          rB   rm   z1InstructBlipVideoForConditionalGeneration.forward*  s=   f &1%<kk$+B]?C?V?V/#9%= @W @
 @
<~} ;FY00222>8CV..000 7D5577	BBM!"_Y77N 5 8 89M}Ob c c!66yP]6^^%445GI^__;6 	C)d) +-"3%9'#   G (3BW^^
FD!)t) !&T[=T=_ ci 
 *d) +-"3'="3%9'#   G $/>7<<GAJD'2BW^^
FC))#*
 
 
 	
rC   c                    t          | d          r|                                  |j        d         }	|                     ||||d          \  }
}}||o| j        j        g| j        j        z  dz  }|| j        j        j        gz   }t          j
        |gt          j        |j                  }|                    |	d          } |                                 |          }|t          j        |          }|
                    |j        |j                  }
|                     ||	          }|                    ||
          }||d
}| j        j        j        s||d<    | j        j        di ||}|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        r  r   Tr9  Nr   r4  r$   r:  )r   r   r  r   )r  r  rP   r;  r*   video_token_indexr  r  bos_token_idr7   r   r7  r5  repeatr  r  rd   rb   r  r  r  is_encoder_decodergenerate)r@   r_   r
  r  r  r   r   r^   generate_kwargsrh   r   r  r  video_tokensstart_tokensr  inputsr!  s                     rB   rB  z2InstructBlipVideoForConditionalGeneration.generate  s   D 4)) 	*'')))!'*
?C?V?V/#9%= @W @
 @
<~}    $ =>A]]`aa+t{/F/S.TT!L,uzR^Refff	%,,Z;;	7D5577	BBM!"_Y77N 5 8 89M}Ob c c!66yP]6^^%445GI^__#0NSS")< 	,"+F;.$%.KKK?KKrC   c                    |j         \  }}}}	}
|                    ||z  ||	|
          }|                     ||d          }|d         }t          j        |                                dd         t          j        |j                  }| j        	                    |j         d         dd          }t          j        |                                dd         t          j        |j                  }|t          j
        |          }|                    |d          }|                    |d          }t          j        ||gd          }|                     |||||d	          }|d         ddd|                    d          ddf         }|                     |          }|                    || j        j        |z  d          }|r|||fS |S )
r4  T)r_   r^   r  r   NrI   r4  rN   r$   )r  r   r  r=  r>  r  )rP   rS   r  r7   r  rK   r7  r5  r   rg   r  r  rX   r  r  r*   r  )r@   r_   r
  r  r^   r  rh   r  r  rE   rF   r  r  r  r   r  r  r  r   s                      rB   r;  z<InstructBlipVideoForConditionalGeneration.get_video_features  s   " 6B5G2
FGVU#++J,?&RWXX**%%= + 
 

 &a(  %z,*;*;*=*=crc*B%*]i]pqqq (//0B10Er2NN$z,*;*;*=*=crc*B%*]i]pqqq!)%*_5F%G%G"-??A?NN!7!I!I&VW!I!X!X!&,@BX+Y_`!a!a!a'1%".#7 % 
 
 %Q'+A\->->q-A-A+A111(DE !% 8 8 F F !6 = =j$+JfioJoqs t t 	H(.-GG$$rC   )NFF)NNNNNNNNNNFN)NNNNNF)&ro   rp   rq   r%   r   r  r   r  r1   r  r  r(  r   Moduler+  r-  r0  r  r  r7   rt   r  r   ru   r5  r  r   r   r   r   r   r   r  rm   no_gradrB  r;  rv   rw   s   @rB   r   r     s(         $###$O!+,6      4: : :8 8 8B B B;ry ; ; ; ;1 1 11 1 1R R R
? ? ?0 >B38&+ ' !+ !))9 :	
 #+4. d^   ""e.> "uO` " " " " 
 >B15598<=A59,0/3-1&*).$(N
 N
'N
 !,N
 !))9 :	N

 E-.N
 !!12N
 $E$45N
 !))9 :N
   12N
 $D>N
 'tnN
 )*N
 d^N
 #'N
 D>N
  +,!N
" 
uJJ	K#N
 N
 N
 ^ N
` U]__ 9==A045959).C C'C $E$45C !))9 :	C
 E,-C !!12C   12C #'C 
	C C C _CR >B38&+9% 9%'9% !+9% !))9 :	9%
 #+4.9% d^9% 9% 9% 9% 9% 9% 9% 9%rC   r   )r   ry   r  r   r   )r   )Mr9  dataclassesr   typingr   r   r   r   r7   r   activationsr
   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   utils.genericr   r   autor!   r"   r#   configuration_instructblipvideor%   r&   r'   
get_loggerro   r  rH  r)   ry   rr   floatr   r|   r   r   r   r   r}   r~   r]  rl  rr  rv  r  r{   r  r  r   r   __all__r   rC   rB   <module>r[     sp  ,  ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1        ! ! ! ! ! ! ) ) ) ) ) ) B B B B B B 9 9 9 9 9 9            G F F F F F F F & & & & & & l l l l l l l l l l j j j j j j j j j j j j j j j j ? ? ? ? ? ? ? ? I I I I I I I I I I          
	H	%	%G G G G G	 G G GT #- #- #- #- #- #- #- #-\ % %I%<% 
% <	%
 U\*% % % % % %.I) I) I) I) I) I) I) I)X    29       $>   D@ @ @ @ @ry @ @ @D1 1 1 1 1#C 1 1 1hw. w. w. w. w. w. w. w.t       +  +  +  +  + 	 +  +  + \    29       RY   U U U U U$> U U Up$
 $
 $
 $
 $
bi $
 $
 $
N0 0 0 0 0 0 0 0fi
 i
 i
 i
 i
$D i
 i
 i
X   

 
 
 
 
; 
 
  
:   
F
 F
 F
 F
 F
= F
 F
 
F
R   }% }% }% }% }%0PRa }% }% }%@  rC   