
     `iD                       d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2  e&j3        e4          Z5e e$d           G d de"                                  Z6 G d de
j7                  Z8	 dIde
j7        de	j9        de	j9        de	j9        d ee	j9                 d!e:d"e:fd#Z; G d$ d%e
j7                  Z< G d& d'e
j7                  Z= G d( d)e          Z>e$ G d* d+e                      Z? G d, d-e
j7                  Z@ G d. d/e?          ZA G d0 d1e
j7                  ZB G d2 d3e
j7                  ZC G d4 d5e
j7                  ZD G d6 d7e
j7                  ZE G d8 d9e
j7                  ZF G d: d;e          ZG G d< d=e
j7                  ZH G d> d?e
j7                  ZI G d@ dAe?          ZJ e$dB           G dC dDe?                      ZK e$dE           G dF dGe?e                      ZLg dHZMdS )JzPyTorch InstructBLIP model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)OutputRecordercheck_model_inputs   )	AutoModelAutoModelForCausalLMAutoModelForSeq2SeqLM   )InstructBlipConfigInstructBlipQFormerConfigInstructBlipVisionConfigzQ
    Class defining the outputs of [`InstructBlipForConditionalGeneration`].
    )custom_introc                      e Zd ZU dZdZeeej                          e	d<   dZ
eeej                          e	d<   dZeej                 e	d<   dZeeej                          e	d<   dZeeej                          e	d<   dee         fd	ZdS )
/InstructBlipForConditionalGenerationModelOutputa  
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Language modeling loss from the language model.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head of the language model.
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
        Outputs of the language model.
    Nlosslogitsvision_outputsqformer_outputslanguage_model_outputsreturnc                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS )r-   r.   r/   N)getattrto_tuple).0kselfs     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/instructblip/modeling_instructblip.py	<genexpr>zKInstructBlipForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>J   sf       
 
  WWW GGq!!**,,
 
 
 
 
 
    )tuplekeysr8   s   `r9   r5   z8InstructBlipForConditionalGenerationModelOutput.to_tupleI   sE     
 
 
 
 YY[[	
 
 
 
 
 	
r;   )__name__
__module____qualname____doc__r+   r   r<   torchFloatTensor__annotations__r,   r-   r.   r/   r   r5    r;   r9   r*   r*   .   s           04D(5*+
,33315FHU5,-.55526NHU./666:>OXeE$567>>>AEHU5+<%=>EEE
%* 
 
 
 
 
 
r;   r*   c                   z     e Zd Zdef fdZdej        dededej        fdZdd	ej	        d
e
dej        fdZ xZS )InstructBlipVisionEmbeddingsconfigc                    t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        dd| j                            | _        t          j        d| j        | j        | j                  | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        t          j
        d| j        | j                            | _        d S )Nr$   r	   )in_channelsout_channelskernel_sizestrider    )super__init__rI   hidden_size	embed_dim
image_size
patch_sizer   	ParameterrC   randnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingr8   rI   	__class__s     r9   rP   z%InstructBlipVisionEmbeddings.__init__T   s    + + +!|EK1dn,M,MNN!yDO\`\k 
  
  
 !Ot>1D!-1"$,u{1d>PRVR`/a/a"b"br;   
embeddingsheightwidthr0   c                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        z  }	|| j        z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r$   Ng      ?r   r	   r    bicubicF)sizemodealign_cornersdim)shaper\   rC   jit
is_tracingrT   r   reshapepermuter   
functionalinterpolateviewcat)r8   r_   r`   ra   rZ   r[   class_pos_embedpatch_pos_embedri   
new_height	new_widthsqrt_num_positionss               r9   interpolate_pos_encodingz5InstructBlipVisionEmbeddings.interpolate_pos_encodingf   sr    !&q)A-/5a81< y##%% 	++*F*F6UZ??**1!!!RaR%81!!!QRR%8r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr;   Fpixel_valuesrx   c                 (   |j         \  }}}}| j        j        j        }|                     |                    |                    }|                    d                              dd          }| j                            |dd                              |          }	t          j
        |	|gd          }
|r|                     |
||          }n| j        }|
|d d d |
                    d          d d f                             |          z   }
|
S )Ndtyper    r$   rc   rh   )rj   rY   weightr|   toflatten	transposerW   expandrC   rr   rx   r\   re   )r8   ry   rx   
batch_size_r`   ra   target_dtypepatch_embedsclass_embedsr_   r\   s               r9   forwardz$InstructBlipVisionEmbeddings.forward   s   '3'9$
Avu+28++LOO,O,O,OPP#++A..88A>>+22:q"EEHHVVYl;CCC
# 	9!%!>!>z6SX!Y!Y!%!8"4QQQ8L*//!:L:L8Laaa5O"P"S"ST`"a"aa
r;   F)r?   r@   rA   r'   rP   rC   Tensorintrx   rD   boolr   __classcell__r^   s   @r9   rH   rH   S   s        c7 c c c c c c$&D5< &D &DUX &D]b]i &D &D &D &DP E$5 QU bgbn        r;   rH           modulequerykeyvalueattention_maskscalingdropoutc                 z   t          j        ||                    dd                    |z  }|||z   }t          j                            |d          }t          j                            ||| j                  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nrc   rh   )ptrainingr$   r    )	rC   matmulr   r   ro   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r9   eager_attention_forwardr      s     <s}}R'<'<==GL!#n4=((2(>>L=((6?([[L,|U33K''1--88::K$$r;   c                        e Zd ZdZ fdZdej        dedefdZ	 ddej        d	e	ej                 d
e
ej        e	ej                 e	e
ej                          f         fdZ xZS )InstructBlipAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                 V   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        d| _
        |j        | _        t          j        | j        d| j        z  d          | _        |j        rWt          j        t#          j        | j                            }t          j        t#          j        | j                            }nd }d }|It#          j        |t#          j        |d          |f          }t          j        |          | j        _        t          j        | j        | j                  | _        d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr	   )bias)requires_grad)rO   rP   rI   rQ   rR   num_attention_heads	num_headshead_dim
ValueErrorscale	is_causalattention_dropoutr   Linearqkvqkv_biasrU   rC   zerosrr   
zeros_liker   
projection)r8   rI   q_biasv_biasr   r^   s        r9   rP   zInstructBlipAttention.__init__   s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
!'!9 9T^Q-?eLLL? 	\%+dn"="=>>F\%+dn"="=>>FFFFy&%*:6QV*W*W*WY_!`aaHL22DHM)DNDNCCr;   tensorseq_lenbszc                     |                     ||| j        | j                                      dd                                          S )Nr$   r    )rq   r   r   r   r   )r8   r   r   r   s       r9   _shapezInstructBlipAttention._shape   s<    {{3GGQQRSUVWWbbdddr;   Nhidden_states	head_maskr0   c                 2   |                                 \  }}}|                     |          }|                    ||d| j        || j        z                                ddddd          }|d         |d         |d         }
}	}t
          }| j        j        dk    rt          | j        j                 } || ||	|
fd| j	        sdn| j
        | j        d	|\  }}|                    ||d
                                          }|                     |          }||fS )z#Input shape: Batch x Time x Channelr	   r    r   r$      eagerNr   )r   r   r   rc   )re   r   rm   r   rn   r   rI   _attn_implementationr   r   r   r   r   r   )r8   r   r   r   r   tgt_lenrR   	mixed_qkvquery_states
key_statesvalue_statesattention_interfacer   r   s                 r9   r   zInstructBlipAttention.forward   sC    #0"4"4"6"6WiHH]++	%%c7At~yTXTbGbcckkq!Q
 
	 2;1y|YWX\,j(?;+w66"9$+:Z"[$7$7		%

  #}HCC$2HJ	%
 	%
 	%
 	%
!\ "))#w;;FFHHook22L((r;   N)r?   r@   rA   rB   rP   rC   r   r   r   r   r<   r   r   r   s   @r9   r   r      s        GGD D D D D>eU\ eC ec e e e e -1$) $)|$) EL)$)
 
u|Xel3XeEL>Q5RR	S$) $) $) $) $) $) $) $)r;   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )InstructBlipMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S r   )rO   rP   rI   r
   
hidden_actactivation_fnr   r   rQ   intermediate_sizefc1fc2r]   s     r9   rP   zInstructBlipMLP.__init__  sf    #F$569V/1IJJ9V5v7IJJr;   r   r0   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   r8   r   s     r9   r   zInstructBlipMLP.forward  s=    //**=99//r;   r?   r@   rA   rP   rC   r   r   r   r   s   @r9   r   r     sc        K K K K KU\ el        r;   r   c            	       v     e Zd Zdef fdZedej        dej        dee	         dej
        fd            Z xZS )InstructBlipEncoderLayerrI   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S Neps)rO   rP   rQ   rR   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r]   s     r9   rP   z!InstructBlipEncoderLayer.__init__  s    +.v66<F<QRRR"6**<F<QRRRr;   r   r   r   r0   c                     |}|                      |          } | j        d||d|\  }}||z   }|}|                     |          }|                     |          }||z   }|S )N)r   r   rF   )r   r   r   r   )r8   r   r   r   residualr   s         r9   r   z InstructBlipEncoderLayer.forward  s     !((77)4> 
'$
 
 
 
q
 &0 ((77//%0r;   )r?   r@   rA   r%   rP   r   rC   r   r   r   rD   r   r   r   s   @r9   r   r     s        S1 S S S S S S |  +,	
 
	   ^    r;   r   c                   D    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZg dZd ZdS )InstructBlipPreTrainedModelrI   blipT)InstructBlipQFormerEmbeddingsr   %InstructBlipQFormerMultiHeadAttentionInstructBlipQFormerSelfOutputc                 l   | j         j        }t          |t          j        t          j        f          rJ|j        j                            d|           |j	         |j	        j        
                                 dS dS t          |t          j                  r#|j        j                            d|           dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS t          |t                    rPt          j                            |j        d|           t          j                            |j        d|           dS t          |t&          t(          f          r |j        j        
                                 dS dS )zInitialize the weightsr   )meanstdN      ?)rI   initializer_range
isinstancer   r   rX   r}   datanormal_r   zero_	Embeddingr   fill_rH   inittrunc_normal_r\   rW   $InstructBlipForConditionalGenerationInstructBlipModelquery_tokens)r8   r   factors      r9   _init_weightsz)InstructBlipPreTrainedModel._init_weightsH  s   .fry")455 	-M&&CV&<<<{& &&((((( '&-- 		-M&&CV&<<<<<-- 	-K""$$$M$$S))))) <== 	-G!!&";#6!RRRG!!&"8s!OOOOO!EGX YZZ 	-$**,,,,,	- 	-r;   N)r?   r@   rA   r%   rE   base_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_no_split_modulesr   rF   r;   r9   r   r   5  sl         &*#"&N!  - - - - -r;   r   c            
            e Zd ZdZdef fdZe	 d	deej	                 de
e         deeef         fd            Z xZS )
InstructBlipEncodera  
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`InstructBlipEncoderLayer`].

    Args:
        config (`InstructBlipConfig`):
            The corresponding vision configuration for the `InstructBlipEncoder`.
    rI   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rF   )r   )r6   r   rI   s     r9   
<listcomp>z0InstructBlipEncoder.__init__.<locals>.<listcomp>j  s"    $o$o$o!%=f%E%E$o$o$or;   F)	rO   rP   rI   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr]   s    `r9   rP   zInstructBlipEncoder.__init__g  sa    m$o$o$o$ouU[UmOnOn$o$o$opp&+###r;   Nr   r   r0   c                 P    |}| j         D ]} ||fd|i|}t          |          S )Nr   last_hidden_state)r  r   )r8   inputs_embedsr   r   r   encoder_layers         r9   r   zInstructBlipEncoder.forwardm  sZ     &![ 	 	M)M -  MM ????r;   r   )r?   r@   rA   rB   r%   rP   r   r   rC   r   r   r   r   r<   r   r   r   r   s   @r9   r  r  ]  s         ,1 , , , , , ,  26@ @ !.@ +,	@
 
uo%	&@ @ @ ^@ @ @ @ @r;   r  c                        e Zd ZU dZeed<   eedZdef fdZ	 e
d          e	 	 ddeej                 ded	ee         d
eeef         fd                        Zd Z xZS )InstructBlipVisionModelry   rI   )r   
attentionsc                    t                                          |           || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        |                                  d S r   )rO   rP   rI   rQ   rH   r_   r  encoderr   r   r   post_layernorm	post_init)r8   rI   rR   r^   s      r9   rP   z InstructBlipVisionModel.__init__  sx       &	6v>>*622 l9&:OPPPr;   F)tie_last_hidden_statesNrx   r   r0   c                    |t          d          |                     ||          } | j        dd|i|}|j        }|                     |          }|d d dd d f         }|                     |          }t          ||          S )Nz You have to specify pixel_values)rx   r  r   r  pooler_outputrF   )r   r_   r  r  r  r   )r8   ry   rx   r   r   encoder_outputsr  pooled_outputs           r9   r   zInstructBlipVisionModel.forward  s     ?@@@Oghh+74< ,
 ,
',
,
 ,

 ,= //0ABB)!!!Q'2++M::)/'
 
 
 	
r;   c                     | j         S r   )r_   r>   s    r9   get_input_embeddingsz,InstructBlipVisionModel.get_input_embeddings  s
    r;   )NF)r?   r@   rA   main_input_namer'   rE   r   r   _can_record_outputsrP   r   r   r   rC   rD   r   r   r   r   r<   r   r   r  r   r   s   @r9   r  r    s         $O$$$$1+ 
	7 	 	 	 	 	 	 u555 59).
 
u01
 #'
 +,	

 
u00	1
 
 
 ^ 65
6      r;   r  c                   `     e Zd Zd fd	Zd Zd Zd Zd Zd Z	 	 	 	 dd	e	e
         fd
Z xZS )r   Fc                    t                                                       || _        |j        |j        z  dk    r.t          |d          st          d|j        |j        fz            |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        t          j        |j        | j	                  | _        |rIt          j        |j        | j	                  | _        t          j        |j        | j	                  | _        nHt          j        |j        | j	                  | _        t          j        |j        | j	                  | _        t          j        |j                  | _        t'          |dd          | _        | j        dk    s| j        dk    r6|j        | _        t          j        d|j        z  d	z
  | j                  | _        d
| _        d S )Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)position_embedding_typeabsoluterelative_keyrelative_key_queryr    r$   F)rO   rP   rI   rQ   r   hasattrr   r   attention_head_sizeall_head_sizer   r   r   encoder_hidden_sizer   r   Dropoutattention_probs_dropout_probr   r4   r#  max_position_embeddingsr   distance_embeddingsave_attentionr8   rI   is_cross_attentionr^   s      r9   rP   z.InstructBlipQFormerMultiHeadAttention.__init__  s    ::a??PVXhHiHi?^%v'ABC  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
 	Ky!;T=OPPDH6#=t?QRRDJJy!3T5GHHDH6#5t7IJJDJz&"EFF'.v7PR\']']$'>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD##r;   c                     || _         d S r   attn_gradients)r8   r4  s     r9   save_attn_gradientsz9InstructBlipQFormerMultiHeadAttention.save_attn_gradients  s    ,r;   c                     | j         S r   r3  r>   s    r9   get_attn_gradientsz8InstructBlipQFormerMultiHeadAttention.get_attn_gradients  s    ""r;   c                     || _         d S r   attention_map)r8   r:  s     r9   save_attention_mapz8InstructBlipQFormerMultiHeadAttention.save_attention_map  s    *r;   c                     | j         S r   r9  r>   s    r9   get_attention_mapz7InstructBlipQFormerMultiHeadAttention.get_attention_map  s    !!r;   c                     |                                 d d         | j        | j        fz   } |j        | }|                    dddd          S )Nrc   r   r    r$   r	   )re   r   r(  rq   rn   )r8   xnew_x_shapes      r9   transpose_for_scoresz:InstructBlipQFormerMultiHeadAttention.transpose_for_scores  sM    ffhhssmt'?AY&ZZAFK yyAq!$$$r;   Nr   c                    |d u}|rS|                      |                     |                    }|                      |                     |                    }	|}nP|                      |                     |                    }|                      |                     |                    }	|                     |          }
|                      |
          }t	          j        ||                    dd                    }| j        dk    s| j        dk    r4|                                d         }t	          j	        |t          j
        |j                                      dd          }t	          j	        |t          j
        |j                                      dd          }||z
  }|                     || j        z   dz
            }|                    |j                  }| j        dk    rt	          j        d||          }||z   }n?| j        dk    r4t	          j        d||          }t	          j        d	||          }||z   |z   }|t%          j        | j                  z  }|j        }|||z   } t+          j        d
          |                              |          }|r6| j        r/|                     |           |                    | j                   |                     |          }|||z  }t	          j        ||	          }|                    dddd                                          }|                                d d         | j        fz   } |j        | }||fS )Nrc   r   r%  r&  r$   r|   devicer{   zbhld,lrd->bhlrzbhrd,lrd->bhlrrh   r   r    r	   )rA  r   r   r   rC   r   r   r#  re   arangelongrD  rq   r.  r-  r~   r|   einsummathsqrtr(  r   Softmaxr/  r;  register_hookr5  r   rn   r   r)  )r8   r   r   r   encoder_hidden_statesencoder_attention_maskr   r1  	key_layervalue_layermixed_query_layerquery_layerattention_scores
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_scores_dtypeattention_probsattention_probs_droppedcontext_layernew_context_layer_shapes                             r9   r   z-InstructBlipQFormerMultiHeadAttention.forward  sx    3$> 	O11$((;P2Q2QRRI33DJJ?T4U4UVVK3NN11$((=2I2IJJI33DJJ}4M4MNNK JJ}55//0ABB !<Y5H5HR5P5PQQ'>99T=Y]q=q=q&++--a0J"\*EJ}OcdddiijlnoppN"\*EJ}OcdddiijkmoppN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s +di8P.Q.QQ!1!7%/.@ -"*,,,-=>>AABXYY 	D$"5 	D##O444))$*BCCC #',,"?"?  &=	&I#%<kJJ%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CDo--r;   r   NNNN)r?   r@   rA   rP   r5  r7  r;  r=  rA  r   r   r   r   r   s   @r9   r   r     s        $ $ $ $ $ $8- - -# # #+ + +" " "% % % "#I. I. +,I. I. I. I. I. I. I. I.r;   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )r   c                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S r   )rO   rP   r   r   rQ   denser   r   r+  hidden_dropout_probr   r]   s     r9   rP   z&InstructBlipQFormerSelfOutput.__init__0  sf    Yv163EFF
f&8f>STTTz&"<==r;   r   input_tensorr0   c                     |                      |          }|                     |          }|                     ||z             }|S r   rc  r   r   r8   r   re  s      r9   r   z%InstructBlipQFormerSelfOutput.forward6  @    

=11]33}|'CDDr;   r   r   s   @r9   r   r   /  i        > > > > >U\  RWR^        r;   r   c                        e Zd Zd fd	Zd Z	 	 	 	 ddej        deej                 deej                 deej                 d	eej                 d
e	e
         dej        fdZ xZS )InstructBlipQFormerAttentionFc                     t                                                       t          ||          | _        t	          |          | _        t                      | _        d S r   )rO   rP   r   	attentionr   outputsetpruned_headsr0  s      r9   rP   z%InstructBlipQFormerAttention.__init__?  sL    >vGYZZ3F;;EEr;   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r$   rh   )lenr   rn  r   r(  rq  r   r   r   r   ro  rc  r)  union)r8   headsindexs      r9   prune_headsz(InstructBlipQFormerAttention.prune_headsE  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r;   Nr   r   r   rL  rM  r   r0   c           	      ^     | j         d|||||d|\  }}|                     ||          }	|	S )N)r   r   r   rL  rM  rF   )rn  ro  )
r8   r   r   r   rL  rM  r   r   r   attention_outputs
             r9   r   z$InstructBlipQFormerAttention.forwardW  sZ     ( 
')"7#9
 
 
 
Q  ;;{MBBr;   r   r`  )r?   r@   rA   rP   rw  rC   r   r   rD   r   r   r   r   r   s   @r9   rl  rl  >  s        " " " " " "; ; ;* 7;15=A>B   |  !!23  E-.	 
  ((9:  !)): ;  +,  
               r;   rl  c                   B     e Zd Z fdZdej        dej        fdZ xZS )InstructBlipQFormerIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )rO   rP   r   r   rQ   r   rc  r   r   strr
   intermediate_act_fnr]   s     r9   rP   z(InstructBlipQFormerIntermediate.__init__n  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r;   r   r0   c                 Z    |                      |          }|                     |          }|S r   )rc  r~  r   s     r9   r   z'InstructBlipQFormerIntermediate.forwardv  s,    

=1100??r;   r   r   s   @r9   r{  r{  m  s^        9 9 9 9 9U\ el        r;   r{  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )InstructBlipQFormerOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )rO   rP   r   r   r   rQ   rc  r   r   r+  rd  r   r]   s     r9   rP   z"InstructBlipQFormerOutput.__init__~  sf    Yv79KLL
f&8f>STTTz&"<==r;   r   re  r0   c                     |                      |          }|                     |          }|                     ||z             }|S r   rg  rh  s      r9   r   z!InstructBlipQFormerOutput.forward  ri  r;   r   r   s   @r9   r  r  }  rj  r;   r  c                   N     e Zd Z fdZ	 	 	 	 	 ddee         fdZd Zd Z xZ	S )	InstructBlipQFormerLayerc                    t                                                       |j        | _        d| _        t	          |          | _        || _        ||j        z  dk    rt	          |d          | _        d| _	        nd| _	        t          |          | _        t          |          | _        t          |          | _        t          |          | _        d S )Nr$   r   T)r1  F)rO   rP   chunk_size_feed_forwardseq_len_dimrl  rn  	layer_idxcross_attention_frequencycrossattentionhas_cross_attentionr{  intermediater  ro  intermediate_queryoutput_query)r8   rI   r  r^   s      r9   rP   z!InstructBlipQFormerLayer.__init__  s    '-'E$5f=="v771<<">vZ^"_"_"_D'+D$$',D$;FCC/77"A&"I"I5f==r;   Nr   r   c           
          | j         |f||d|}|dk    r|d d d |d d f         }	| j        r$|t          d           | j        |	f||||d|}	t	          | j        | j        | j        |	          }
|j        d         |k    r`t	          | j	        | j        | j        |d d |d d d f                   
                    |
j                  }t          j        |
|gd          }
n!t	          | j	        | j        | j        |          }
|
S )N)r   r   r   z>encoder_hidden_states must be given for cross-attention layers)r   r   rL  rM  r$   rh   )rn  r  r   r  r   feed_forward_chunk_queryr  r  rj   feed_forward_chunkr~   rD  rC   rr   )r8   r   r   r   rL  rM  query_lengthr   ry  query_attention_outputlayer_outputlayer_output_texts               r9   r   z InstructBlipQFormerLayer.forward  s    *4>
)
 
 	
 
 !%5aaa,6I%J"' 
(0$%efff)<)<**#1'*?+A* * * *& 5-, &	 L  %a(<77$=+0$$QQQqqq%89	% %
 "\()) "  %y,8I)JPQRRR4',  	 L r;   c                 \    |                      |          }|                     ||          }|S r   )r  ro  r8   ry  intermediate_outputr  s       r9   r  z+InstructBlipQFormerLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIr;   c                 \    |                      |          }|                     ||          }|S r   )r  r  r  s       r9   r  z1InstructBlipQFormerLayer.feed_forward_chunk_query  s4    "556FGG(()<>NOOr;   NNNNr   )
r?   r@   rA   rP   r   r   r   r  r  r   r   s   @r9   r  r    s        > > > > >. "#6 6 +,6 6 6 6p  
      r;   r  c                   R     e Zd Z fdZe	 	 	 	 	 ddee         fd            Z xZS )InstructBlipQFormerEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 0    g | ]}t          |          S rF   )r  )r6   r  rI   s     r9   r  z7InstructBlipQFormerEncoder.__init__.<locals>.<listcomp>  s$    jjjY%fi88jjjr;   F)	rO   rP   rI   r   r  r  r  layerr	  r]   s    `r9   rP   z#InstructBlipQFormerEncoder.__init__  sg    ]jjjj%PVPhJiJijjj
 

 ',###r;   Nr   r   c                     t          | j        j                  D ]*}| j        |         }	|||         nd }
 |	|||
|f||d|}+t	          |          S )N)rM  r  r  )r  rI   r  r  r   )r8   r   r   r   rL  rM  r  r   ilayer_modulelayer_head_masks              r9   r   z"InstructBlipQFormerEncoder.forward  s     t{455 	 	A:a=L.7.CillO(L%	
 (>)   MM 9+
 
 
 	
r;   r  )	r?   r@   rA   rP   r   r   r   r   r   r   s   @r9   r  r    s        , , , , ,  "#
 
 +,
 
 
 
 
 
 
 
r;   r  c                   2     e Zd ZdZ fdZ	 	 	 	 ddZ xZS )r   z;Construct the embeddings from word and position embeddings.c                 *   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j
        |j        |j                  | _        t          j        |j                  | _        |                     dt#          j        |j                                      d          d           t)          |dd          | _        || _        d S )	N)padding_idxr   position_ids)r$   rc   F)
persistentr#  r$  )rO   rP   r   r   
vocab_sizerQ   pad_token_idword_embeddingsr-  position_embeddingsr   r   	layernormr+  rd  r   register_bufferrC   rE  r   r4   r#  rI   r]   s     r9   rP   z&InstructBlipQFormerEmbeddings.__init__  s    !|F,=v?Q_e_rsss#%<0NPVPb#c#c f&8f>STTTz&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 (/v7PR\']']$r;   Nr   c                    ||                                 d         }nd}|(| j        d d |||z   f                                         }|m|                     |          }| j        dk    r2|                     |                    |j                            }||z   }|t          j	        ||fd          }n|}|                    | j
        j        j                  }| 
                    |          }|                     |          }|S )Nr$   r   r$  rh   )re   r  cloner  r#  r  r~   rD  rC   rr   r  r}   r|   r   )r8   	input_idsr  query_embedspast_key_values_lengthrS  r_   r  s           r9   r   z%InstructBlipQFormerEmbeddings.forward  s     "))!,JJJ,QQQ0FVlIl0l-lmssuuL --i88J+z99&*&>&>|zO`?a?a&b&b#'*==
'"Yj'AqIII
%J]]4>#8#>??
^^J//
\\*--
r;   )NNNr   )r?   r@   rA   rB   rP   r   r   r   s   @r9   r   r     s`        EE    $         r;   r   c                       e Zd ZdZdZdZdZdZe e	e
dd          g e	e
dd          gdZdef fd	Zd
 Zd Zd Z	 ddej        dee         dej        dedej        f
dZee	 	 	 	 	 	 ddej        deej                 deej                 deej                 deej                 deej                 deej                 dee         deeej                 e f         fd                        Z! xZ"S )InstructBlipQFormerModelz
    Querying Transformer (Q-Former), used in InstructBLIP. Slightly modified from BLIP-2 as it also takes the
    instruction as input.
    Fr$   z
.attention)rv  
layer_namez.crossattention)r   r  cross_attentionsrI   c                     t                                          |           || _        t          |          | _        t          |          | _        |                                  d S r   )rO   rP   rI   r   r_   r  r  r  r]   s     r9   rP   z!InstructBlipQFormerModel.__init__S  sV       7??1&99r;   c                     | j         j        S r   r_   r  r>   s    r9   r  z-InstructBlipQFormerModel.get_input_embeddings]  s    ..r;   c                     || j         _        d S r   r  r8   r   s     r9   set_input_embeddingsz-InstructBlipQFormerModel.set_input_embeddings`  s    */'''r;   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r  rn  rw  )r8   heads_to_pruner  ru  s       r9   _prune_headsz%InstructBlipQFormerModel._prune_headsc  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr;   r   input_shaperD  	has_queryr0   c                 *   |                                 dk    r|dddddddf         }nD|                                 dk    r|ddddddf         }nt          d| d|j         d          |                    | j                  }d|z
  d	z  }|S )
a>  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device: (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        r	   Nr    z!Wrong shape for input_ids (shape z) or attention_mask (shape )r{   r   g     )ri   r   rj   r~   r|   )r8   r   r  rD  r  extended_attention_masks         r9   get_extended_attention_maskz4InstructBlipQFormerModel.get_extended_attention_maskk  s    . 1$$&4QQQaaa]&C##!!Q&& '5QQQdAAA5E&F##sKss\j\psss   #:"<"<4:"<"N"N#&)@#@H"L&&r;   Nr  r  r  r   rL  rM  r   c           	      \    ||t          d          ||j        d         nd}	                     |||          }
|
                                dd         }|\  }}|
j        }|t          j        ||f|          }                     |||          }|t          |t                    r|d                                         \  }}}n|                                \  }}}||f}t          |t                    r fd|D             }nF|,t          j        ||          } 
                    |          }n 
                    |          }nd}                     | j        j                  }  j        |
f|||||	d	|}|j        }|dddddf         }t!          ||
          S )a$  
        query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states to be used in the attention computation. If cross-attention,
            will be used for the query (i.e., key and value will use the encoder_hidden_states).
        Nz7You have to specify query_embeds when input_ids is Noner$   r   )r  r  r  rc   )rD  c                 :    g | ]}                     |          S rF   )invert_attention_mask)r6   maskr8   s     r9   r  z4InstructBlipQFormerModel.forward.<locals>.<listcomp>  s(    2w2w2wX\43M3Md3S3S2w2w2wr;   )r   r   rL  rM  r  r  )r   rj   r_   re   rD  rC   onesr  r   listr  get_head_maskrI   r  r  r  r   )r8   r  r   r  r  r   rL  rM  r   r  embedding_outputr  r   rS  rD  r  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskr  sequence_outputr  s   `                       r9   r   z InstructBlipQFormerModel.forward  s>   $ !5VWWW0<0H|)!,,a??%% + 
 
 '++--crc2!,
J!(!"Z*j)A6RRRN #'"B"B>S^`f"g"g !,/66 ^AVWXAYA^A^A`A`>"$;QQAVA[A[A]A]>"$;Q$68O#P 0$77 e2w2w2w2w`v2w2w2w//'/).4HQW)X)X)X&262L2LMc2d2d//262L2LMc2d2d//.2+ &&y$+2OPP	+74<,
2"7#B%,
 ,
 ,
 ,
 *;'1aaa0;-'
 
 
 	
r;   r   )NNNNNN)#r?   r@   rA   rB   r   r   r   r   r  r   r   r  r&   rP   r  r  r  rC   r   r<   r   rD  r   r  r   r   
LongTensorr   rD   r   r   r   r   r   r   r   s   @r9   r  r  >  s4        
 #( N 2N@Vbccc
 N@Vghhh
 8      / / /0 0 0C C C  )' )')' 3Z)' 	)'
 )' 
)' )' )' )'V  7;37/315=A>BO
 O
#O
 !!23O
 u/0	O

 u|,O
 E-.O
  ((9:O
 !)): ;O
 +,O
 
uU&')UU	VO
 O
 O
 ^ O
 O
 O
 O
 O
r;   r  z[
    InstructBLIP base Model consisting of language model, qformer and vision encoder.
    c                       e Zd ZdZdgZdef fdZd Zd Zd Z	d Z
d	ej        d
ej        fdZee	 	 	 	 	 	 	 ddej        dej        deej                 d	eej                 deej                 deej                 deej                 d
eej                 dedee         deeef         fd                        Z xZS )r   ry   r   rI   c                    t                                          |           t          |j                  | _        t          j        t          j        d|j	        |j
        j                            | _        t          |j
                  | _        t          j        |j
        j        |j        j                  | _        t%          j        |j                  | _        | j        j        $| j                            | j        j                   | j        j        $| j                            | j        j                   |                                  d S Nr$   )rO   rP   r  vision_configvision_modelr   rU   rC   r   num_query_tokensqformer_configrQ   r   r  qformerr   text_configlanguage_projectionr!   from_configlanguage_modelr   extend_keep_in_fp32_modulesr  r]   s     r9   rP   zInstructBlipModel.__init__  s      3F4HIILQ8OQWQfQr)s)stt/0EFF#%9V-B-NPVPbPn#o#o '3F4FGG0<"))$*=*OPPP4@&--d.A.WXXX 	r;   c                 4    | j                                         S r   r  r  r>   s    r9   r  z&InstructBlipModel.get_input_embeddings      "77999r;   c                 :    | j                             |           d S r   r  r  r  s     r9   r  z&InstructBlipModel.set_input_embeddings	      0077777r;   c                     | j         j        s8| j        j        | j        j        _        | j        j        | j        j        _        d S d S r   rI   use_decoder_only_language_modelr  sharedr  embed_tokensdecoderr>   s    r9   _tie_weightszInstructBlipModel._tie_weights  J    {: 	R7;7J7QD'47;7J7QD'444	R 	Rr;   c                 
   | j         }t          |          dk    r@d|vr<t          j                                        dk    rt
                              d           t          | j        d          rd| j        j	        _
        dS dS z
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        r$   r  a  The `language_model` is not in the `hf_device_map` dictionary and you are running your script in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`. Please pass a `device_map` that contains `language_model` to remove this warning. Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for more details on creating a `device_map` for large models._hf_hookTNhf_device_maprs  rC   cudadevice_countloggerwarningr'  r  r  io_same_devicer8   r  s     r9   _preprocess_acceleratez(InstructBlipModel._preprocess_accelerate      
 *}!!&6m&K&KPUPZPgPgPiPilmPmPmNNM   4&
33 	?:>D(777	? 	?r;   r  r  c                 t   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                    d          	                    |          
                    |j                  }|S zZ
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        NrC  rc   r  rC   r   rI   image_token_idrF  rD  all	unsqueeze	expand_asr~   r8   r  r  special_image_masks       r9   get_placeholder_maskz&InstructBlipModel.get_placeholder_mask%       !.2M$2K2K2M2MT[7uzR_Rfggg3 3 " "4!7!7!;!;!*dk.H!H/99"==GGVVYYZgZnoo!!r;   NFqformer_input_idsqformer_attention_maskr   decoder_input_idsdecoder_attention_maskrx   r   r0   c
           	          | j         d||	d|
}|d         }t          j        |                                dd         t          j        |j                  }| j                            |j        d         dd          }t          j        |                                dd         t          j        |j                  }|t          j	        |          }t          j
        ||gd          } | j        d|||||d|
}|d         ddd|                    d          ddf         }|8 | j                                        |          }|t          j	        |          }|                     |          }|                    |j        |j                  }|                     ||	          }|                    ||          }| j        j        r | j        d||d
|
}n | j        d||||d|
}t+          |||          S )a  
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
            to serve as text prompt, which the Q-Former model will encode.

            Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
        )ry   rx   r   Nrc   rC  r$   rh   )r  r   r  rL  rM  r  r  r   )r  r   r  r  r3   rF   )r  rC   r  re   rF  rD  r   r   rj   	ones_likerr   r  r  r  r  r~   r|   r	  masked_scatterrI   r  r*   )r8   ry   r  r  r  r   r  r  r  rx   r   r-   image_embedsimage_attention_maskr   query_attention_maskquery_outputsquery_outputlanguage_model_inputsr  outputss                        r9   r   zInstructBlipModel.forward4  s   P +* 
%%=
 
 
 

 &a(  %z,*;*;*=*=crc*B%*]i]pqqq (//0B10Er2NN$z,*;*;*=*=crc*B%*]i]pqqq!)%*_5F%G%G"!&,@BX+Y_`!a!a!a$ 
'1%".#7
 
 
 
 %Q'+A\->->q-A-A+A111(DE FD/DDFFyQQM%!&!;!; !% 8 8 F F 5 8 89M}Ob c c!66yP]6^^%445GI^__;6 	)d) +-   GG *d) +-"3'=	 
  G ?))#*
 
 
 	
r;   )NNNNNNF)r?   r@   rA   r  r  r%   rP   r  r  r  r  rC   r  rD   r	  r   r   r   r   r   r   r   r   r<   r*   r   r   r   s   @r9   r   r     s        %O+,1      &: : :8 8 8R R R
? ? ?("e.> "uO` " " " " 
 >B15598<=A04).^
 ^
'^
 !,^
 !))9 :	^

 E-.^
 !!12^
 $E$45^
 !))9 :^
  -^
 #'^
 -.^
 
uEE	F^
 ^
 ^
 ^ ^
 ^
 ^
 ^
 ^
r;   r   a  
    InstructBLIP Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    c                       e Zd ZU eed<   dZdZdgZdef fdZd Z	d Z
d Zd	ej        fd
Zd Zd Zd Zd Z	 	 	 d dej        dej        deej                 dee         dee         f
dZdej        dej        fdZee	 	 	 	 	 	 	 	 d!dej        dej        deej                 deej                 deej                 deej                 deej                 deej                 deej                 dedee         d	eee f         fd                        Z! ej"                    	 	 	 	 	 	 d"dej        deej                 deej                 deej                 deej                 deej                 ded	ej        fd            Z# xZ$S )#r   rI   ry   Tr   c                    t                                          |           t                              |j                  | _        t          j        t          j	        d|j
        |j        j                            | _        t                              |j                  | _        t          j        |j        j        |j        j                  | _        |j        rt)          j        |j                  }nt-          j        |j                  }|j        | j                            |j                   |j        | j                            |j                   || _        |                                  d S r  )rO   rP   r  _from_configr  r  r   rU   rC   r   r  r  rQ   r   r  r  r   r  r  r  r"   r  r#   r   r  r  r  r  )r8   rI   r  r^   s      r9   rP   z-InstructBlipForConditionalGeneration.__init__  s1      3@@AUVVLQ8OQWQfQr)s)stt/<<V=RSS#%9V-B-NPVPbPn#o#o 1 	S1=f>PQQNN2>v?QRRN+7")).*JKKK/;&--n.RSSS, 	r;   c                 4    | j                                         S r   r  r>   s    r9   r  z9InstructBlipForConditionalGeneration.get_input_embeddings  r  r;   c                 :    | j                             |           d S r   r  r  s     r9   r  z9InstructBlipForConditionalGeneration.set_input_embeddings  r  r;   c                 :    | j                             |           d S r   )r  set_output_embeddings)r8   new_embeddingss     r9   r!  z:InstructBlipForConditionalGeneration.set_output_embeddings  s    11.AAAAAr;   r0   c                 4    | j                                         S r   )r  get_output_embeddingsr>   s    r9   r$  z:InstructBlipForConditionalGeneration.get_output_embeddings  s    "88:::r;   c                 4    | j                                         S r   )r  get_encoderr>   s    r9   r&  z0InstructBlipForConditionalGeneration.get_encoder      "..000r;   c                 4    | j                                         S r   )r  get_decoderr>   s    r9   r)  z0InstructBlipForConditionalGeneration.get_decoder  r'  r;   c                     | j         j        s8| j        j        | j        j        _        | j        j        | j        j        _        d S d S r   r  r>   s    r9   r  z1InstructBlipForConditionalGeneration._tie_weights  r  r;   c                 
   | j         }t          |          dk    r@d|vr<t          j                                        dk    rt
                              d           t          | j        d          rd| j        j	        _
        dS dS r  r  r  s     r9   r  z;InstructBlipForConditionalGeneration._preprocess_accelerate  r  r;   NFr  r  rx   return_dictc                    |                      ||d          }|d         }t          j        |                                dd         t          j        |j                  }| j                            |j        d         dd          }	t          j        |	                                dd         t          j        |j                  }
|t          j	        |          }t          j
        |
|gd          }|                     |||	||d	          }|d         ddd|	                    d          ddf         }|                     |          }|r|||fS |S )
a$  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        T)ry   rx   r,  r   Nrc   rC  r$   rh   )r  r   r  rL  rM  r,  )r  rC   r  re   rF  rD  r   r   rj   r  rr   r  r  )r8   ry   r  r  rx   r,  r-   r  r  r   r  r  r  r  s                 r9   get_image_featuresz7InstructBlipForConditionalGeneration.get_image_features  s   " **%%= + 
 

 &a(  %z,*;*;*=*=crc*B%*]i]pqqq (//0B10Er2NN$z,*;*;*=*=crc*B%*]i]pqqq!)%*_5F%G%G"!&,@BX+Y_`!a!a!a'1%".#7 % 
 
 %Q'+A\->->q-A-A+A111(DE !% 8 8 F F 	H(.-GG$$r;   r  r  c                 t   |e| |                                  t          j        | j        j        t          j        |j                            k    }|                    d          }n|| j        j        k    }|                    d          	                    |          
                    |j                  }|S r  r  r  s       r9   r	  z9InstructBlipForConditionalGeneration.get_placeholder_mask  r
  r;   r   r  r  labelsr   c           	      H   |                      ||||
d          \  }}}| |                                 |          }|t          j        |          }|                    |j        |j                  }|                     ||          }|                    ||          }| j	        j
        r= | j        d||d|}|d         }d}|	  | j        d||	| j	        j        j        d|}n&d|d<    | j        d|||||	d	|}|j        }|j        }t#          |||||
          S )aD  
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
            to serve as text prompt, which the Q-Former model will encode.

            Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
            1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
        >>> processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> model.to(device)  # doctest: +IGNORE_RESULT

        >>> url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        >>> prompt = "What is unusual about this image?"
        >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     min_length=1,
        ...     top_p=0.9,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ...     temperature=1,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV, which is parked in the middle of a busy city street. This is an unconventional approach to ironing clothes, as it requires the man to balance himself and his ironing equipment on top of the vehicle while navigating through traffic. Additionally, the presence of taxis and other vehicles in the scene further emphasizes the unusual nature of this situation.
        ```Tr  r  rx   r,  Nr  r  r   )r,   r0  r  r,  )r  r   r  r  r0  )r+   r,   r-   r.   r/   rF   )r.  r  rC   r  r~   rD  r|   r	  r  rI   r  r  loss_functionr  r  r+   r,   r*   )r8   ry   r  r  r  r   r  r  r  r0  rx   r   r  r-   r  r  r  r,   r+   s                      r9   r   z,InstructBlipForConditionalGeneration.forward.  s   Z @D?V?V/#9%= @W @
 @
<~}  7D5577	BBM!"_Y77N 5 8 89M}Ob c c!66yP]6^^%445GI^__;6 	$)d) +-   G
 QZFD!)t) !&T[=T=_ ci 
 %)F=!)d) +-"3'=   G <D^F>))#*
 
 
 	
r;   c                    t          | d          r|                                  |j        d         }	|                     ||||d          \  }
}}||l| j        j        g| j        j        z  }|| j        j        j        gz   }t          j
        |gt          j        |j                  }|                    |	d          } |                                 |          }|t          j        |          }|
                    |j        |j                  }
|                     ||          }|                    ||
          }||d	}| j        j        j        s||d
<    | j        j        di ||}|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
                Input images to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        r  r   Tr2  NrC  r$   r  r  r  rF   )r'  r  rj   r.  rI   image_token_indexr  r  bos_token_idrC   r   rF  rD  repeatr  r  r~   r|   r	  r  r  is_encoder_decodergenerate)r8   ry   r  r  r  r   r  rx   generate_kwargsr   r  r-   r  image_tokensstart_tokensr  inputsr  s                     r9   r9  z-InstructBlipForConditionalGeneration.generate  s   D 4)) 	*'')))!'*
?C?V?V/#9%= @W @
 @
<~}    $ =>A]]+t{/F/S.TT!L,uzR^Refff	%,,Z;;	7D5577	BBM!"_Y77N 5 8 89M}Ob c c!66yP]6^^%445GI^__#0NSS")< 	,"+F;.$%.KKK?KKr;   )NFF)NNNNNNNF)NNNNNF)%r?   r@   rA   r%   rE   r  r   r  rP   r  r  r!  r   Moduler$  r&  r)  r  r  rC   rD   r  r   r   r.  r	  r   r   r   r   r   r<   r*   r   no_gradr9  r   r   s   @r9   r   r     sy         $O!+,1      4: : :8 8 8B B B;ry ; ; ; ;1 1 11 1 1R R R? ? ?0 >B38&+/% /%'/% !+/% !))9 :	/%
 #+4./% d^/% /% /% /%b"e.> "uO` " " " " 
 >B15598<=A59-1).}
 }
'}
 !,}
 !))9 :	}

 E-.}
 !!12}
 $E$45}
 !))9 :}
   12}
 )*}
 #'}
 +,}
 
uEE	F}
 }
 }
 ^ }
~ U]__ 9==A045959).C C'C $E$45C !))9 :	C
 E,-C !!12C   12C #'C 
	C C C _C C C C Cr;   r   )r  r   r   r   r  )r   )NrB   rH  dataclassesr   typingr   r   r   r   rC   r   activationsr
   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   utils.genericr   r   autor!   r"   r#   configuration_instructblipr%   r&   r'   
get_loggerr?   r  r*   r>  rH   r   floatr   r   r   r   r   r  r  r   r   rl  r{  r  r  r  r   r  r   r   __all__rF   r;   r9   <module>rQ     sa   " !  ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1        ! ! ! ! ! ! ) ) ) ) ) ) B B B B B B 9 9 9 9 9 9            G F F F F F F F & & & & & & l l l l l l l l l l j j j j j j j j j j j j j j j j ? ? ? ? ? ? ? ? I I I I I I I I I I o o o o o o o o o o 
	H	%	%   
 
 
 
 
k 
 
  
<G G G G G29 G G Gd % %I%<% 
% <	%
 U\*% % % % % %0I) I) I) I) I)BI I) I) I)Z    bi        9   D #- #- #- #- #-/ #- #- #-N@ @ @ @ @") @ @ @F1 1 1 1 19 1 1 1hw. w. w. w. w.BI w. w. w.v    BI   +  +  +  +  + 29 +  +  + ^    bi        	   U U U U U9 U U Ur$
 $
 $
 $
 $
 $
 $
 $
N0 0 0 0 0BI 0 0 0fi
 i
 i
 i
 i
: i
 i
 i
X   
e
 e
 e
 e
 e
3 e
 e
 
e
P   S S S S S+F S S Sl
  r;   