
     `i                        d dl Z d dlmZmZ d dlZd dlmZmZmZ d dlm	Z	 ddl
mZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZmZ ddlmZ ddl m!Z! ddl"m#Z#  ej$        e%          Z& G d dej'                  Z( G d dej'                  Z) G d dej'                  Z* G d dej'                  Z+ G d dej'                  Z, G d dej'                  Z- G d de          Z. G d d ej'                  Z/ G d! d"ej'                  Z0 G d# d$ej'                  Z1 G d% d&ej'                  Z2 G d' d(ej'                  Z3 G d) d*e          Z4 G d+ d,e4          Z5 G d- d.e4e          Z6g d/Z7dS )0    N)OptionalUnion)Tensordevicenn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)logging)deprecate_kwarg   )BlipTextConfigc                        e Zd ZdZ fdZ	 	 	 	 ddeej                 deej                 deej                 de	d	ej
        f
d
Z xZS )BlipTextEmbeddingsz;Construct the embeddings from word and position embeddings.c                 *   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j
        |j        |j                  | _
        t          j        |j                  | _        |                     dt!          j        |j                                      d          d           t'          |dd          | _        || _        d S )	N)padding_idxepsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr$   configselfr9   	__class__s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/blip/modeling_blip_text.pyr'   zBlipTextEmbeddings.__init__/   s    !|F,=v?Q_e_rsss#%<0NPVPb#c#c  f&8f>STTTz&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 (/v7PR\']']$    Nr   	input_idsr!   inputs_embedspast_key_values_lengthreturnc                 |   ||                                 }n|                                 d d         }|d         }|| j        d d |||z   f         }||                     |          }|}| j        dk    r|                     |          }||z  }|                     |          }|                     |          }|S )Nr"   r   r%   )sizer!   r,   r$   r.   r/   r3   )	r;   r?   r!   r@   rA   input_shape
seq_length
embeddingsr.   s	            r=   forwardzBlipTextEmbeddings.forwardA   s      #..**KK',,..ss3K ^
,QQQ0FVlIl0l-lmL  00;;M"
':55"&":":<"H"H--J^^J//
\\*--
r>   )NNNr   )__name__
__module____qualname____doc__r'   r   r5   
LongTensorFloatTensorintr   rH   __classcell__r<   s   @r=   r   r   ,   s        EE    ( 153759&' E,- u/0   12	
 !$ 
       r>   r   c                   B    e Zd Zd fd	Zd Zd Zd Zd Z eddd	
          	 	 	 	 	 	 	 dde	j
        dee	j                 dee	j                 dee	j                 dee	j                 dee         dee         dee	j
                 dee	j
                 fd            Z xZS )BlipTextSelfAttentionNc                    t                                                       || _        |j        |j        z  dk    r.t          |d          st          d|j        |j        fz            |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        || _
        t          j        |j        | j	                  | _        |rIt          j        |j        | j	                  | _        t          j        |j        | j	                  | _        nHt          j        |j        | j	                  | _        t          j        |j        | j	                  | _        t          j        |j                  | _        t)          |dd          | _        | j        dk    s| j        dk    r8|j        | _        t          j        d|j        z  d	z
  | j                  | _        d S d S )
Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)r$   r%   relative_keyrelative_key_query   r   )r&   r'   r9   r*   num_attention_headshasattr
ValueErrorrO   attention_head_sizeall_head_size	layer_idxr   Linearqueryencoder_hidden_sizekeyvaluer1   attention_probs_dropout_probr3   r8   r$   r-   r(   distance_embeddingr;   r9   is_cross_attentionr^   r<   s       r=   r'   zBlipTextSelfAttention.__init__a   s    ::a??PVXhHiHi?^%v'ABC  
 $*#= #&v'9F<V'V#W#W !58PP"Yv143EFF
 	Ky!;T=OPPDH6#=t?QRRDJJy!3T5GHHDH6#5t7IJJDJz&"EFF'.v7PR\']']$'>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD### >r=qr>   c                     || _         d S Nattn_gradients)r;   rk   s     r=   save_attn_gradientsz)BlipTextSelfAttention.save_attn_gradients}   s    ,r>   c                     | j         S ri   rj   r;   s    r=   get_attn_gradientsz(BlipTextSelfAttention.get_attn_gradients   s    ""r>   c                     || _         d S ri   attention_map)r;   rr   s     r=   save_attention_mapz(BlipTextSelfAttention.save_attention_map   s    *r>   c                     | j         S ri   rq   rn   s    r=   get_attention_mapz'BlipTextSelfAttention.get_attention_map   s    !!r>   past_key_valuepast_key_values4.58new_nameversionFhidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskoutput_attentionscache_positionrB   c	                 F   |j         \  }	}
}|                     |                              |	d| j        | j                                      dd          }|d u}|r|n|}d}|Ht          |t                    r1|j        	                    | j
                  }|r|j        }n
|j        }n|}|r|n|}|r3|1|r/|j        | j
                 j        }|j        | j
                 j        }n|                     |                              |	d| j        | j                                      dd          }|                     |                              |	d| j        | j                                      dd          }|N|s|nd }|                    ||| j
        d|i          \  }}|r$t          |t                    rd|j        | j
        <   t'          j        ||                    dd                    }| j        dk    s| j        d	k    r4|                                d         }
t'          j        |
t&          j        |j        
                              dd          }t'          j        |
t&          j        |j        
                              dd          }||z
  }|                     || j        z   dz
            }|                    |j                  }| j        dk    rt'          j        d||          }||z   }n?| j        d	k    r4t'          j        d||          }t'          j        d||          }||z   |z   }|t?          j         | j                  z  }|||                    |j                  z   } tC          j"        d          |          }| #                    |          }|||z  }t'          j        ||          }|$                    dddd          %                                }|                                d d         | j&        fz   } |j        | }||fS )Nr"   r   rX   Fr   TrV   rW   )dtyper   r   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r	   )'shaper`   viewrY   r\   	transpose
isinstancer   
is_updatedgetr^   cross_attention_cacheself_attention_cachelayerskeysvaluesrb   rc   updater5   matmulr$   rD   r6   longr   re   r-   tor   einsummathsqrtr   Softmaxr3   permute
contiguousr]   )r;   r|   r}   r~   r   r   rw   r   r   
batch_sizerF   _query_layerrg   r   curr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probsattention_probs_droppedcontext_layernew_context_layer_shapes                                  r=   rH   zBlipTextSelfAttention.forward   sf    %2$7!
JJJ}%%T*b$":D<TUUYq!__ 	 3$>3EY//>
&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#2DW..- 	F/"=*"=+24>BGI-4T^DKKK ((j"d&>@XYY1a  

>**j"d&>@XYY1a  *7I!St)<)C)C{DN=M~<^* *&	; & F*_FY*Z*Z FAEO.t~> !<Y5H5HR5P5PQQ'>99T=Y]q=q=q&++--a0J"\*EJ}OcdddiijlnoppN"\*EJ}OcdddiijkmoppN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s +di8P.Q.QQ%/.2C2CDTD[2\2\\ -"*,,,-=>> #',,"?"?  &=	&I#%<kJJ%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CDo--r>   ri   NNNNNFN)rI   rJ   rK   r'   rl   ro   rs   ru   r   r5   r   r   rN   r   booltuplerH   rP   rQ   s   @r=   rS   rS   `   sr       u u u u u u8- - -# # #+ + +" " " _%0A6RRR 7;15=A>B+/,115h. h.|h. !!23h. E-.	h.
  ((9:h. !)): ;h. "%h. $D>h. !.h. 
u|	h. h. h. SRh. h. h. h. h.r>   rS   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )BlipTextSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S Nr   )r&   r'   r   r_   r*   denser/   r0   r1   r2   r3   r:   s     r=   r'   zBlipTextSelfOutput.__init__   sf    Yv163EFF
f&8f>STTTz&"<==r>   r|   input_tensorrB   c                     |                      |          }|                     |          }|                     ||z             }|S ri   r   r3   r/   r;   r|   r   s      r=   rH   zBlipTextSelfOutput.forward   @    

=11]33}|'CDDr>   rI   rJ   rK   r'   r5   r   rH   rP   rQ   s   @r=   r   r      i        > > > > >U\  RWR^        r>   r   c                       e Zd Zd fd	Zd Z eddd          	 	 	 	 	 	 dd	ej        d
eej	                 deej	                 deej	                 dee
         dee         deej                 deej                 fd            Z xZS )BlipTextAttentionFNc                     t                                                       t          |||          | _        t	          |          | _        t                      | _        d S )Nr^   )r&   r'   rS   r;   r   outputsetpruned_headsrf   s       r=   r'   zBlipTextAttention.__init__  sR    )&2DPYZZZ	(00EEr>   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )lenr   r;   rY   r\   r   r   r`   rb   rc   r   r   r]   union)r;   headsindexs      r=   prune_headszBlipTextAttention.prune_heads  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r>   rv   rw   rx   ry   r|   r}   r~   r   r   r   rB   c           	          |                      |||||||          }|                     |d         |          }	|	f|dd          z   }
|
S )Nr}   r~   r   rw   r   r   r   r   )r;   r   )r;   r|   r}   r~   r   rw   r   r   self_outputsattention_outputoutputss              r=   rH   zBlipTextAttention.forward  sf     yy)"7+/) ! 
 
  ;;|AFF#%QRR(88r>   )FN)NNNNFN)rI   rJ   rK   r'   r   r   r5   r   r   rN   r   r   r   rH   rP   rQ   s   @r=   r   r     s       " " " " " "; ; ;$ _%0A6RRR 7;15=A+/,115 | !!23 E-.	
  ((9: "% $D> !. 
u|	   SR    r>   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )BlipTextIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S ri   )r&   r'   r   r_   r*   intermediate_sizer   r   
hidden_actstrr
   intermediate_act_fnr:   s     r=   r'   zBlipTextIntermediate.__init__9  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r>   r|   rB   c                 Z    |                      |          }|                     |          }|S ri   )r   r   r;   r|   s     r=   rH   zBlipTextIntermediate.forwardA  s,    

=1100??r>   r   rQ   s   @r=   r   r   8  s^        9 9 9 9 9U\ el        r>   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )BlipTextOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )r&   r'   r   r_   r   r*   r   r/   r0   r1   r2   r3   r:   s     r=   r'   zBlipTextOutput.__init__I  sf    Yv79KLL
f&8f>STTTz&"<==r>   r|   r   rB   c                     |                      |          }|                     |          }|                     ||z             }|S ri   r   r   s      r=   rH   zBlipTextOutput.forwardO  r   r>   r   rQ   s   @r=   r   r   H  r   r>   r   c                   .    e Zd Z fdZ eddd          	 	 	 	 	 	 	 ddej        d	eej                 d
eej                 deej                 deej                 dee	         dee
         deej                 deej                 fd            Zd Z xZS )BlipTextLayerc                 `   t                                                       || _        |j        | _        d| _        t          ||          | _        || _        | j        j        r!t          || j        j        |          | _	        t          |          | _        t          |          | _        d S )Nr   r   )rg   r^   )r&   r'   r9   chunk_size_feed_forwardseq_len_dimr   	attention	layer_num
is_decodercrossattentionr   intermediater   r   )r;   r9   r   r<   s      r=   r'   zBlipTextLayer.__init__W  s    '-'E$*6YGGG";! 	"34;+AY# # #D 188$V,,r>   rv   rw   rx   ry   NFr|   r}   r~   r   r   r   r   rB   c	           	         |                      ||||||          }	|	d         }
|	dd          }|1|                     |
||||||          }|d         }
||dd          z   }t          | j        | j        | j        |
          }|f|z   S )N)r}   r~   r   rw   r   r   r   r   )r   r   r   feed_forward_chunkr   r   )r;   r|   r}   r~   r   r   rw   r   r   self_attention_outputsr   r   cross_attention_outputslayer_outputs                 r=   rH   zBlipTextLayer.forwarde  s     "&)/+) "0 "
 "
 2!4(, ,&*&9&9 5#&; /"3- ': ' '#  7q9 7 ;;G0#T%A4CSUe
 
 ((r>   c                 \    |                      |          }|                     ||          }|S ri   )r   r   )r;   r   intermediate_outputr   s       r=   r   z BlipTextLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIr>   r   )rI   rJ   rK   r'   r   r5   r   r   rN   r   r   r   rH   r   rP   rQ   s   @r=   r   r   V  s)       - - - - - _%0A6RRR 7;15=A>B+/,115%) %)|%) !!23%) E-.	%)
  ((9:%) !)): ;%) "%%) $D>%) !.%) 
u|	%) %) %) SR%)N      r>   r   c                   F    e Zd Z fdZ	 	 	 	 	 	 	 	 	 	 ddej        deej                 deej                 deej                 d	eej                 d
ee         dee	         dee	         dee	         dee	         deej                 de
eej                 ef         fdZ xZS )BlipTextEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 0    g | ]}t          |          S  )r   ).0ir9   s     r=   
<listcomp>z,BlipTextEncoder.__init__.<locals>.<listcomp>  s#    #f#f#fM&!$<$<#f#f#fr>   F)	r&   r'   r9   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr:   s    `r=   r'   zBlipTextEncoder.__init__  s`    ]#f#f#f#feFLdFeFe#f#f#fgg
&+###r>   NFTr|   r}   r~   r   r   rw   	use_cacher   output_hidden_statesreturn_dictr   rB   c                 N   | j         r%| j        r|rt                              d           d}|rt	          |t
                    r/t                              d           t          j        |          }nqt	          |t                    r$t          |t          | j
                            }n8|6t          t          | j
                  t          | j
                            }|	rdnd }|rdnd }|r|dnd }t          | j
        j                  D ]Y}| j        |         }|	r||fz   }|||         nd } |||||||||          }|d         }|r||d         fz   }|||d         fz   }Z|	r||fz   }|
st          d	 |||||fD                       S t          |||||
          S )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.)r9   r   r   r   rX   c              3      K   | ]}||V  	d S ri   r   )r   vs     r=   	<genexpr>z*BlipTextEncoder.forward.<locals>.<genexpr>  s4       
 
 =  !===
 
r>   )last_hidden_staterw   r|   
attentionscross_attentions)r   trainingloggerwarningr   r   warning_oncer   from_legacy_cacher   r9   r   r   r   r   )r;   r|   r}   r~   r   r   rw   r   r   r   r   r   all_hidden_statesall_self_attentionsall_cross_attentionsr   layer_modulelayer_head_masklayer_outputss                      r=   rH   zBlipTextEncoder.forward  sp    & 	"4= 	" "p   "	 	/511 ##`  
 #6"G"X"X O\:: "5o|[_[fGgGgGg"h"h ("5 444l$+6V6V6V# # #7@BBD$5?bb4%6f;P;\rrbft{455 	V 	VA:a=L# I$58H$H!.7.CillO(L%&!	 	M *!,M  V&9]1=M<O&O#(4+?=QRCSBU+U( 	E 1]4D D 	 
 
 "#%'(
 
 
 
 
 
 9+++*1
 
 
 	
r>   )
NNNNNNFFTN)rI   rJ   rK   r'   r5   r   r   rN   r   r   r   r   r   rH   rP   rQ   s   @r=   r   r     sK       , , , , , 7;15=A>B+/$(,1/4&*15W
 W
|W
 !!23W
 E-.	W

  ((9:W
 !)): ;W
 "%W
 D>W
 $D>W
 'tnW
 d^W
 !.W
 
uU\"$MM	NW
 W
 W
 W
 W
 W
 W
 W
r>   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )BlipTextPoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S ri   )r&   r'   r   r_   r*   r   Tanh
activationr:   s     r=   r'   zBlipTextPooler.__init__  sC    Yv163EFF
'))r>   r|   rB   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   r  )r;   r|   first_token_tensorpooled_outputs       r=   rH   zBlipTextPooler.forward  s@     +111a40

#56666r>   r   rQ   s   @r=   r  r    s^        $ $ $ $ $
U\ el        r>   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )BlipTextPredictionHeadTransformc                 V   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          j        |j        |j                  | _        d S r   )r&   r'   r   r_   r*   r   r   r   r   r
   transform_act_fnr/   r0   r:   s     r=   r'   z(BlipTextPredictionHeadTransform.__init__  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D!f&8f>STTTr>   r|   rB   c                     |                      |          }|                     |          }|                     |          }|S ri   )r   r  r/   r   s     r=   rH   z'BlipTextPredictionHeadTransform.forward  s=    

=11--m<<}55r>   r   rQ   s   @r=   r  r    sc        U U U U UU\ el        r>   r  c                   *     e Zd Z fdZd Zd Z xZS )BlipTextLMPredictionHeadc                 >   t                                                       t          |          | _        t	          j        |j        |j        d          | _        t	          j	        t          j        |j                            | _        | j        | j        _        d S )NF)bias)r&   r'   r  	transformr   r_   r*   r)   decoder	Parameterr5   zerosr"  r:   s     r=   r'   z!BlipTextLMPredictionHead.__init__  sz    8@@ y!3V5FUSSSLV->!?!?@@	 !Ir>   c                 (    | j         | j        _         d S ri   )r"  r$  rn   s    r=   _tie_weightsz%BlipTextLMPredictionHead._tie_weights&  s     Ir>   c                 Z    |                      |          }|                     |          }|S ri   )r#  r$  r   s     r=   rH   z BlipTextLMPredictionHead.forward)  s*    }55]33r>   )rI   rJ   rK   r'   r(  rH   rP   rQ   s   @r=   r   r     sV        & & & & && & &      r>   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )BlipTextOnlyMLMHeadc                 p    t                                                       t          |          | _        d S ri   )r&   r'   r   predictionsr:   s     r=   r'   zBlipTextOnlyMLMHead.__init__1  s/    3F;;r>   sequence_outputrB   c                 0    |                      |          }|S ri   )r-  )r;   r.  prediction_scoress      r=   rH   zBlipTextOnlyMLMHead.forward5  s     ,,_==  r>   r   rQ   s   @r=   r+  r+  0  s^        < < < < <!u| ! ! ! ! ! ! ! ! !r>   r+  c                   ,    e Zd ZU dZeed<   dZg Zd ZdS )BlipTextPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r9   bertc                    t          |t          j        t          j        f          r,|j        j                            d| j        j                   nWt          |t          j	                  r=|j
        j                                         |j        j                            d           t          |t          j                  r'|j
        "|j
        j                                         dS dS dS )zInitialize the weightsg        )meanstd      ?N)r   r   r_   r(   weightdatanormal_r9   initializer_ranger/   r"  zero_fill_)r;   modules     r=   _init_weightsz%BlipTextPreTrainedModel._init_weightsE  s    fry",788 	* M&&CT[5R&SSSS-- 	*K""$$$M$$S)))fbi(( 	%V[-DK""$$$$$	% 	%-D-Dr>   N)	rI   rJ   rK   rL   r   __annotations__base_model_prefix_no_split_modulesr?  r   r>   r=   r2  r2  ;  sI          
 
% 
% 
% 
% 
%r>   r2  c            #           e Zd ZdZd fd	Zd Zd Zd Zdede	e
         d	ed
edef
dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deej                 deej                 dee         dee         dee         dee         dee         d
ee         deej                 dee	ej                 ef         f dZ xZS )BlipTextModela&  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an
    `encoder_hidden_states` is then expected as an input to the forward pass.
    Tc                     t                                          |           || _        t          |          | _        t          |          | _        |rt          |          nd | _        | 	                                 d S ri   )
r&   r'   r9   r   rG   r   encoderr  pooler	post_init)r;   r9   add_pooling_layerr<   s      r=   r'   zBlipTextModel.__init__\  so       ,V44&v..0AKnV,,,tr>   c                     | j         j        S ri   rG   r,   rn   s    r=   get_input_embeddingsz"BlipTextModel.get_input_embeddingsf  s    ..r>   c                     || j         _        d S ri   rK  )r;   rc   s     r=   set_input_embeddingsz"BlipTextModel.set_input_embeddingsi  s    */'''r>   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrF  r   r   r   )r;   heads_to_pruner   r   s       r=   _prune_headszBlipTextModel._prune_headsm  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr>   r}   rE   r   r   rB   c                    |                                 dk    r|dddddddf         }n=|                                 dk    r	|r|\  }}t          j        ||          }|ddddf                             ||d          |ddddf         k    }	|	                    |j                  }	|	j        d         |j        d         k     rP|j        d         |	j        d         z
  }
t          j        t          j        |||
f||	j                  |	gd          }	|	dddddddf         |ddddddf         z  }n,|ddddddf         }nt          d	| d
|j         d          |                    | j                  }d|z
  dz  }|S )a=  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        r	   NrX   r   r   )r   r   r"   )axisz!Wrong shape for input_ids (shape z) or attention_mask (shape )r   r7  g     )
r   r5   r6   repeatr   r   r   catonesr[   )r;   r}   rE   r   r   extended_attention_maskr   rF   seq_idscausal_maskprefix_seq_lens              r=   get_extended_attention_maskz)BlipTextModel.get_extended_attention_masku  s   & 1$$&4QQQaaa]&C##!!Q&&  K)4&
J,z&AAA%dD!!!m4;;J
TUVVZabfhihihikoboZpp)nn^-ABB$Q'.*>q*AAA%3%9!%<{?PQR?S%SN"')!J!+Z HQW_j_p   (	  # # #K +6aaaqqq!!!m*D~VWVWVWY]_cefefefVfGg*g''*8D$9I*J''sKss\j\psss   #:"<"<4:"<"N"N#&)@#@H"L&&r>   NFr?   r!   r~   r@   encoder_embedsr   r   rw   r   r   r   r   r   c                     ||n j         j        }||n j         j        }||n j         j        }|r|
|
n j         j        }
nd}
||t          d          |7                     ||           |                                }|\  }}|j        }ne|)|                                dd         }|\  }}|j        }n:|)|                                dd         }|\  }}|j        }nt          d          d}|	Bt          |	t                    s|	d         d         j        d         n|	                                }|,t          j        |||z   f                              |          }                     ||||          }|t          |t"                    r|d                                         \  }}}n|                                \  }}}||f}t          |t"                    r fd|D             }nF|,t          j        ||	          }                     |          }n                     |          }nd}                     | j         j                  }|                     ||||
          }n|}                     ||||||	|
||||          }|d         } j                             |          nd}|s||f|dd         z   S t1          |||j        |j        |j        |j                  S )a  
        encoder_hidden_states  (`torch.FloatTensor`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`Cache`, *optional*):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        NFzDYou cannot specify both input_ids and inputs_embeds at the same timer"   zGYou have to specify either input_ids or inputs_embeds or encoder_embedsr   r   c                 :    g | ]}                     |          S r   )invert_attention_mask)r   maskr;   s     r=   r   z)BlipTextModel.forward.<locals>.<listcomp>  s(    2w2w2wX\43M3Md3S3S2w2w2wr>   rT  )r?   r!   r@   rA   )
r}   r~   r   r   rw   r   r   r   r   r   r   )r  pooler_outputrw   r|   r  r  )r9   r   r   use_return_dictr   r[   %warn_if_padding_and_no_attention_maskrD   r   r   r   r   get_seq_lengthr5   rY  r   r^  listrb  get_head_maskr   rG   rF  rG  r   rw   r|   r  r  )r;   r?   r}   r!   r~   r@   r_  r   r   rw   r   r   r   r   r   r   rE   r   rF   r   rA   rZ  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputsr.  r  s   `                              r=   rH   zBlipTextModel.forward  s   H 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] 	%.%:		@UIII ]%>cddd"66y.QQQ#..**K%0"J
%FF&',,..ss3K%0"J
")FF'(--//4K%0"J
#*FFfggg!"& "/5996"1%+B//$3355 # !"ZZBX5X(YZZ]]^deeN 150P0PK1
 1
 !,/66 ^AVWXAYA^A^A`A`>"$;QQAVA[A[A]A]>"$;Q$68O#P 0$77 e2w2w2w2w`v2w2w2w//'/).4HQW)X)X)X&262L2LMc2d2d//262L2LMc2d2d//.2+ &&y$+2OPP	!##)+'=	  /      .,,2"7#B+/!5#) ' 
 
 *!,8<8OO444UY 	J#]3oabb6III;-'+;)7&1,=
 
 
 	
r>   )T)NNNNNNNNNNNNNFN)rI   rJ   rK   rL   r'   rL  rN  rR  r   r   rO   r   r   r^  r   r5   r   r   r   rH   rP   rQ   s   @r=   rD  rD  S  s              / / /0 0 0C C C<'$<'38:<'GM<'[_<'	<' <' <' <'@ -115/3,004158<9=+/$(,0/3&*%*15!P
 P
EL)P
 !.P
 u|,	P

 EL)P
  -P
 !.P
  (5P
 !) 6P
 "%P
 D>P
 $D>P
 'tnP
 d^P
 TNP
  !.!P
" 
uU\"$PP	Q#P
 P
 P
 P
 P
 P
 P
 P
r>   rD  c            '       
    e Zd ZddgZ fdZd Zd Zd Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e
j                 de	e         de	e         de	e         de	e         de	e         de	e         de	e         de	e         de	e
j                 deee
j                 ef         f$dZd! fd	Z xZS )"BlipTextLMHeadModelzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t                                          |           t          |d          | _        t	          |          | _        |j        | _        d S )NF)rI  )r&   r'   rD  r3  r+  clslabel_smoothingr:   s     r=   r'   zBlipTextLMHeadModel.__init__J  sO       !&EBBB	&v..%5r>   c                 4    | j                                         S ri   )r3  rL  rn   s    r=   rL  z(BlipTextLMHeadModel.get_input_embeddingsQ  s    y--///r>   c                 :    | j                             |           d S ri   )r3  rN  r;   new_embeddingss     r=   rN  z(BlipTextLMHeadModel.set_input_embeddingsT  s    	&&~66666r>   c                 $    | j         j        j        S ri   )rs  r-  r$  rn   s    r=   get_output_embeddingsz)BlipTextLMHeadModel.get_output_embeddingsW  s    x#++r>   c                 T    || j         j        _        |j        | j         j        _        d S ri   )rs  r-  r$  r"  rw  s     r=   set_output_embeddingsz)BlipTextLMHeadModel.set_output_embeddingsZ  s%    '5$$2$7!!!r>   NFTr5  r?   r}   r!   r~   r@   r   r   labelsrw   r   r   r   r   return_logitsr   	reductionr   rB   c                 b   ||n| j         j        }|d}
|                     ||||||||	|
|||||          }|d         }|                     |          }|r#|ddddddf                                         S d}||ddddddf                                         }|ddddf                                                             |j                  }t          || j                  } ||	                    d| j         j
                  |	                    d                    }|dk    r<|	                    |                    d          d                              d          }|s|f|d	d         z   }||f|z   n|S t          |||j        |j        |j        |j        
          S )a  
        encoder_hidden_states (`torch.FloatTensor`, *optional*): Sequence of
            hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is
            configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
        past_key_values (`Cache`, *optional*):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        NF)r}   r!   r~   r@   r   r   rw   r   r   r   r   r   r   r   r"   r   )r  rt  nonerX   )losslogitsrw   r|   r  r  )r9   re  r3  rs  r   r   r   r   rt  r   r)   rD   sumr   rw   r|   r  r  )r;   r?   r}   r!   r~   r@   r   r   r}  rw   r   r   r   r   r~  r   r  r   r   r.  r0  lm_lossshifted_prediction_scoresloss_fctr   s                            r=   rH   zBlipTextLMHeadModel.forward^  s   T &1%<kk$+B]I)))%'"7#9+/!5#!)  
 
" "!* HH_55 	=$QQQQQQY/::<<<(9!!!SbS!!!)(D(O(O(Q(Q%AAAqrrE]--//223L3STTF')TMabbbHh8==b$+BXYY[a[f[fgi[j[jkkGF""!,,'8'='=a'@'@"EEII!LL 	L')GABBK7F,3,?WJ''VK0$#3!/)$5
 
 
 	
r>   c                 L     t                      j        |f||d|}d|d<   |S )N)rw   r}   Tr   )r&   prepare_inputs_for_generation)r;   r?   rw   r}   model_kwargsmodel_inputsr<   s         r=   r  z1BlipTextLMHeadModel.prepare_inputs_for_generation  sL     =uww<
+)
 
 	
 
 &*\"r>   )NNNNNNNNNNNNNFTr5  N)NN)rI   rJ   rK   _tied_weights_keysr'   rL  rN  rz  r|  r   r5   r   r   r   r   r   r   r   rH   r  rP   rQ   s   @r=   rq  rq  G  s'       :<Z[6 6 6 6 60 0 07 7 7, , ,8 8 8 -115/3,0048<9=)-+/$(,0/3&*(-%)#)15%Z
 Z
EL)Z
 !.Z
 u|,	Z

 EL)Z
  -Z
  (5Z
 !) 6Z
 &Z
 "%Z
 D>Z
 $D>Z
 'tnZ
 d^Z
  ~Z
  TN!Z
" C=#Z
$ !.%Z
& 
uU\"$EE	F'Z
 Z
 Z
 Z
x         r>   rq  )rD  rq  r2  )8r   typingr   r   r5   r   r   r   torch.nnr   activationsr
   cache_utilsr   r   r   
generationr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   utils.deprecationr   configuration_blipr   
get_loggerrI   r  Moduler   rS   r   r   r   r   r   r   r  r  r   r+  r2  rD  rq  __all__r   r>   r=   <module>r     s-  "  " " " " " " " "  $ $ $ $ $ $ $ $ $ $ % % % % % % ! ! ! ! ! ! C C C C C C C C C C ) ) ) ) ) ) 9 9 9 9 9 9         
 . - - - - - l l l l l l l l l l       0 0 0 0 0 0 . . . . . . 
	H	%	%0 0 0 0 0 0 0 0hR. R. R. R. R.BI R. R. R.l       / / / / /	 / / /f    29        RY   : : : : :. : : :|^
 ^
 ^
 ^
 ^
bi ^
 ^
 ^
D    RY        bi   $    ry   0! ! ! ! !") ! ! !% % % % %o % % %0p
 p
 p
 p
 p
+ p
 p
 p
h~ ~ ~ ~ ~1? ~ ~ ~B N
M
Mr>   