
     `iL                    &   d Z ddlZddlmZ ddlmZ ddlmZmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+  e$j,        e-          Z.dZ/e e#d           G d de                                  Z0e e#d           G d de                                  Z1 G d de
j2                  Z3 G d de
j2                  Z4 G d d e
j2                  Z5 G d! d"e
j2                  Z6 G d# d$e
j2                  Z7 G d% d&e
j2                  Z8 G d' d(e
j2                  Z9 G d) d*e
j2                  Z: G d+ d,e
j2                  Z; G d- d.e
j2                  Z<d/e<iZ= G d0 d1e
j2                  Z> G d2 d3e
j2                  Z? G d4 d5e          Z@ G d6 d7e
j2                  ZA G d8 d9e
j2                  ZBdWd:ZCe# G d; d<e                      ZD G d= d>eD          ZE e#d?           G d@ dAeD                      ZF e#dB           G dC dDeD                      ZG G dE dFe
j2                  ZH G dG dHe
j2                  ZI G dI dJe
j2                  ZJ e#dK           G dL dMeD                      ZK e#dN           G dO dPeD                      ZL G dQ dRe
j2                  ZM e#dS           G dT dUeD                      ZNg dVZOdS )XzPyTorch BridgeTower Model    N)OrderedDict)	dataclass)OptionalUnion)nn)CrossEntropyLoss   )ACT2FNQuickGELUActivation)CacheDynamicCacheEncoderDecoderCache)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputModelOutputSequenceClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int)deprecate_kwarg   )BridgeTowerConfigBridgeTowerTextConfigBridgeTowerVisionConfigRobertaTokenizerz.
    Output type of [`BridgeTowerModel`].
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeeej                          ed<   dZeeej                          ed<   dS )BridgeTowerModelOutputa  
    text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
        Sequence of hidden-states at the text output of the last layer of the model.
    image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
        Sequence of hidden-states at the image output of the last layer of the model.
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
        Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
        token), respectively, after further processing through layers used for auxiliary pretraining tasks.
    Ntext_featuresimage_featurespooler_outputhidden_states
attentions)__name__
__module____qualname____doc__r%   r   torchFloatTensor__annotations__r&   r'   r(   tupler)        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/bridgetower/modeling_bridgetower.pyr$   r$   0   s           26M8E-.55526NHU./66615M8E-.5558<M8E%"345<<<59Ju01299999r3   r$   z>
    Output type of ['BridgeTowerForContrastiveLearning']
    c                   L   e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed	<   dS )
BridgeTowerContrastiveOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Image-text contrastive loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    cross_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    Nlosslogitstext_embedsimage_embedscross_embedsr(   r)   )r*   r+   r,   r-   r7   r   r.   r/   r0   r8   r9   r1   r:   r;   r(   r)   r2   r3   r4   r6   r6   H   s            )-D(5$
%,,,*.FHU&'...6:K% 123:::7;L(5!234;;;7;L(5!234;;;8<M8E%"345<<<59Ju01299999r3   r6   c                   t     e Zd Z fdZdej        dej        fdZddej        deej                 fdZ xZ	S )BridgeTowerResidualAttentionc                 .   t                                                       t          j        |j        |j        dz            | _        t          j        |j        |j                  | _        t          j	        t          dt          j        |j        |j        dz            fdt                      fdt          j        |j        dz  |j                  fg                    | _        t          j        |j        |j                  | _        d | _        d S )N@   epsc_fc   geluc_proj)super__init__r   MultiheadAttentionhidden_sizeattn	LayerNormlayer_norm_epsln_1
ModuleDictr   Linearr   mlpln_2	attn_maskselfconfig	__class__s     r4   rG   z%BridgeTowerResidualAttention.__init__i   s    )&*<f>PTV>VWW	L!39NOOO	=RYv'96;MPQ;QRRS0223ry);a)?ASTTU 
 
 L!39NOOO	r3   hidden_stateattention_maskc                    |&|                     t          j        |j                  }| j        &| j                             |j        |j                  nd | _        |                     |||d| j        |          d         S )NdtypedeviceF)need_weightsrR   key_padding_maskr   )tor.   boolr\   rR   r[   rJ   )rT   rW   rX   s      r4   	attentionz&BridgeTowerResidualAttention.attentionz   s    %+..UZH[.\\N ~) NL$6|?RSSS 	
 yyn+  
 
  	r3   Nc                     ||                      |                     |          |          z   }|                     |          }| j                                        D ]} ||          }||z   }|S N)ra   rM   rQ   rP   values)rT   rW   rX   residual_statelayers        r4   forwardz$BridgeTowerResidualAttention.forward   st    %tyy7N7NP^(_(__yy00X__&& 	/ 	/E 5..LL%4r3   rc   )
r*   r+   r,   rG   r.   Tensorra   r   rg   __classcell__rV   s   @r4   r=   r=   h   s            "el EL    " EL (5<BX        r3   r=   c                   P     e Zd Z fdZddej        deej                 fdZ xZS )BridgeTowerTransformerc                    t                                                       j        | _        j        | _        j        r;t          j        fdt          | j        dz
            D                       | _        n7t          j        fdt          | j                  D                       | _        j	        | _	        d S )Nc                 .    g | ]}t                    S r2   r=   .0_rU   s     r4   
<listcomp>z3BridgeTowerTransformer.__init__.<locals>.<listcomp>   s"    aaa!-f55aaar3   r   c                 .    g | ]}t                    S r2   ro   rp   s     r4   rs   z3BridgeTowerTransformer.__init__.<locals>.<listcomp>   s"    ]]]!-f55]]]r3   )
rF   rG   rI   num_hidden_layersremove_last_layerr   
ModuleListrange	resblocksstop_gradientrS   s    `r4   rG   zBridgeTowerTransformer.__init__   s    !-!'!9# 	]aaaauTE[^_E_?`?`aaa DNN  ]]]]]uTE[?\?\]]] DN $1r3   NrW   rX   c                     g }| j         D ]R} |||          }| j        r(|                    |                                           =|                    |           S|S rc   )ry   rz   appenddetach)rT   rW   rX   r(   blocks        r4   rg   zBridgeTowerTransformer.forward   st    ^ 	3 	3E 5~>>L! 3$$\%8%8%:%:;;;;$$\2222r3   rc   )	r*   r+   r,   rG   r.   rh   r   rg   ri   rj   s   @r4   rl   rl      sh        2 2 2 2 2 EL (5<BX        r3   rl   c                   v     e Zd Zdef fdZdej        dededej        fdZdd	ej	        dej        fd
Z
 xZS )BridgeTowerVisionEmbeddingsrU   c                 z   t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        | j                            | _        t          j        |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)rF   rG   rU   rI   	embed_dim
image_size
patch_sizer   	Parameterr.   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandrS   s     r4   rG   z$BridgeTowerVisionEmbeddings.__init__   s   + + +!|EK,G,GHH!y+? 
  
  
 !Ot>1D!-1"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr3   
embeddingsheightwidthreturnc                    |j         d         dz
  }| j        j                            d          }|j         d         dz
  }t          j                                        s&||k    r ||k    r|                     | j                  S |ddddf         }|ddddf         }|j         d         }	|| j        z  }
|| j        z  }t          |dz            }|
                    d|||	          }|                    dddd          }t          j                            ||
|fdd	
          }|                    dddd                              dd|	          }t	          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr         ?r	   r   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer.   jit
is_tracingr   r   r   reshapepermuter   
functionalinterpolateviewcat)rT   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r4   interpolate_pos_encodingz4BridgeTowerVisionEmbeddings.interpolate_pos_encoding   s    !&q)A-!4;EEaHH*03a7 y##%% 	>+*F*F6UZ??**4+<===,QQQU3,QQQU3r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr3   Fpixel_valuesc                 <   |j         \  }}}}|s<|| j        k    s|| j        k    r&t          d| d| d| j         d| j         d	          | j        j        j        }|                     |                    |                    }|                    d                              dd          }| j	        
                    |dd          }	t          j        |	|gd	          }
|r|
|                     |
||          z   }
n|
|                     | j                  z   }
|
S )
NzInput image size (*z) doesn't match model (z).r[   r   r   r   r   )r   r   
ValueErrorr   r   r[   r_   flatten	transposer   r   r.   r   r   r   r   )rT   r   r   
batch_sizerr   r   r   target_dtypepatch_embedsclass_embedsr   s              r4   rg   z#BridgeTowerVisionEmbeddings.forward   sD   '3'9$
Avu' 	Vt-F-F%SWSbJbJbqVqqeqqDOqq^b^mqqq   +28++LOO,O,O,OPP#++A..88A>>+22:q"EEYl;CCC
# 	Q#d&C&CJPVX]&^&^^JJ#d&=&=d>O&P&PPJr3   F)r*   r+   r,   r    rG   r.   rh   intr   r/   rg   ri   rj   s   @r4   r   r      s        q6 q q q q q q,'D5< 'D 'DUX 'D]b]i 'D 'D 'D 'DR E$5 Z_Zf        r3   r   c                   p     e Zd Z fdZ	 d	dej        defdZ	 d	dej        defdZdej        fdZ	 xZ
S )
BridgeTowerVisionTransformerc                    t                                                       t                    | _        t	          j        j        j                  | _        t                    | _
        t	          j        j        j                  | _        j        | _        j        s9t	          j        fdt          j                  D                       | _        d S d S )Nr@   c                 P    g | ]"}t          j        j        j                   #S )r@   )r   rK   rI   rL   rp   s     r4   rs   z9BridgeTowerVisionTransformer.__init__.<locals>.<listcomp>  s/    vvvQRf0f6KLLLvvvr3   )rF   rG   r   r   r   rK   rI   rL   ln_prerl   transformerln_postshare_layernormrw   rx   ru   ln_separaterS   s    `r4   rG   z%BridgeTowerVisionTransformer.__init__  s    5f==l6#56;PQQQ1&99|F$6F<QRRR%5% 	!}vvvvV[\b\tVuVuvvv   D	 	r3   Fr   r   c                    |                      ||          }|                     |          }|                    ddd          }|                     ||          }t	          j        |d          }|                    dddd          }| j        r|                     |          }nSg }t          || j	                  D ]%\  }} ||          }|
                    |           &t	          j        |d          }|S )Nr   r   r   r   r	   )r   r   r   r   r.   stackr   r   zipr   r|   )rT   r   rX   r   r(   hidden_states_stacklns          r4   rg   z$BridgeTowerVisionTransformer.forward  s	    6NOOM22%--aA66((GGMq999%--aAq99 	D LL77MM"$%(8H%I%I : :!r "= 1 1#**=9999!K(;CCCMr3   c                     |                      ||          }|                     |          }|                    ddd          }|S )Nr   r   r   r   )r   r   r   )rT   r   r   r(   s       r4   forward_prez(BridgeTowerVisionTransformer.forward_pre+  sH    
 OghhM22%--aA66r3   rW   c                 ^    |                     ddd          }|                     |          }|S )Nr   r   r   )r   r   )rT   rW   visual_output_posts      r4   forward_postz)BridgeTowerVisionTransformer.forward_post6  s3    )11!Q::!\\*<==!!r3   r   )r*   r+   r,   rG   r.   rh   r`   rg   r   r   ri   rj   s   @r4   r   r     s            " */	 l #'	   < */	 	l	 #'	 	 	 	" " " " " " " " "r3   r   c                   $     e Zd Z fdZd Z xZS )BridgeTowerLinkTowerc                    t                                                       |j        | _        |j        | _        |j        dv r|j        dk    r,t	          j        t          j        d                    | _        n6|j        dk    r+t	          j        t          j        d                    | _	        t	          j
        | j        |j                  | _
        d S t          d|j         d          )	N)add
scaled_addr   r         ?r   r   r@   link_tower_type  is not implemented)rF   rG   link_tower_typerI   r   r   r.   tensorscaled_factorbetarK   rL   NotImplementedErrorrS   s     r4   rG   zBridgeTowerLinkTower.__init__=  s    %5!-!%III%55%'\%,s2C2C%D%D""'=88Lc):):;;	\$*:@UVVVDNNN%&d9O&d&d&deeer3   c                 :   | j         dk    r|                     ||z             S | j         dk    r |                     || j        z  |z             S | j         dk    r+|                     |d| j        z
  z  || j        z  z             S t	          d| j          d          )Nr   r   r   r   r   r   )r   rK   r   r   r   )rT   r(   cross_modal_hidden_statesrX   s       r4   rg   zBridgeTowerLinkTower.forwardJ  s    5((>>-2K"KLLL!\11>>-$2D"DG`"`aaa!]22>>-1ty="AD]`d`iDi"ijjj%&b9M&b&b&bcccr3   r*   r+   r,   rG   rg   ri   rj   s   @r4   r   r   <  sS        f f f f fd d d d d d dr3   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )BridgeTowerSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S Nr@   )rF   rG   r   rO   rI   denserK   rL   Dropouthidden_dropout_probdropoutrS   s     r4   rG   zBridgeTowerSelfOutput.__init__W  sf    Yv163EFF
f&8f>STTTz&"<==r3   r(   input_tensorr   c                     |                      |          }|                     |          }|                     ||z             }|S rc   r   r   rK   rT   r(   r   s      r4   rg   zBridgeTowerSelfOutput.forward]  @    

=11]33}|'CDDr3   r*   r+   r,   rG   r.   rh   rg   ri   rj   s   @r4   r   r   V  i        > > > > >U\  RWR^        r3   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )BridgeTowerIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S rc   )rF   rG   r   rO   rI   intermediate_sizer   
isinstance
hidden_actstrr
   intermediate_act_fnrS   s     r4   rG   z BridgeTowerIntermediate.__init__f  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r3   r(   r   c                 Z    |                      |          }|                     |          }|S rc   )r   r  rT   r(   s     r4   rg   zBridgeTowerIntermediate.forwardn  s,    

=1100??r3   r   rj   s   @r4   r   r   e  s^        9 9 9 9 9U\ el        r3   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )BridgeTowerOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )rF   rG   r   rO   r   rI   r   rK   rL   r   r   r   rS   s     r4   rG   zBridgeTowerOutput.__init__v  sf    Yv79KLL
f&8f>STTTz&"<==r3   r(   r   r   c                     |                      |          }|                     |          }|                     ||z             }|S rc   r   r   s      r4   rg   zBridgeTowerOutput.forward|  r   r3   r   rj   s   @r4   r  r  u  r   r3   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )BridgeTowerPoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S rc   )rF   rG   r   rO   rI   r   Tanh
activationrS   s     r4   rG   zBridgeTowerPooler.__init__  sC    Yv163EFF
'))r3   r(   r   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   r  )rT   r(   first_token_tensorpooled_outputs       r4   rg   zBridgeTowerPooler.forward  s@     +111a40

#56666r3   r   rj   s   @r4   r	  r	    s^        $ $ $ $ $
U\ el        r3   r	  c                       e Zd Zd fd	Z eddd          	 	 	 	 	 	 ddej        d	eej                 d
eej                 deej                 dee	         dee
         deej                 deej                 fd            Z xZS )BridgeTowerSelfAttentionNc                 R   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j        |j                  | _        |pt#          |dd          | _        | j        dk    s| j        d	k    r6|j        | _        t          j        d
|j        z  dz
  | j                  | _        |j        | _        || _        d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()position_embedding_typeabsoluterelative_keyrelative_key_queryr   r   )rF   rG   rI   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer   rO   querykeyvaluer   attention_probs_dropout_probr   getattrr  max_position_embeddingsr   distance_embedding
is_decoder	layer_idxrT   rU   r  r%  rV   s       r4   rG   z!BridgeTowerSelfAttention.__init__  s    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
z&"EFF'> (
'-zC
 C
$ '>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD# +"r3   past_key_valuepast_key_values4.58new_nameversionFr(   rX   	head_maskencoder_hidden_statesoutput_attentionscache_positionr   c                    |j         \  }}	}
|                     |          }|                    |d| j        | j                                      dd          }d}|d u}|Ht          |t                    r1|j        	                    | j
                  }|r|j        }n
|j        }n|}|r|n|}|r3|1|r/|j        | j
                 j        }|j        | j
                 j        }n|                     |          }|                    |d| j        | j                                      dd          }|                     |          }|                    |d| j        | j                                      dd          }|N|s|nd }|                    ||| j
        d|i          \  }}|r$t          |t                    rd|j        | j
        <   t'          j        ||                    dd                    }| j        dk    s| j        d	k    rt|j         d         |j         d         }}|>t'          j        |dz
  t&          j        |j        
                              dd          }n:t'          j        |t&          j        |j        
                              dd          }t'          j        |t&          j        |j        
                              dd          }||z
  }|                     || j        z   dz
            }|                    |j                  }| j        dk    rt'          j        d||          }||z   }n?| j        d	k    r4t'          j        d||          }t'          j        d||          }||z   |z   }|t?          j         | j                  z  }|||z   }tB          j"        #                    |d          }| $                    |          }|||z  }t'          j        ||          }|%                    dddd          &                                }|'                                d d         | j(        fz   }|                    |          }||fS )Nr   r   r   Fr0  Tr  r  rZ   r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   r   r	   ))r   r  r   r  r  r   r   r   
is_updatedgetr%  cross_attention_cacheself_attention_cachelayerskeysrd   r  r  updater.   matmulr  r   longr\   r   r#  r"  r_   r[   einsummathsqrtr   r   softmaxr   r   
contiguousr   r  )rT   r(   rX   r-  r.  r(  r/  r0  r   
seq_lengthrr   query_layerr3  is_cross_attentioncurr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                  r4   rg   z BridgeTowerSelfAttention.forward  s    %2$7!
Jjj//!&&z2t7OQUQijjttq
 
 
2$>&/+>?? 6,7;;DNKK
% O*9*O''*9*N''&5#2DW..- 	F/"=*"=+24>BGI-4T^DKKK00I!z2t7OQUQijjtt1 I **^44K%**B 8$:R i1oo  *7I!St)<)C)C{DN=M~<^* *&	; & F*_FY*Z*Z FAEO.t~> !<Y5H5HR5P5PQQ'>99T=Y]q=q=q'2'8';Y_Q=O*L*!&j1nEJWdWk!l!l!l!q!q" " "'l%*UbUi!j!j!j!o!oprtu!v!v"\*EJ}OcdddiijkmoppN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s +di8P.Q.QQ%/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCCo--r3   NNNNNNFN)r*   r+   r,   rG   r   r.   rh   r   r/   r   r`   r1   rg   ri   rj   s   @r4   r  r    s       # # # # # #6 _%0A6RRR 7;15=A+/,115e. e.|e. !!23e. E-.	e.
  ((9:e. "%e. $D>e. !.e. 
u|	e. e. e. SRe. e. e. e. e.r3   r  eagerc                       e Zd Zd fd	Zd Z eddd          	 	 	 	 	 	 dd	ej        d
eej	                 deej	                 deej	                 dee
         dee         deej                 deej                 fd            Z xZS )BridgeTowerAttentionNc                     t                                                       t          |j                 |||          | _        t          |          | _        t                      | _        d S )Nr  r%  )	rF   rG   #BRIDGE_TOWER_SELF_ATTENTION_CLASSES_attn_implementationrT   r   outputsetpruned_headsr&  s       r4   rG   zBridgeTowerAttention.__init__   sc    78ST$;
 
 
	
 ,F33EEr3   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )lenr   rT   r  r  r`  r   r  r  r  r^  r   r  union)rT   headsindexs      r4   prune_headsz BridgeTowerAttention.prune_heads*  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r3   r'  r(  r)  r*  Fr(   rX   r-  r.  r/  r0  r   c           	          |                      |||||||          }|                     |d         |          }	|	f|dd          z   }
|
S )NrX   r-  r.  r(  r/  r0  r   r   )rT   r^  )rT   r(   rX   r-  r.  r(  r/  r0  self_outputsattention_outputoutputss              r4   rg   zBridgeTowerAttention.forward<  sf     yy)"7+/) ! 
 
  ;;|AFF#%QRR(88r3   rU  rV  )r*   r+   r,   rG   rf  r   r.   rh   r   r/   r   r`   r1   rg   ri   rj   s   @r4   rY  rY    s       " " " " " "; ; ;$ _%0A6RRR 7;15=A+/,115 | !!23 E-.	
  ((9: "% $D> !. 
u|	   SR    r3   rY  c                   b     e Zd Zd
 fd	Z eddd          	 	 	 	 	 	 dd            Zd	 Z xZS )BridgeTowerBertCrossLayerNc                 F   t                                                       |j        | _        d| _        t	          ||          | _        |j        | _        |j        | _        t	          ||          | _        t          |          | _
        t          |          | _        d S )Nr   r%  )rF   rG   chunk_size_feed_forwardseq_len_dimrY  ra   r$  add_cross_attentioncrossattentionr   intermediater  r^  rT   rU   r%  rV   s      r4   rG   z"BridgeTowerBertCrossLayer.__init__V  s    '-'E$-f	JJJ +#)#= 26YOOO3F;;'//r3   r'  r(  r)  r*  Fc	           	         |                      ||d |d           }	|	d         }
|	dd          }|                     |
||||||          }|d         }
||dd          z   }t          | j        | j        | j        |
          }|f|z   }|S )N)rX   r-  r/  r(  r   r   rh  )ra   rs  r   feed_forward_chunkrp  rq  )rT   r(   r.  rX   r-  encoder_attention_maskr(  r/  r0  self_attention_outputsrj  rk  cross_attention_outputslayer_outputs                 r4   rg   z!BridgeTowerBertCrossLayer.forwarda  s     "&)/  "0 "
 "
 2!4 ),"&"5"51"7+/) #6 #
 #
 3153ABB770#T%A4CSUe
 
  /G+r3   c                 \    |                      |          }|                     ||          }|S rc   rt  r^  rT   rj  intermediate_outputr{  s       r4   rw  z,BridgeTowerBertCrossLayer.feed_forward_chunk  2    "//0@AA{{#68HIIr3   rc   rV  )r*   r+   r,   rG   r   rg   rw  ri   rj   s   @r4   rm  rm  U  s        	0 	0 	0 	0 	0 	0 _%0A6RRR
 #+ + + SR+Z      r3   rm  c                   0    e Zd Zd fd	Z eddd          	 	 	 	 	 	 	 ddej        d	eej                 d
eej                 deej                 deej                 dee	         dee
         deej                 deej                 fd            Zd Z xZS )BridgeTowerTextLayerNc                    t                                                       |j        | _        d| _        t	          ||          | _        |j        | _        |j        | _        | j        r0| j        st          |  d          t	          |d|          | _	        t          |          | _        t          |          | _        d S )Nr   ro  z> should be used as a decoder model if cross attention is addedr  r[  )rF   rG   rp  rq  rY  ra   r$  rr  r   rs  r   rt  r  r^  ru  s      r4   rG   zBridgeTowerTextLayer.__init__  s    '-'E$-f	JJJ +#)#= # 	x? j D!h!h!hiii"6vWamv"w"w"wD3F;;'//r3   r'  r(  r)  r*  Fr(   rX   r-  r.  rx  r/  r0  r   c	           	         |                      ||||||          }	|	d         }
| j        r|	dd         }n
|	dd          }| j        rV|Tt          | d          st          d|  d          |                     |
||||||          }|d         }
||dd         z   }t          | j        | j        | j        |
          }|f|z   S )	N)rX   r-  r/  r(  r0  r   r   r   rs  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`rh  )	ra   r$  r  r   rs  r   rw  rp  rq  )rT   r(   rX   r-  r.  rx  r(  r/  r0  ry  rj  rk  rz  r{  s                 r4   rg   zBridgeTowerTextLayer.forward  s9    "&)/+) "0 "
 "
 2!4 ? 	1,QrT2GG,QRR0G? 	>4@4!122  Dd D D D  
 '+&9&9 5#&; /"3- ': ' '#  7q9 7" ==G0#T%A4CSUe
 
 ((r3   c                 \    |                      |          }|                     ||          }|S rc   r}  r~  s       r4   rw  z'BridgeTowerTextLayer.feed_forward_chunk  r  r3   rc   )NNNNNFN)r*   r+   r,   rG   r   r.   rh   r   r/   r   r`   r1   rg   rw  ri   rj   s   @r4   r  r    s.       0 0 0 0 0 0 _%0A6RRR 7;15=A>B+/,1152) 2)|2) !!232) E-.	2)
  ((9:2) !)): ;2) "%2) $D>2) !.2) 
u|	2) 2) 2) SR2)h      r3   r  c                   H    e Zd Zd fd	Z	 	 	 	 	 	 	 	 	 	 ddej        deej                 deej                 deej                 d	eej                 d
ee         dee	         dee	         dee	         dee	         deej                 de
eej                 ef         fdZ xZS )BridgeTowerTextEncoderNc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 2    g | ]}t          |           S ro  )r  )rq   irU   s     r4   rs   z3BridgeTowerTextEncoder.__init__.<locals>.<listcomp>  s'    ```1!&A666```r3   F)	rF   rG   rU   r   rw   rx   ru   rf   gradient_checkpointingru  s    ` r4   rG   zBridgeTowerTextEncoder.__init__  sf    ]````fF^@_@_```
 

 ',###r3   FTr(   rX   r-  r.  rx  r(  	use_cacher/  output_hidden_statesreturn_dictr0  r   c                    |	rdnd }|rdnd }|r| j         j        rdnd }| j        r%| j        r|rt                              d           d}|rD| j         j        r8|6t          t          | j                   t          | j                             }|rO| j         j        rCt          |t                    r.t                              d           t          j        |          }t          | j                  D ]Z\  }}|	r||fz   }|||         nd } |||||||||          }|d         }|r$||d         fz   }| j         j        r||d	         fz   }[|	r||fz   }|
st          d
 |||||fD                       S t          |||||          S )Nr2   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rU   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.)rx  r(  r/  r0  r   r   r   c              3      K   | ]}||V  	d S rc   r2   rq   vs     r4   	<genexpr>z1BridgeTowerTextEncoder.forward.<locals>.<genexpr>(  s4       
 
 =  !===
 
r3   )last_hidden_stater(  r(   r)   cross_attentions)rU   rr  r  trainingloggerwarning_oncer$  r   r   r   r1   from_legacy_cache	enumeraterf   r   )rT   r(   rX   r-  r.  rx  r(  r  r/  r  r  r0  all_hidden_statesall_self_attentionsall_cross_attentionsr  layer_modulelayer_head_masklayer_outputss                      r4   rg   zBridgeTowerTextEncoder.forward  sc    #7@BBD$5?bb4%6d4;;Zdrr`d& 	"4= 	" "##p   "	 	v/ 	vO4K1,dk2R2R2RT`hlhsTtTtTtuuO 	U/ 	UJPU4V4V 	U\  
 2COTTO(44 	V 	VOA|# I$58H$H!.7.CillO(L%'= /"3-	 	 	M *!,M  V&9]1=M<O&O#;2 V+?=QRCSBU+U( 	E 1]4D D 	 
 
 "#%'(
 
 
 
 
 
 9+++*1
 
 
 	
r3   rc   )
NNNNNNFFTN)r*   r+   r,   rG   r.   rh   r   r/   r   r`   r   r1   r   rg   ri   rj   s   @r4   r  r    sP       , , , , , , 7;15=A>B+/$(,1/4&*15P
 P
|P
 !!23P
 E-.	P

  ((9:P
 !)): ;P
 "%P
 D>P
 $D>P
 'tnP
 d^P
 !.P
 
uU\"$MM	NP
 P
 P
 P
 P
 P
 P
 P
r3   r  c                   2     e Zd ZdZ fdZ	 ddZd Z xZS )BridgeTowerTextEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                 l   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        t#          |dd          | _        |                     dt)          j        |j                                      d          d           |                     d	t)          j        | j                                        t(          j        
          d           |j        | _        t          j        |j        |j        | j                  | _	        d S )N)padding_idxr@   r  r  r   r   Fr   token_type_idsr   )rF   rG   r   r   
vocab_sizerI   pad_token_idword_embeddingsr"  position_embeddingstype_vocab_sizetoken_type_embeddingsrK   rL   r   r   r   r!  r  r   r.   r   r   zerosr   r   r;  r  rS   s     r4   rG   z"BridgeTowerTextEmbeddings.__init__C  s}   !|F,=v?Q_e_rsss#%<0NPVPb#c#c %'\&2H&J\%]%]" f&8f>STTTz&"<=='.v7PR\']']$EL)GHHOOPWXXej 	 	
 	
 	
 	ek$*;*@*@*B*B%*UUUbg 	 	
 	
 	

 ".#%<*F,>DL\$
 $
 $
   r3   Nr   c                    |.|t          || j        |          }n|                     |          }||                                }n|                                d d         }|d         }|mt	          | d          r2| j        d d d |f         }|                    |d         |          }	|	}n+t          j        |t          j	        | j
        j                  }||                     |          }|                     |          }
||
z   }| j        dk    r|                     |          }||z  }|                     |          }|                     |          }|S )Nr   r   r  r   rZ   r  )"create_position_ids_from_input_idsr  &create_position_ids_from_inputs_embedsr   r  r  r   r.   r  r;  r   r\   r  r  r  r  rK   r   )rT   	input_idsr  r   inputs_embedspast_key_values_lengthinput_shaperA  buffered_token_type_ids buffered_token_type_ids_expandedr  r   r  s                r4   rg   z!BridgeTowerTextEmbeddings.forward\  s{    $A)TM]_uvv#JJ=YY #..**KK',,..ss3K ^

 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M $ : :> J J"%::
':55"&":":<"H"H--J^^J//
\\*--
r3   c                    |                                 dd         }|d         }t          j        | j        dz   || j        z   dz   t          j        |j                  }|                    d                              |          S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr   r   rZ   r   )r   r.   r   r  r;  r\   r   r   )rT   r  r  sequence_lengthr   s        r4   r  z@BridgeTowerTextEmbeddings.create_position_ids_from_inputs_embeds  s     $((**3B3/%a.|q /D4D"Dq"HPUPZcpcw
 
 
 %%a((//<<<r3   )NNNNr   )r*   r+   r,   r-   rG   rg   r  ri   rj   s   @r4   r  r  =  sm         

 
 
 
 
4 rs& & & &P= = = = = = =r3   r  c                     |                      |                                          }t          j        |d                              |          |z   |z  }|                                |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r   )ner   r.   cumsumtype_asr;  )r  r  r  maskincremental_indicess        r4   r  r    sg     <<$$((**D <!444<<TBBE[[_cc##%%33r3   c                   D    e Zd ZU eed<   dZdZddgZdZde	j
        fdZd	S )
BridgeTowerPreTrainedModelrU   bridgetowerFr  r=   r(  modulec                    | j         j        }t          |t                    r| j         j        dz  d| j         j        z  dz  z  }| j         j        dz  }d| j         j        z  dz  }|j        j        D ]}t          j	        
                    |j        j        ||z             |j        j        j                                         t          j	        
                    |j        j        j        ||z             t          j	        
                    |j        j        j        ||z             t          j	        
                    |j        j        j        ||z             t          j	        
                    |j        j        ||z             t          j	        
                    |j        j        j        ||z             nt          |t          j        t          j        t          j        f          r%|j        j        
                    dd|z             nt          |t          j                  r>|j        j                                         |j        j                            d           n>t          |t<                    r)|j        j                            | j         j                    t          |t          j        tB          f          r'|j        "|j        j                                         d S d S d S )Ng      r   )stdg        g?)meanr  r   )"rU   initializer_factorr   r   rI   ru   r   ry   r   initnormal_rJ   in_proj_weightin_proj_biasdatazero_out_projr   rP   rB   rE   r   r   r   rO   r   r   rK   r   fill_!BridgeTowerForContrastiveLearninglogit_scalelogit_scale_init_valueBridgeTowerMLMHead)rT   r  r  proj_stdattn_stdfc_stdr~   s          r4   _init_weightsz(BridgeTowerPreTrainedModel._init_weights  s   k,f:;; 	N/51t{?\;\ae:efH{.4H$+11d:F+5 M M
 9x#~NNN
',22444
 3 :3OOO	 56C<HHH	 0 7X^LLLLGOOF-=8c>ORRRGOOF-@GXX[^O\\\\BIr| DEE 	NM&&CTCZ&@@@@-- 	NK""$$$M$$S)))) ABB 	N#))$+*LMMMfry*<=>> 	%6;CZK""$$$$$	% 	%CZCZr3   N)r*   r+   r,   r   r0   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placementr   Moduler  r2   r3   r4   r  r    s]         %&+#35ST"3%BI % % % % % %r3   r  c                   H     e Zd ZU eed<    fdZed             ZddZ xZ	S )BridgeTowerVisionModelrU   c                 r    t                                          |           t          |          | _        d S rc   )rF   rG   r   visualrS   s     r4   rG   zBridgeTowerVisionModel.__init__  s.       26::r3   c                 8    | j         j        j        j        j        S rc   )r  r   r   r   r[   rT   s    r4   r[   zBridgeTowerVisionModel.dtype  s    {%5<BBr3   NFc                 `    |                      |                    | j                  ||          S rc   )r  typer[   )rT   image
image_maskr   s       r4   rg   zBridgeTowerVisionModel.forward  s'    {{5::dj11:?WXXXr3   )NF)
r*   r+   r,   r    r0   rG   propertyr[   rg   ri   rj   s   @r4   r  r    s         ####; ; ; ; ; C C XCY Y Y Y Y Y Y Yr3   r  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c            "           e Zd ZU eed<   d fd	Zd Zd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 	 	 dde
ej                 d	e
ej                 d
e
ej                 de
ej                 de
ej                 de
ej                 de
ej                 de
ej                 de
e         de
e         de
e         de
e         de
e         de
ej                 deeej                 ef         fd            Z xZS )BridgeTowerTextModelrU   Tc                     t                                          |           || _        t          |          | _        t          |          | _        |rt          |          nd| _        | 	                                 dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rF   rG   rU   r  r   r  encoderr	  pooler	post_init)rT   rU   add_pooling_layerrV   s      r4   rG   zBridgeTowerTextModel.__init__  st    
 	   3F;;-f553DN'///$ 	r3   c                     | j         j        S rc   r   r  r  s    r4   get_input_embeddingsz)BridgeTowerTextModel.get_input_embeddings  s    ..r3   c                     || j         _        d S rc   r  rT   r  s     r4   set_input_embeddingsz)BridgeTowerTextModel.set_input_embeddings  s    */'''r3   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  rf   ra   rf  )rT   heads_to_prunerf   rd  s       r4   _prune_headsz!BridgeTowerTextModel._prune_heads   sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr3   Nr  rX   r  r   r-  r  r.  rx  r(  r  r/  r  r  r0  r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }| j         j        r|
|
n| j         j        }
nd}
||t          d          |+|                     ||           |                                }n.||                                d d         }nt          d          |\  }}||j	        n|j	        }d}|	Bt          |	t                    s|	d         d         j        d         n|	                                }|t          j        |||z   f|          }|gt!          | j        d          r1| j        j        d d d |f         }|                    ||          }|}n!t          j        |t          j        |	          }|                     ||          }| j         j        rL|J|                                \  }}}||f}|t          j        ||          }|                     |          }nd }|                     || j         j                  }|                     |||||
          }|                     ||||||	|
||||          }|d         }| j        |                     |          nd }|s||f|dd          z   S t9          |||j        |j        |j        |j                   S )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   r2  r\   r  rZ   )r  r   r  r  r  )
rX   r-  r.  rx  r(  r  r/  r  r  r0  r   )r  r'   r(  r(   r)   r  )!rU   r/  r  use_return_dictr$  r  r   %warn_if_padding_and_no_attention_maskr   r\   r   r   r   get_seq_lengthr.   onesr  r   r  r   r  r;  get_extended_attention_maskinvert_attention_maskget_head_maskru   r  r  r   r(  r(   r)   r  ) rT   r  rX   r  r   r-  r  r.  rx  r(  r  r/  r  r  r0  r  r   rA  r\   r  r  r  extended_attention_maskencoder_batch_sizeencoder_sequence_lengthrr   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputr  s                                    r4   rg   zBridgeTowerTextModel.forward  s{   $ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B];! 	%.%:		@UIII ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"& "/5996"1%+B//$3355 # !"Z*jCY6Y)ZdjkkkN!t(899 [*./*HKZK*X'3J3Q3QR\^h3i3i0!A!&[
SY!Z!Z!Z 150P0PQ_al0m0m ;! 	3&;&G=R=W=W=Y=Y: 7$68O#P %-).4HQW)X)X)X&.2.H.HI_.`.`++.2+ &&y$+2OPP	??%)'#9 + 
 
 ,,2"7#B+/!5#) ' 
 
 *!,8<8OO444UY 	J#]3oabb6III;-'+;)7&1,=
 
 
 	
r3   )T)NNNNNNNNNNNNNN)r*   r+   r,   r   r0   rG   r  r  r  r   r   r.   rh   r   r`   r   r1   r   rg   ri   rj   s   @r4   r  r    s         "!!!      / / /0 0 0C C C  -11515/3,0048<9=+/$(,0/3&*15s
 s
EL)s
 !.s
 !.	s

 u|,s
 EL)s
  -s
  (5s
 !) 6s
 "%s
 D>s
 $D>s
 'tns
 d^s
 !.s
  
uU\"$PP	Q!s
 s
 s
 ^s
 s
 s
 s
 s
r3   r  zv
    The bare BridgeTower Model transformer outputting BridgeTowerModelOutput object without any specific head on
    c            "           e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej	                 d	eej
                 d
eej	                 deej
                 deej
                 deej
                 dee         dee         dee         dee         deej	                 dedeeej                 ef         fd            Zd Z xZS )BridgeTowerModelc                    t                                                     | _        j        j        j        rIt          j        j        j                  | _	        t          j        j        j                  | _
        npt          j        fdt          j                  D                       | _	        t          j        fdt          j                  D                       | _
        t          j        dj                  | _        t!                    | _        t%                    | _        j        sej        r^| j        j        j        D ]L}| j        j        j        j        j        |j        _        | j        j        j        j        j        |j        _        Mt          j        fdt          j                  D                       | _        t          j        fdt          j                  D                       | _        t=                    | _        t=                    | _         t          j!        j        j"                  | _#        t          j!        j        j"                  | _$        j%        r)tM                    | _'        tM                    | _(        ntt          j        fdt          j        dz
            D                       | _'        t          j        fd	t          j        dz
            D                       | _(        | )                                 d S )
Nc                 N    g | ]!}t          j        j        j                  "S r2   r   rO   rI   )rq   rr   rU   text_configs     r4   rs   z-BridgeTowerModel.__init__.<locals>.<listcomp>  s+    qqqA;2F4FGGqqqr3   c                 N    g | ]!}t          j        j        j                  "S r2   r  )rq   rr   rU   vision_configs     r4   rs   z-BridgeTowerModel.__init__.<locals>.<listcomp>  s+    sssa=4f6HIIsssr3   r   c                 2    g | ]}t          |           S r  rm  rq   r  r  s     r4   rs   z-BridgeTowerModel.__init__.<locals>.<listcomp>  '    jjjQ&{a@@@jjjr3   c                 2    g | ]}t          |           S r  r  r  s     r4   rs   z-BridgeTowerModel.__init__.<locals>.<listcomp>  r  r3   r@   c                 .    g | ]}t                    S r2   r   rp   s     r4   rs   z-BridgeTowerModel.__init__.<locals>.<listcomp>  "    [[[!%f--[[[r3   r   c                 .    g | ]}t                    S r2   r  rp   s     r4   rs   z-BridgeTowerModel.__init__.<locals>.<listcomp>  r  r3   )*rF   rG   rU   r  r  $share_cross_modal_transformer_layersr   rO   rI   cross_modal_text_transformcross_modal_image_transformrw   rx   ru   r   r  r  vision_modelr  
text_modelr   "init_layernorm_from_vision_encoderr  cross_modal_ln_separater   r   r  r   cross_modal_image_layerscross_modal_text_layersr	  cross_modal_image_poolercross_modal_text_poolerrK   rL   cross_modal_text_layernormcross_modal_image_layernormshare_link_tower_layersr   cross_modal_text_link_towercross_modal_image_link_towerr  )rT   rU   r   r  r  rV   s    ` @@r4   rG   zBridgeTowerModel.__init__  s      ,(6 		.0i8OQWQc.d.dD+/1y9RTZTf/g/gD,,.0mqqqqqQVW]WoQpQpqqq/ /D+ 02}sssssSXY_YqSrSrsss0 0D, &(\!V5G%H%H"2=AA.{;;, 	J1Z 	J'.F J J!%!2!9!A!H!M	#07?DI(*jjjj%PVPhJiJijjj)
 )
% (*}jjjj%PVPhJiJijjj(
 (
$
 ):&(A(A%'8'@'@$ +-,v7IvOd*e*e*e'+-<8JPVPe+f+f+f() 		/CF/K/KD,0DV0L0LD--/1}[[[[uV=UXY=Y7Z7Z[[[0 0D, 13[[[[uV=UXY=Y7Z7Z[[[1 1D- 	r3   c                 4    | j                                         S rc   )r  r  r  s    r4   r  z%BridgeTowerModel.get_input_embeddings  s    33555r3   c                 :    | j                             |           d S rc   )r  r  r  s     r4   r  z%BridgeTowerModel.set_input_embeddings  s    ,,U33333r3   NFr  rX   r  r   
pixel_maskr-  r  r:   image_token_type_idxr/  r  r  labelsr   r   c                    |
|
n| j         j        }
||n| j         j        }|rdnd}|rdnd}|rdnd}|rdnd}|
rdnd}||t          d          ||n| j         j        }|	r|	nd}	|                                }| j                            |          }|r||fz  }|&t          j	        |t          j
        |j                  }| j                            ||                              |j                  }t          | j        j        j                  | j         j        z
  dz   }| j        j        j        d|         D ]} |||          d         }|r||fz  }|?| j        j                            |                    | j        j                  |          }n|                    ddd	          }|r||fz  }| j        j        j        j        d|         D ]} ||          }|r||fz  }| j        j                            |                    | j        j                            }|                     |          }|                     t          j        dt          j
        |j                                                |          }|                     ||z             }|                      |          }|                     t          j!        d
|	t          j
        |j                                                |          }||z   }| "                    |          }t          j	        |                    d          |                    d          ft          j
        |j                  }| j                            ||                                                              |j                  } | j#        d         |||||
          } | d         }! | j$        d         |||||
          }"|"d         }#|r||!|#ffz  }|
r|| d         |"d         ffz  }d}$tK          |t          | j        j        j                            D ]h}% | j        j        j        |%         ||          d         } | j        j        j        j        |%         |                              | j        j                  }|                      | j        j                            |                    |z   }| j&        |$         }&| j'        |$         }' |&|                     |          |z   |!|          }( |'||#|          }) | j#        |$dz            |(|)|||
          } | d         }! | j$        |$dz            |)|(|||
          }"|"d         }#|$dz  }$|r||fz  }||fz  }||!|#ffz  }|
r|| d         |"d         ffz  }j|!|#}+}*| (                    |*|+          },|r|||f}|stS          d |*|+|,||fD                       S tU          |*|+|,||          S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        image_token_type_idx (`int`, *optional*):
            - The token type ids for images.
        output_hidden_states (`bool`, *optional*):
            If set to `True`, hidden states are returned as a list containing the hidden states of text, image, and
            cross-modal components respectively. i.e. `(hidden_states_text, hidden_states_image,
            hidden_states_cross_modal)` where each element is a list of the hidden states of the corresponding
            modality. `hidden_states_txt/img` are a list of tensors corresponding to unimodal hidden states and
            `hidden_states_cross_modal` is a list of tuples containing `cross_modal_text_hidden_states` and
            `cross_modal_image_hidden_states` of each brdige layer.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerModel
        >>> from PIL import Image
        >>> import requests

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "hello world"
        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base")
        >>> model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> outputs.keys()
        odict_keys(['text_features', 'image_features', 'pooler_output'])
        ```Nr2   zYBridgeTowerModel does not use `inputs_embeds`.  Make sure to pass in `input_ids` instead.r   )r  rZ   r   r   r   r   )rX   rx  r/  c              3      K   | ]}||V  	d S rc   r2   r  s     r4   r  z+BridgeTowerModel.forward.<locals>.<genexpr>  s0        =  === r3   )r%   r&   r'   r(   r)   )+rU   r/  r  r   r  r   r  r   r.   r  r;  r\   r  r_   rb  r  rf   ru   r  r  r   r  r[   r   r   ry   r   r  r  r  	expand_asr%  r  fullr&  r"  r!  rx   r(  r)  get_cls_featuresr1   r$   )-rT   r  rX   r  r   r,  r-  r  r:   r-  r/  r  r  r.  r   all_hidden_states_textall_hidden_states_imageall_hidden_states_crossr  r  r  r9   extend_text_maskssplit_indexrf   r~   image_embeds_with_lncross_modal_texttext_token_type_embeddingsimage_token_type_embeddingscross_modal_imageextend_image_maskslayer_outputs_textcross_text_featureslayer_outputs_imagecross_image_featureslink_layer_indexr  text_link_towerimage_link_towercross_text_features_cross_image_features_r%   r&   cls_featuress-                                                r4   rg   zBridgeTowerModel.forward  s   j 2C1N--TXT_Tq$8$D  $+Jj 	 (<!E(<"F""$(<"F""$"6@BBD$5?bb4$):%k   &1%<kk$+B]7KR33QRnn&&o0090EE 	5"{n4"!"Z5:iN^___N OGGXcddgg
 

 $/17884;;XX[\\ _,2<K<@ 	9 	9E%->??BK# 9&;.8&,3??!!$"3"9::Um @  LL
 (//1a88L 	7#6# &-9CL[LQ 	; 	;E 5..L# ;'L?:'#07DD\EVEVW[WhWnEoEopp  ::;GG%)%?%?KI4DEEE&
 &

)$
%
% 	#  ::;KNh;hii#??@TUU&*&@&@Jt1IL\]]]'
 '

)(
)
) 	$  46QQ <<=QRRZ##A&&(9(>(>q(A(AB*#
 
 


 "_HHU_UdUdUfUfggjj
 
 =T9!<,#5/
 
 
 13>d;A>-#4/
 
 
  315 	V#)<>R(S'UU# 	V%7%:<OPQ<R$S#UU {C(?(E$F$FGG 0	Z 0	ZA:$/17:;HYZZ[\]KL4,3?I!L\ZZ__!' L 001B1I1V1VWc1d1dee-. !
 #>?OPO#@AQR $3?//<<?YY#!$ $ 
 %5$45IK_as$t$t! "T!=>NQR>R!S$%0'9"3" " " #5Q"7"U$"?@PST@T"U%$1'8"3# # # $7q#9 !# Z&;.8&'L?:''-@BV,W+YY'  Z#);A)>@STU@V(W'YY# )<=Q~,,]NKK 	k!79PRi j 	  'GXZmn      &')&+*
 
 
 	
r3   c                     |                      |          }|                     |          }t          j        ||gd          S )Nr   r   )r$  r#  r.   r   )rT   r%   r&   cls_features_textcls_features_images        r4   r4  z!BridgeTowerModel.get_cls_features  sF     88GG!::>JJy+-?@bIIIIr3   )NNNNNNNNNNNNNF)r*   r+   r,   rG   r  r  r   r   r.   
LongTensorr/   r   r`   r   r1   rh   r$   rg   r4  ri   rj   s   @r4   r
  r
    s       6 6 6 6 6p6 6 64 4 4  156:594815155948.2,0/3&*-1).j
 j
E,-j
 !!23j
 !!12	j

 u01j
 U-.j
 E-.j
   12j
 u01j
 'smj
 $D>j
 'tnj
 d^j
 )*j
 #'j
  
uU\"$::	;!j
 j
 j
 ^j
XJ J J J J J Jr3   r
  c                   $     e Zd Z fdZd Z xZS )"BridgeTowerPredictionHeadTransformc                 V   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          j        |j        |j                  | _        d S r   )rF   rG   r   rO   rI   r   r   r   r   r
   transform_act_fnrK   rL   rS   s     r4   rG   z+BridgeTowerPredictionHeadTransform.__init__  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D!f&8f>STTTr3   c                     |                      |          }|                     |          }|                     |          }|S rc   )r   rQ  rK   r  s     r4   rg   z*BridgeTowerPredictionHeadTransform.forward  s=    

=11--m<<}55r3   r   rj   s   @r4   rO  rO    sL        U U U U U      r3   rO  c                   &     e Zd Zd fd	Zd Z xZS )r  Nc                 ^   t                                                       || _        t          |          | _        t          j        |j        |j        j	        d          | _
        t          j        t          j        |j        j	                            | _        ||| j
        _        d S d S )NF)r   )rF   rG   rU   rO  	transformr   rO   rI   r  r  decoderr   r.   r  r   r   )rT   rU   r   rV   s      r4   rG   zBridgeTowerMLMHead.__init__  s    ;FCCy!3V5G5RY^___LV-?-J!K!KLL	"(DL r3   c                 j    |                      |          }|                     |          | j        z   }|S rc   )rU  rV  r   )rT   x	mlm_scores      r4   rg   zBridgeTowerMLMHead.forward  s1    NN1%%	LL++di7	r3   rc   r   rj   s   @r4   r  r    sL        ) ) ) ) ) )      r3   r  c                   $     e Zd Z fdZd Z xZS )BridgeTowerITMHeadc                 |    t                                                       t          j        |d          | _        d S Nr   rF   rG   r   rO   fc)rT   rI   rV   s     r4   rG   zBridgeTowerITMHead.__init__  s0    )K++r3   c                 0    |                      |          }|S rc   r_  )rT   rX  	itm_scores      r4   rg   zBridgeTowerITMHead.forward  s    GGAJJ	r3   r   rj   s   @r4   r[  r[    sG        , , , , ,      r3   r[  z\
    BridgeTower Model with a language modeling head on top as done during pretraining.
    c                       e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j
                 dee	j                 dee	j
                 d	ee	j                 d
ee	j
                 dee	j                 dee	j                 dee	j                 dee         dee         dee         dee	j
                 deeee	j                 f         fd            Z xZS )BridgeTowerForMaskedLMzmlm_score.decoder.weightc                     t                                          |           t          |          | _        t	          |          | _        |                                  d S rc   )rF   rG   r
  r  r  rY  r  rS   s     r4   rG   zBridgeTowerForMaskedLM.__init__  sR       +F33+F33 	r3   c                     | j         j        S rc   rY  rV  r  s    r4   get_output_embeddingsz,BridgeTowerForMaskedLM.get_output_embeddings  s    ~%%r3   c                     || j         _        d S rc   rg  )rT   new_embeddingss     r4   set_output_embeddingsz,BridgeTowerForMaskedLM.set_output_embeddings  s    !/r3   Nr  rX   r  r   r,  r-  r  r:   r/  r  r  r.  r   c                    ||n| j         j        }|                     |||||||||	|
|          }|                     |r|j        n|d                   }d}|jt                      }|                    |j                  } ||                    d| j         j	        j
                  |                    d                    }|st          |          }||f|z   n|S t          |||j        |j                  S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000360943.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        >>> text = "a <mask> looking out of the window"

        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
        >>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist())

        >>> print(results)
        .a cat looking out of the window.
        ```N
rX   r  r   r,  r-  r  r:   r/  r  r  r   r   r7   r8   r(   r)   )rU   r  r  rY  r%   r   r_   r\   r   r  r  r1   r   r(   r)   )rT   r  rX   r  r   r,  r-  r  r:   r/  r  r  r.  rk  
mlm_logitsmasked_lm_lossloss_fctr^  s                     r4   rg   zBridgeTowerForMaskedLM.forward  s1   d &1%<kk$+B]""))%!'%/!5# # 
 
 ^^[$XG$9$9gVWjYY
'))HYYz011F%Xjoob$+:Q:\&]&]_e_j_jkm_n_nooN 	Z:&&F3A3M^%..SYY!/)	
 
 
 	
r3   NNNNNNNNNNNN)r*   r+   r,   _tied_weights_keysrG   rh  rk  r   r   r.   rM  r/   r`   r   r   r1   rg   ri   rj   s   @r4   rd  rd    s        55    & & &0 0 0  156:594815155948,0/3&*-1Q
 Q
E,-Q
 !!23Q
 !!12	Q

 u01Q
 U-.Q
 E-.Q
   12Q
 u01Q
 $D>Q
 'tnQ
 d^Q
 )*Q
 
~uU%677	8Q
 Q
 Q
 ^Q
 Q
 Q
 Q
 Q
r3   rd  z
    BridgeTower Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the
    [CLS] token) for image-to-text matching.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 dee	         dee	         dee	         deej                 de
eeej                 f         fd            Z xZS )#BridgeTowerForImageAndTextRetrievalc                     t                                          |           t          |          | _        t	          |j        dz            | _        |                                  d S r]  )rF   rG   r
  r  r[  rI   rb  r  rS   s     r4   rG   z,BridgeTowerForImageAndTextRetrieval.__init__U  sZ       +F33+F,>,BCC 	r3   Nr  rX   r  r   r,  r-  r  r:   r/  r  r  r.  r   c                    ||n| j         j        }|                     |||||||||	|
|          }|r|j        n|d         }|                     |          }d}|4t                      }|                    |j                  } |||          }|st          |          }||f|z   n|S t          |||j
        |j                  S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
            Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
            The pairs with 0 will be skipped for calculation.

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
        >>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, 1].item()
        ```Nrm  r   rn  )rU   r  r  r'   rb  r   r_   r\   r1   r   r(   r)   )rT   r  rX   r  r   r,  r-  r  r:   r/  r  r  r.  rk  r'   r8   itm_lossrq  r^  s                      r4   rg   z+BridgeTowerForImageAndTextRetrieval.forward_  s
   \ &1%<kk$+B]""))%!'%/!5# # 
 
 2=L--'!*..'))HYYv}--Fx//H 	N6]]F-5-AXK&((vM'!/)	
 
 
 	
r3   rr  )r*   r+   r,   rG   r   r   r.   rM  r/   r`   r   r   r1   rg   ri   rj   s   @r4   ru  ru  N  s             156:594815155948,0/3&*-1Q
 Q
E,-Q
 !!23Q
 !!12	Q

 u01Q
 U-.Q
 E-.Q
   12Q
 u01Q
 $D>Q
 'tnQ
 d^Q
 )*Q
 
'u/@)AA	BQ
 Q
 Q
 ^Q
 Q
 Q
 Q
 Q
r3   ru  c                   $     e Zd Z fdZd Z xZS )BridgeTowerContrastiveHeadc                 |    t                                                       t          j        ||          | _        d S rc   r^  )rT   rI   
embed_sizerV   s      r4   rG   z#BridgeTowerContrastiveHead.__init__  s0    )K44r3   c                 0    |                      |          }|S rc   ra  )rT   rX  s     r4   rg   z"BridgeTowerContrastiveHead.forward  s    GGAJJr3   r   rj   s   @r4   rz  rz    sG        5 5 5 5 5      r3   rz  zl
    BridgeTower Model with a image-text contrastive head on top computing image-text contrastive loss.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee	         dee	         dee	         dee	         de
eeej                 f         fd            Z xZS )r  c                    t                                          |           t          |          | _        t	          |j        |j                  | _        t	          |j        |j                  | _        t	          |j        dz  |j                  | _	        t          j        t          j        | j        j                            | _        |                                  d S r]  )rF   rG   r
  r  rz  rI   contrastive_hidden_sizeitc_text_headitc_image_headitc_cross_modal_headr   r   r.   r   rU   r  r  r  rS   s     r4   rG   z*BridgeTowerForContrastiveLearning.__init__  s       +F3378JFLjkk89KVMkll$>v?QTU?UW]Wu$v$v!<T[5W(X(XYYr3   NTr  rX   r  r   r,  r-  r  r:   r/  r  r  return_lossr   c                 "   ||n| j         j        }|                     |||||||||	d|          }|r|j        n|d         }|r|j        n|d         \  }}}|d         }|d         }| j        j        j                            |          }| j                            t          j
        ddt          j        | j        j        j        j        	                                        |          }| j                            |          |z   }t           j                            |                     |ddd
ddf                   dd          }t           j                            |                     |ddd
ddf                   dd                              |j                  }t           j                            |                     |          dd                              |j                  }t          j        |||gd          }| j                                                            |j                  }t          j        ||                                          |z  }t          j        ||                                          |z  }t          j        ||                                          |z  }d}|rt          j        t;          |          |j                  }t           j                            ||          }t           j                            ||          }t           j                            ||          }||z   |z   dz  }|s||||f|dd         z   } ||f| z   n| S t?          ||||||j        |j                   S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning
        >>> import requests
        >>> from PIL import Image
        >>> import torch

        >>> image_urls = [
        ...     "https://farm4.staticflickr.com/3395/3428278415_81c3e27f15_z.jpg",
        ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
        ... ]
        >>> texts = ["two dogs in a car", "two cats sleeping on a couch"]
        >>> images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls]

        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
        >>> model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")

        >>> inputs = processor(images, texts, padding=True, return_tensors="pt")
        >>> loss = model(**inputs, return_loss=True).loss

        >>> inputs = processor(images, texts[::-1], padding=True, return_tensors="pt")
        >>> loss_swapped = model(**inputs, return_loss=True).loss

        >>> print("Loss", round(loss.item(), 4))
        Loss 0.0019

        >>> print("Loss with swapped images", round(loss_swapped.item(), 4))
        Loss with swapped images 2.126
        ```NTrm  r   r	   r   r0  r   rZ   r   )r   pr  r2  r   g      @)r7   r8   r9   r:   r;   r(   r)   )!rU   r  r  r'   r(   r  r  r   r  r.   r3  r;  r   r\   r2  r  r   r   	normalizer  r  r_   r  r   r  expr:  tr   rb  cross_entropyr6   r)   )!rT   r  rX   r  r   r,  r-  r  r:   r/  r  r  r  rk  r'   hidden_states_txthidden_states_imghidden_states_cross_modalr9   r:  r=  r;   r8   r  logits_text_to_imagelogits_text_to_crosslogits_image_to_crossitc_lossr.  text_to_image_losstext_to_cross_lossimage_to_cross_lossr^  s!                                    r4   rg   z)BridgeTowerForContrastiveLearning.forward  s   j &1%<kk$+B]""))%!'%/!%# # 
 
 2=L--'!*%0@G!!gaj 	H,.G (+(,#/<CPPQ]^^&*&6&L&LJtQej9I9_9f9mnnn'
 '

)(
)
) 	$ 'CCDXYY\ww m--d.@.@QQQPQSTSTSTWAU.V.V\^bc-dd}..t/B/B<PQPQPQSTVWVWVWPWCX/Y/Y_aef.ggjj% k 
 
 }..t/H/H/W/W]_cd.eehh% i 
 
 k<FBOOO&**,,//{7I/JJ$|K9I9IJJ[X$|K9I9IJJ[X %\<>>;K;K L L{ Z 	]\#f++fmDDDF!#!<!<=QSY!Z!Z!#!<!<=QSY!Z!Z"$-"="=>SU["\"\*-??BUUY\\H 	Nk<FQRQSQSTF-5-AXK&((vM+#%%!/)
 
 
 	
r3   )NNNNNNNNNTNN)r*   r+   r,   rG   r   r   r.   rM  r/   r`   r   r6   r1   rg   ri   rj   s   @r4   r  r    s|             156:594815155948,0/3&*&*x
 x
E,-x
 !!23x
 !!12	x

 u01x
 U-.x
 E-.x
   12x
 u01x
 $D>x
 'tnx
 d^x
 d^x
 
+U53D-EE	Fx
 x
 x
 ^x
 x
 x
 x
 x
r3   r  )r  ru  rd  r
  r  )r   )Pr-   r=  collectionsr   dataclassesr   typingr   r   r.   r   torch.nnr   activationsr
   r   cache_utilsr   r   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   utils.deprecationr   configuration_bridgetowerr   r   r    
get_loggerr*   r  _TOKENIZER_FOR_DOCr$   r6   r  r=   rl   r   r   r   r   r   r  r	  r  r\  rY  rm  r  r  r  r  r  r  r  r
  rO  r  r[  rd  ru  rz  r  __all__r2   r3   r4   <module>r     s=       # # # # # # ! ! ! ! ! ! " " " " " " " "        % % % % % % 6 6 6 6 6 6 6 6 C C C C C C C C C C 9 9 9 9 9 9              . - - - - - l l l l l l l l l l 7 7 7 7 7 7 7 7 7 7 0 0 0 0 0 0 h h h h h h h h h h 
	H	%	%'    
: : : : :[ : :  :$   
: : : : :; : :  :4) ) ) ) )29 ) ) )X    RY   6P P P P P") P P Pf7" 7" 7" 7" 7"29 7" 7" 7"td d d d d29 d d d4    BI       bi        	       	    B. B. B. B. B.ry B. B. B.L %' #3 3 3 3 329 3 3 3l= = = = =	 = = =@G G G G G5 G G GVY
 Y
 Y
 Y
 Y
RY Y
 Y
 Y
zV= V= V= V= V=	 V= V= V=t4 4 4 4  % % % % % % % %DY Y Y Y Y7 Y Y Y   U
 U
 U
 U
 U
5 U
 U
 U
p   
oJ oJ oJ oJ oJ1 oJ oJ 
oJf	       "                  
d
 d
 d
 d
 d
7 d
 d
 
d
N   ]
 ]
 ]
 ]
 ]
*D ]
 ]
 ]
@          
G
 G
 G
 G
 G
(B G
 G
 
G
T  r3   