
     `i                        d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
mZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&  e!j'        e(          Z)de	j*        de	j*        fdZ+de	j*        de	j*        fdZ,ee G d de                                  Z- G d dej.                  Z/ G d dej.                  Z0 G d dej.                  Z1de0iZ2 G d dej.                  Z3 G d dej.                  Z4 G d  d!ej.                  Z5 G d" d#e          Z6 G d$ d%ej.                  Z7 G d& d'ej.                  Z8	 dKd)ej.        d*e	j*        d+e	j*        d,e	j*        d-ee	j*                 d.e9d/e9fd0Z: G d1 d2ej.                  Z; G d3 d4ej.                  Z< G d5 d6e          Z= G d7 d8ej.                  Z> G d9 d:ej.                  Z?e G d; d<e                      Z@ G d= d>ej.                  ZA G d? d@e@          ZB edAB           G dC dDe@                      ZC G dE dFe@          ZD G dG dHe@          ZEdLdIZFg dJZGdS )MzPyTorch AltCLIP model.    N)	dataclass)AnyCallableOptionalUnion   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions'BaseModelOutputWithPoolingAndProjection)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging	torch_int   )AltCLIPConfigAltCLIPTextConfigAltCLIPVisionConfiglogitsreturnc                     t           j                            | t          j        t          |           | j                            S )Ndevice)nn
functionalcross_entropytorcharangelenr"   )r   s    /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/altclip/modeling_altclip.pycontrastive_lossr*   +   s3    =&&vu|CKKPVP]/^/^/^___    
similarityc                 r    t          |           }t          |                                           }||z   dz  S )Ng       @)r*   t)r,   caption_loss
image_losss      r)   	clip_lossr1   /   s4    #J//L!*,,..11J:%,,r+   c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeej                 ed<   dZeed<   dZeed	<   d
ee         fdZdS )AltCLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))r9   r:   N)getattrto_tuple).0kselfs     r)   	<genexpr>z)AltCLIPOutput.to_tuple.<locals>.<genexpr>U   sc       
 
  LLLDGGRYZ^`aRbRbRkRkRmRm
 
 
 
 
 
r+   )tuplekeysrA   s   `r)   r>   zAltCLIPOutput.to_tupleT   sC     
 
 
 
YY[[
 
 
 
 
 	
r+   )__name__
__module____qualname____doc__r4   r   r&   FloatTensor__annotations__r5   r6   r7   r8   r9   r   r:   rC   r   r>    r+   r)   r3   r3   5   s          & )-D(5$
%,,,48hu0188837OXe/0777/3K%+,33304L(5,-4444818886:3:::
%* 
 
 
 
 
 
r+   r3   c                   2     e Zd ZdZ fdZ	 ddZd Z xZS )AltRobertaEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                 l   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        t#          |dd          | _        |                     dt)          j        |j                                      d          d           |                     d	t)          j        | j                                        t(          j        
          d           |j        | _        t          j        |j        |j        | j                  | _	        d S )N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   F
persistenttoken_type_idsdtype)super__init__r#   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutr=   rS   register_bufferr&   r'   expandzerosrU   sizelongrP   rA   config	__class__s     r)   r^   zAltRobertaEmbeddings.__init__b   s}   !|F,=v?Q_e_rsss#%<0NPVPb#c#c %'\&2H&J\%]%]" f&8f>STTTz&"<=='.v7PR\']']$EL)GHHOOPWXXej 	 	
 	
 	
 	ek$*;*@*@*B*B%*UUUbg 	 	
 	
 	

 ".#%<*F,>DL\$
 $
 $
   r+   Nr   c                    |.|t          || j        |          }n|                     |          }||                                }n|                                d d         }|d         }|mt	          | d          r2| j        d d d |f         }|                    |d         |          }	|	}n+t          j        |t          j	        | j
        j                  }||                     |          }|                     |          }
||
z   }| j        dk    r|                     |          }||z  }|                     |          }|                     |          }|S )NrW   r   rZ   r   r\   r"   rT   )"create_position_ids_from_input_idsrP   &create_position_ids_from_inputs_embedsrp   hasattrrZ   rn   r&   ro   rq   rU   r"   rc   rg   rS   re   rh   rl   )rA   	input_idsrZ   rU   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrg   
embeddingsre   s                r)   forwardzAltRobertaEmbeddings.forward{   s{    $A)TM]_uvv#JJ=YY #..**KK',,..ss3K ^

 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M $ : :> J J"%::
':55"&":":<"H"H--J^^J//
\\*--
r+   c                    |                                 dd         }|d         }t          j        | j        dz   || j        z   dz   t          j        |j                  }|                    d                              |          S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        NrW   r   rv   r   )rp   r&   r'   rP   rq   r"   	unsqueezern   )rA   r{   r}   sequence_lengthrU   s        r)   rx   z;AltRobertaEmbeddings.create_position_ids_from_inputs_embeds   s     $((**3B3/%a.|q /D4D"Dq"HPUPZcpcw
 
 
 %%a((//<<<r+   )NNNNr   )rF   rG   rH   rI   r^   r   rx   __classcell__rt   s   @r)   rN   rN   \   sm         

 
 
 
 
4 rs& & & &P= = = = = = =r+   rN   c                        e Zd Zd
 fd	Z	 	 	 ddej        deej                 deej                 dee         de	ej                 f
d	Z
 xZS )AltRobertaSelfAttentionNc                 0   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j        |j                  | _        |pt#          |dd          | _        | j        dk    s| j        d	k    r8|j        | _        t          j        d
|j        z  dz
  | j                  | _        d S d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()rS   rT   relative_keyrelative_key_query   r   )r]   r^   ra   num_attention_headsry   
ValueErrorintattention_head_sizeall_head_sizer#   Linearquerykeyvaluerj   attention_probs_dropout_probrl   r=   rS   rd   r_   distance_embeddingrA   rs   rS   rt   s      r)   r^   z AltRobertaSelfAttention.__init__   s    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
z&"EFF'> (
'-zC
 C
$ '>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD### >r=qr+   Fhidden_statesattention_mask	head_maskoutput_attentionsr   c                 T   |j         d d         }g |d| j        R }|                     |                              |                              dd          }|                     |                              |                              dd          }|                     |                              |                              dd          }	t          j        ||                    dd                    }
| j	        dk    s| j	        dk    r4|j         d         |j         d         }}t          j
        |t          j        |j                                      dd          }t          j
        |t          j        |j                                      dd          }||z
  }|                     || j        z   dz
            }|                    |j                  }| j	        dk    rt          j        d	||          }|
|z   }
n?| j	        dk    r4t          j        d	||          }t          j        d
||          }|
|z   |z   }
|
t%          j        | j                  z  }
||
|z   }
t(          j                            |
d          }|                     |          }|||z  }t          j        ||	          }|                    dddd                                          }|                                d d         | j        fz   }|                    |          }|r||fn|f}|S )NrW   r   r   r   r   rv   r[   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r   )shaper   r   view	transposer   r   r&   matmulrS   r'   rq   r"   r   rd   tor\   einsummathsqrtr#   r$   softmaxrl   permute
contiguousrp   r   )rA   r   r   r   r   r}   hidden_shapequery_layer	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                           r)   r   zAltRobertaSelfAttention.forward   s/    $)#2#.CCbC$*BCCjj//44\BBLLQPQRRHH]++00>>HHANN	jj//44\BBLLQPQRR !<Y5H5HR5P5PQQ'>99T=Y]q=q=q'2'8';Y_Q=O*L"\,ejQ^QefffkklnpqrrN"\*EJ}OcdddiijkmoppN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s +di8P.Q.QQ%/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCC6G]=/22mM]r+   NNNF)rF   rG   rH   r^   r&   Tensorr   rJ   boolrC   r   r   r   s   @r)   r   r      s        u u u u u u6 7;15,1: :|: !!23: E-.	:
 $D>: 
u|	: : : : : : : :r+   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )AltRobertaSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S NrQ   )r]   r^   r#   r   ra   denserh   ri   rj   rk   rl   rr   s     r)   r^   zAltRobertaSelfOutput.__init__  sf    Yv163EFF
f&8f>STTTz&"<==r+   r   input_tensorr   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   rl   rh   rA   r   r   s      r)   r   zAltRobertaSelfOutput.forward  @    

=11]33}|'CDDr+   rF   rG   rH   r^   r&   r   r   r   r   s   @r)   r   r     i        > > > > >U\  RWR^        r+   r   eagerc                        e Zd Zd fd	Zd Z	 	 	 ddej        deej                 deej                 dee	         d	e
ej                 f
d
Z xZS )AltRobertaAttentionNc                     t                                                       t          |j                 ||          | _        t          |          | _        t                      | _        d S )N)rS   )	r]   r^   "ALT_ROBERTA_SELF_ATTENTION_CLASSES_attn_implementationrA   r   outputsetpruned_headsr   s      r)   r^   zAltRobertaAttention.__init__   s`    6v7RS,C
 
 
	 +622EEr+   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )r(   r   rA   r   r   r   r   r   r   r   r   r   r   union)rA   headsindexs      r)   prune_headszAltRobertaAttention.prune_heads(  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r+   Fr   r   r   r   r   c                     |                      ||||          }|                     |d         |          }|f|dd          z   }|S N)r   r   r   r   r   )rA   r   )rA   r   r   r   r   self_outputsattention_outputr   s           r)   r   zAltRobertaAttention.forward:  s]     yy)/	 ! 
 
  ;;|AFF#%QRR(88r+   r   r   )rF   rG   rH   r^   r   r&   r   r   rJ   r   rC   r   r   r   s   @r)   r   r     s        " " " " " "; ; ;* 7;15,1 | !!23 E-.	
 $D> 
u|	       r+   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )AltRobertaIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r]   r^   r#   r   ra   intermediate_sizer   
isinstance
hidden_actstrr	   intermediate_act_fnrr   s     r)   r^   zAltRobertaIntermediate.__init__N  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r+   r   r   c                 Z    |                      |          }|                     |          }|S r   )r   r   rA   r   s     r)   r   zAltRobertaIntermediate.forwardV  s,    

=1100??r+   r   r   s   @r)   r   r   M  s^        9 9 9 9 9U\ el        r+   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )AltRobertaOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )r]   r^   r#   r   r   ra   r   rh   ri   rj   rk   rl   rr   s     r)   r^   zAltRobertaOutput.__init__^  sf    Yv79KLL
f&8f>STTTz&"<==r+   r   r   r   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r   s      r)   r   zAltRobertaOutput.forwardd  r   r+   r   r   s   @r)   r   r   ]  r   r+   r   c                        e Zd Z fdZ	 	 	 ddej        deej                 deej                 dee         de	ej                 f
d	Z
d
 Z xZS )AltRobertaLayerc                     t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |          | _	        d S )Nr   )
r]   r^   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   rr   s     r)   r^   zAltRobertaLayer.__init__m  s^    '-'E$,V4426::&v..r+   NFr   r   r   r   r   c                      | j         |f|||d|}|d         }|dd          }t          | j        | j        | j        |          }	|	f|z   }|S r   )r   r   feed_forward_chunkr   r   )
rA   r   r   r   r   kwargsself_attention_outputsr   r   layer_outputs
             r)   r   zAltRobertaLayer.forwardu  s     "0"
)/	"
 "

 "
 "
 2!4(,0#T%A4CSUe
 
  /G+r+   c                 \    |                      |          }|                     ||          }|S r   )r   r   )rA   r   intermediate_outputr   s       r)   r   z"AltRobertaLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIr+   r   )rF   rG   rH   r^   r&   r   r   rJ   r   rC   r   r   r   r   s   @r)   r   r   l  s        / / / / / 7;15,1 | !!23 E-.	
 $D> 
u|	   2      r+   r   c                        e Zd Z fdZe	 	 	 	 	 ddej        deej                 deej                 dee	         d	ee	         d
ee	         de
eej                 ef         fd            Z xZS )AltRobertaEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rL   )r   )r?   irs   s     r)   
<listcomp>z.AltRobertaEncoder.__init__.<locals>.<listcomp>  s!    #e#e#eOF$;$;#e#e#er+   F)	r]   r^   rs   r#   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrr   s    `r)   r^   zAltRobertaEncoder.__init__  s`    ]#e#e#e#eU6KcEdEd#e#e#eff
&+###r+   NFTr   r   r   r   output_hidden_statesreturn_dictr   c           	          |rdnd }|rdnd }	t          | j                  D ]<\  }
}|r||fz   }|||
         nd } |d||||d|}|d         }|r|	|d         fz   }	=|r||fz   }t          |||	          S )NrL   )r   r   r   r   r   r   last_hidden_stater   
attentions)	enumerater  r   )rA   r   r   r   r   r
  r  r   all_hidden_statesall_self_attentionsr  layer_modulelayer_head_masklayer_outputss                 r)   r   zAltRobertaEncoder.forward  s     #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO(L +-)"3	 
  M *!,M  P&9]1=M<O&O# 	E 1]4D D++*
 
 
 	
r+   )NNFFT)rF   rG   rH   r^   r   r&   r   r   rJ   r   r   rC   r   r   r   r   s   @r)   r   r     s        , , , , ,  7;15,1/4&*&
 &
|&
 !!23&
 E-.	&

 $D>&
 'tn&
 d^&
 
uU\"O3	4&
 &
 &
 &
 &
 &
 &
 &
r+   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )AltRobertaPoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r   )r]   r^   r#   r   ra   r   Tanh
activationrr   s     r)   r^   zAltRobertaPooler.__init__  sC    Yv163EFF
'))r+   r   r   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   r  )rA   r   first_token_tensorpooled_outputs       r)   r   zAltRobertaPooler.forward  s@     +111a40

#56666r+   r   r   s   @r)   r  r    s^        $ $ $ $ $
U\ el        r+   r          moduler   r   r   r   scalingrl   c                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )NrW   r   )r   r\   )ptrainingr   r   )r&   r   r   r#   r$   r   float32r   r\   rl   r#  r   )
r  r   r   r   r   r   rl   r   attn_weightsattn_outputs
             r)   eager_attention_forwardr'    s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r+   c                        e Zd ZdZ fdZ	 	 	 ddej        deej                 deej                 dee         d	e	ej        eej                 f         f
d
Z
 xZS )AltCLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).      F)r]   r^   rs   ra   	embed_dimr   	num_headshead_dimr   scaleattention_dropoutrl   	is_causalr#   r   k_projv_projq_projout_projrr   s     r)   r^   zAltCLIPAttention.__init__  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr+   NFr   r   causal_attention_maskr   r   c           
      n   |j         \  }}}|                     |          }|                     |          }	|                     |          }
|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	|
                    ||| j        | j                                      dd          }
| j        j	        dk    r||||z   }n||}n	|du| _
        t          }| j        j	        dk    rt          | j        j	                 } || ||	|
|| j
        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }|sd}||fS )z#Input shape: Batch x Time x Channelr   r   flash_attention_2Nr   r  )r2  r   rl   )r   r5  r3  r4  r   r.  r/  r   rs   r   r2  r'  r   r0  r#  rl   reshaper   r6  )rA   r   r   r7  r   
batch_sizer~   r-  queriesrD   valuesattention_interfacer&  r%  s                 r)   r   zAltCLIPAttention.forward  s    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc ;+/BBB).C.O!/2G!G&2!62$>DN(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00  	 LL((r+   r   )rF   rG   rH   rI   r^   r&   r   r   r   rC   r   r   r   s   @r)   r)  r)    s        GGB B B B B. 268<,1/) /)|/) !./)  (5	/)
 $D>/) 
u|Xel33	4/) /) /) /) /) /) /) /)r+   r)  c                   B     e Zd Z fdZdej        dej        fdZ xZS )
AltCLIPMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S r   )r]   r^   rs   r	   r   activation_fnr#   r   ra   r   fc1fc2rr   s     r)   r^   zAltCLIPMLP.__init__9  sf    #F$569V/1IJJ9V5v7IJJr+   r   r   c                     |                      |          }|                     |          }|                     |          }|S r   )rC  rB  rD  r   s     r)   r   zAltCLIPMLP.forward@  s=    //**=99//r+   r   r   s   @r)   r@  r@  8  sc        K K K K KU\ el        r+   r@  c                        e Zd Zdef fdZ	 d
dej        dej        dej        dee         de	ej
                 f
d	Z xZS )AltCLIPEncoderLayerrs   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S r   )r]   r^   ra   r-  r)  	self_attnr#   rh   ri   layer_norm1r@  mlplayer_norm2rr   s     r)   r^   zAltCLIPEncoderLayer.__init__H  s    +)&11<F<QRRRf%%<F<QRRRr+   Fr   r   r7  r   r   c                     |}|                      |          }|                     ||||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r7  r   )rJ  rI  rL  rK  )rA   r   r   r7  r   residualr%  r   s           r)   r   zAltCLIPEncoderLayer.forwardP  s    " !((77&*nn')"7/	 '5 '
 '
#| !=0 ((77// =0 " 	'&Gr+   F)rF   rG   rH   r   r^   r&   r   r   r   rC   rJ   r   r   r   s   @r)   rG  rG  G  s        S} S S S S S S -2& &|& &  %|	&
 $D>& 
u 	!& & & & & & & &r+   rG  c                        e Zd ZdZdef fdZe	 	 	 	 	 ddeej	                 deej	                 dee
         dee
         d	ee
         d
eeef         fd            Z xZS )AltCLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].

    Args:
        config: AltCLIPConfig
    rs   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rL   )rG  )r?   _rs   s     r)   r  z+AltCLIPEncoder.__init__.<locals>.<listcomp>  s"    $j$j$jQ%8%@%@$j$j$jr+   F)	r]   r^   rs   r#   r  r  r  layersr	  rr   s    `r)   r^   zAltCLIPEncoder.__init__  sa    m$j$j$j$j%PVPhJiJi$j$j$jkk&+###r+   Nr   r7  r   r
  r  r   c                 @   ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}	t	          | j                  D ]2\  }
}|r||	fz   } ||	|||          }|d         }	|r||d         fz   }3|r||	fz   }t          |	||          S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrL   )r   r   r   r  )rs   r   r
  use_return_dictr  rU  r   )rA   r{   r   r7  r   r
  r  encoder_statesall_attentionsr   idxencoder_layerr  s                r)   r   zAltCLIPEncoder.forward  s    N 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]3=0:d%"+DK"8"8 	F 	FC# C!/=2B!B)M%"3	  M *!,M  F!/=3C2E!E 	?+}.>>N+>Vd
 
 
 	
r+   )NNNNN)rF   rG   rH   rI   r   r^   r   r   r&   r   r   r   rC   r   r   r   r   s   @r)   rQ  rQ  y  s         ,} , , , , , ,  268<,0/3&*D
 D
 !.D
  (5	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
 D
 D
 D
 D
 D
 D
r+   rQ  c                   v     e Zd Zdef fdZdej        dededej        fdZdd	ej	        dej        fd
Z
 xZS )AltCLIPVisionEmbeddingsrs   c                 z   t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        | j                            | _        t          j        |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )NF)in_channelsout_channelskernel_sizestridebiasr   r   rU   rV   rX   )r]   r^   rs   ra   r-  
image_size
patch_sizer#   	Parameterr&   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsr_   position_embeddingrm   r'   rn   rr   s     r)   r^   z AltCLIPVisionEmbeddings.__init__  s   + + +!|EK,G,GHH!y+? 
  
  
 !Ot>1D!-1"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr+   r   heightwidthr   c                    |j         d         dz
  }| j        j                            d          }|j         d         dz
  }t          j                                        s&||k    r ||k    r|                     | j                  S |ddddf         }|ddddf         }|j         d         }	|| j        z  }
|| j        z  }t          |dz            }|
                    d|||	          }|                    dddd          }t          j                            ||
|fdd	
          }|                    dddd                              dd|	          }t	          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   NrW   g      ?r   r   bicubicF)rp   modealign_cornersr   )r   rn  weightr   r&   jit
is_tracingrU   re  r   r:  r   r#   r$   interpolater   cat)rA   r   ro  rp  rl  rn  rm  class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r)   interpolate_pos_encodingz0AltCLIPVisionEmbeddings.interpolate_pos_encoding  s    !&q)A-!4;EEaHH*03a7 y##%% 	>+*F*F6UZ??**4+<===,QQQU3,QQQU3r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr+   Fpixel_valuesc                 <   |j         \  }}}}|s<|| j        k    s|| j        k    r&t          d| d| d| j         d| j         d	          | j        j        j        }|                     |                    |                    }|                    d                              dd          }| j	        
                    |dd          }	t          j        |	|gd	          }
|r|
|                     |
||          z   }
n|
|                     | j                  z   }
|
S )
NzInput image size (*z) doesn't match model (r+  r[   r   r   rW   r   )r   rd  r   rk  ru  r\   r   flattenr   rh  rn   r&   ry  r  rn  rU   )rA   r  r  r;  rT  ro  rp  target_dtypepatch_embedsclass_embedsr   s              r)   r   zAltCLIPVisionEmbeddings.forward  sD   '3'9$
Avu' 	Vt-F-F%SWSbJbJbqVqqeqqDOqq^b^mqqq   +28++LOO,O,O,OPP#++A..88A>>+22:q"EEYl;CCC
# 	Q#d&C&CJPVX]&^&^^JJ#d&=&=d>O&P&PPJr+   rO  )rF   rG   rH   r   r^   r&   r   r   r  rJ   r   r   r   s   @r)   r]  r]    s        q2 q q q q q q,'D5< 'D 'DUX 'D]b]i 'D 'D 'D 'DR E$5 Z_Zf        r+   r]  c                   ,    e Zd ZU eed<   dZdZg Zd ZdS )AltCLIPPreTrainedModelrs   altclipTc                    | j         j        }t          |t                    r| j         j        }t          j                            |j        d|j        dz  |z             t          j                            |j	        j
        |j         j        |z             t          j                            |j        j
        |j         j        |z             dS t          |t                    r| j         j        }|j        dz  d|j         j        z  dz  z  |z  }|j        dz  |z  }t          j                            |j        j
        |           t          j                            |j        j
        |           t          j                            |j        j
        |           t          j                            |j        j
        |           dS t          |t&                    r| j         j        }|j         j        dz  d|j         j        z  dz  z  |z  }d|j         j        z  dz  |z  }t          j                            |j        j
        |           t          j                            |j        j
        |           dS t          |t.                    rt          j                            |j        j
        |j        dz  | j         j        z             d|j        _        t          j                            |j        j
        |j        dz  | j         j        z             d|j        _        dS t          |t          j                  r?|j        j                                          |j
        j        !                    d           dS t          |t          j"                  rT|j
        j                            d| j         j                   |j         |j        j                                          dS dS t          |t          j#                  r]|j
        j                            d| j         j                   |j$        -|j
        j        |j$                                                   dS dS dS )	zInitialize the weightsr  r,  )meanstd)r  r   Tg      ?N)%rs   initializer_factorr   r]  r#   initnormal_rh  r-  rk  ru  initializer_rangern  r)  r  r5  r3  r4  r6  r@  ra   rC  rD  AltCLIPModeltext_projectiontext_embed_dim_is_hf_initializedvisual_projectionvision_embed_dimrh   rc  datazero_fill_r   r_   rP   )rA   r  factorin_proj_stdout_proj_stdfc_stds         r)   _init_weightsz$AltCLIPPreTrainedModel._init_weights+  s   /f566 (	?[3FGOOF2&BRTXBX[aBaObbbGOOF29v}?^ag?gOhhhGOOF5<&-BadjBjOkkkkk 011 #	?[3F!+T1q6=;Z7Z_c6cdgmmK",d2f<LGOOFM0kOBBBGOOFM0kOBBBGOOFM0kOBBBGOOFO2OEEEEE
++ 	?[3F!=4d:FMDc@chl?lmpvvK&-33<vEFGOOFJ-6O:::GOOFJ-;O?????-- 	?GOO&-)4/$+2PP     9=F"5GOO(/+T1DK4RR     ;?F$777-- 
	?K""$$$M$$S)))))	** 	?M&&CT[5S&TTT{& &&((((( '&-- 	?M&&CT[5S&TTT!-"6#56<<>>>>>	? 	?--r+   N)	rF   rG   rH   r   rK   base_model_prefixsupports_gradient_checkpointing_no_split_moduler  rL   r+   r)   r  r  $  sC         !&*#+? +? +? +? +?r+   r  c                        e Zd Zdef fdZee	 	 	 	 	 ddeej	                 dee
         dee
         dee
         d	ee
         d
eeef         fd                        Z xZS )AltCLIPVisionTransformerrs   c                 4   t                                                       || _        |j        }t	          |          | _        t          j        ||j                  | _	        t          |          | _        t          j        ||j                  | _        d S r   )r]   r^   rs   ra   r]  r   r#   rh   ri   pre_layrnormrQ  encoderpost_layernorm)rA   rs   r-  rt   s      r)   r^   z!AltCLIPVisionTransformer.__init__Z  s    &	1&99L8MNNN%f-- l9&:OPPPr+   NFr  r   r
  r  r  r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     ||          }|                     |          }|                     |||d          }|d         }|d d dd d f         }	|                     |	          }	t          ||	|j
        |j                  S )Nz You have to specify pixel_values)r  T)r{   r   r
  r  r   r  pooler_outputr   r  )rs   r   r
  rW  r   r   r  r  r  r   r   r  )
rA   r  r   r
  r  r  r   encoder_outputsr  r  s
             r)   r   z AltCLIPVisionTransformer.forwardd  s    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@Oghh))-88,,'/!5	 ' 
 
 ,A.)!!!Q'2++M::)/')7&1	
 
 
 	
r+   )NNNNF)rF   rG   rH   r   r^   r   r   r   r&   rJ   r   r   rC   r   r   r   r   s   @r)   r  r  Y  s        Q2 Q Q Q Q Q Q  59,0/3&*38$
 $
u01$
 $D>$
 'tn	$

 d^$
 #+4.$
 
u00	1$
 $
 $
 ^ $
 $
 $
 $
 $
r+   r  c                        e Zd ZU eed<   dZdef fdZdej        fdZ	e
	 	 	 	 	 ddeej                 dee         d	ee         d
edee         deeef         fd            Z xZS )AltCLIPVisionModelrs   r  c                     t                                          |           t          |          | _        |                                  d S r   )r]   r^   r  vision_model	post_initrr   s     r)   r^   zAltCLIPVisionModel.__init__  sA       4V<<r+   r   c                 $    | j         j        j        S r   )r  r   rk  rE   s    r)   get_input_embeddingsz'AltCLIPVisionModel.get_input_embeddings  s     +;;r+   NFr   r
  r  r  c                 V    ||n| j         j        }|                     |||||          S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPVisionModel

        >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```Nr  r   r
  r  r  )rs   rW  r  )rA   r  r   r
  r  r  s         r)   r   zAltCLIPVisionModel.forward  sC    : &1%<kk$+B]  %/!5%=# ! 
 
 	
r+   )NNNFN)rF   rG   rH   r   rK   main_input_namer^   r#   Moduler  r   r   r&   rJ   r   r   rC   r   r   r   r   s   @r)   r  r    s        $O2      <bi < < < <  59,0/3).&*$
 $
u01$
 $D>$
 'tn	$

 #'$
 d^$
 
u00	1$
 $
 $
 ^$
 $
 $
 $
 $
r+   r  aE  
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    )custom_introc                   `    e Zd ZU eed<   d fd	Zd Zd Zd Ze		 	 	 	 	 	 	 	 	 dde
ej                 d	e
ej                 d
e
ej                 de
ej                 de
ej                 de
ej                 de
e         de
e         de
e         deeej                 ef         fd            Z xZS )AltRobertaModelrs   Tc                     t                                          |           || _        t          |          | _        t          |          | _        |rt          |          nd| _        | 	                                 dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r]   r^   rs   rN   r   r   r  r  poolerr  )rA   rs   add_pooling_layerrt   s      r)   r^   zAltRobertaModel.__init__  st    
 	   .v66(002CM&v... 	r+   c                     | j         j        S r   r   rc   rE   s    r)   r  z$AltRobertaModel.get_input_embeddings  s    ..r+   c                     || j         _        d S r   r  rA   r   s     r)   set_input_embeddingsz$AltRobertaModel.set_input_embeddings  s    */'''r+   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r  r   r   )rA   heads_to_pruner  r   s       r)   _prune_headszAltRobertaModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr+   Nrz   r   rZ   rU   r   r{   r   r
  r  r   c
                    ||n| j         j        }||n| j         j        }|	|	n| j         j        }	||t	          d          |+|                     ||           |                                }
n.||                                d d         }
nt	          d          |
\  }}||j        n|j        }|t          j	        ||f|          }|gt          | j        d          r1| j        j        d d d |f         }|                    ||          }|}n!t          j        |
t          j        |          }|                     ||
          }|                     || j         j                  }|                     ||||          }|                     |||||d	          }|d
         }| j        |                     |          nd }t+          |||j        |j                  S )NzDYou cannot specify both input_ids and inputs_embeds at the same timerW   z5You have to specify either input_ids or inputs_embedsr!   rZ   rv   )rz   rU   rZ   r{   T)r   r   r   r
  r  r   r  )rs   r   r
  rW  r   %warn_if_padding_and_no_attention_maskrp   r"   r&   onesry   r   rZ   rn   ro   rq   get_extended_attention_maskget_head_maskr  r  r  r   r   r  )rA   rz   r   rZ   rU   r   r{   r   r
  r  r}   r;  r~   r"   r   r   extended_attention_maskembedding_outputr  sequence_outputr  s                        r)   r   zAltRobertaModel.forward  sJ    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"Z*j)A6RRRN!t(899 [*./*HKZK*X'3J3Q3QR\^h3i3i0!A!&[
SY!Z!Z!Z 150P0PQ_al0m0m &&y$+2OPP	??%)'	 + 
 
 ,,2/!5 ' 
 
 *!,8<8OO444UY)-')7&1	
 
 
 	
r+   )T	NNNNNNNNN)rF   rG   rH   r   rK   r^   r  r  r  r   r   r&   r   r   r   rC   r   r   r   r   s   @r)   r  r    s|               / / /0 0 0C C C  -11515/3,004,0/3&*G
 G
EL)G
 !.G
 !.	G

 u|,G
 EL)G
  -G
 $D>G
 'tnG
 d^G
 
uU\"$PP	QG
 G
 G
 ^G
 G
 G
 G
 G
r+   r  c                       e Zd ZU eed<    fdZdej        fdZdej	        ddfdZ
ddee         dej	        f fd	Zee	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )AltCLIPTextModelrs   c                 0   t                                          |           t          |d          | _        t	          j        |j        |j                  | _        t	          j	        |j        |j
                  | _        |                                  d S )NF)r  rQ   )r]   r^   r  robertar#   r   ra   project_dimtransformationrh   ri   pre_LNr  rr   s     r)   r^   zAltCLIPTextModel.__init__<  s{       &vGGG i(:F<NOOl6#56;PQQQr+   r   c                 $    | j         j        j        S r   r  r   rc   rE   s    r)   r  z%AltCLIPTextModel.get_input_embeddingsC  s    |&66r+   r   Nc                 (    || j         j        _        d S r   r  r  s     r)   r  z%AltCLIPTextModel.set_input_embeddingsF  s    27///r+   new_num_tokensc                 F    t                                          |          S r   )r]   resize_token_embeddings)rA   r  rt   s     r)   r  z(AltCLIPTextModel.resize_token_embeddingsI  s    ww..~>>>r+   rz   r   rZ   rU   r   r{   r   r  r
  c
                    ||n| j         j        }|                     ||||||||	d	  	        }
|
d         }|                     |          }|                     |          }|dddf         }t          |||
j        |
j                  S )a+  
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPTextModel

        >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> texts = ["it's a cat", "it's a dog"]

        >>> inputs = processor(text=texts, padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```NT)	rz   r   rZ   rU   r   r{   r   r
  r  r   r  )rs   rW  r  r  r  r   r   r  )rA   rz   r   rZ   rU   r   r{   r   r  r
  r   r  projection_stater  s                 r)   r   zAltCLIPTextModel.forwardL  s    @ &1%<kk$+B],,))%'/!5  

 

 "!* ++o66  ..??(A.6.'!/)	
 
 
 	
r+   r   r  )rF   rG   rH   r   rK   r^   r#   r  r  r_   r  r   r   r  r   r   r&   r   r   r   rC   r   r   r   r   s   @r)   r  r  9  s            7bi 7 7 7 78", 84 8 8 8 8? ?hsm ?r| ? ? ? ? ? ?  -11515/3,004,0&*/3;
 ;
EL);
 !.;
 !.	;

 u|,;
 EL);
  -;
 $D>;
 d^;
 'tn;
 
u==	>;
 ;
 ;
 ^ ;
 ;
 ;
 ;
 ;
r+   r  c                   >    e Zd ZU eed<   def fdZ e            e	 	 	 ddej	        de
ej	                 de
ej	                 de
ej	                 dej        f
d	                        Z e            e	 ddej        dedej        fd                        Ze	 	 	 	 	 	 	 	 	 	 dde
ej                 de
ej                 de
ej	                 de
ej                 de
ej	                 de
e         de
e         de
e         dede
e         deeef         fd            Z xZS )r  rs   c                 <   t                                          |           t          |j        t                    s%t          dt          |j                   d          t          |j        t                    s%t          dt          |j                   d          |j        }|j        }|j	        |_	        |j
        | _
        |j        | _        |j        | _        t          |          | _        t#          |          | _        t'          j        | j        | j
        d          | _        t'          j        | j        | j
        d          | _        t'          j        t1          j        | j        j                            | _        |                                  d S )NzRconfig.vision_config is expected to be of type AltCLIPVisionConfig but is of type .zNconfig.text_config is expected to be of type AltCLIPTextConfig but is of type F)rc  )r]   r^   r   vision_configr   	TypeErrortypetext_configr   r   projection_dimr  r  ra   r  r  
text_modelr  r  r#   r   r  r  rf  r&   tensorrs   logit_scale_init_valuelogit_scaler  )rA   rs   r  r  rt   s       r)   r^   zAltCLIPModel.__init__  s      &.0CDD 	2-..2 2 2   &,.?@@ 	0+,,0 0 0  
 (,-3-H*$3)5 - 9*;774]CC!#4+@$BU\a!b!b!b!y)<d>QX]^^^<T[5W(X(XYY 	r+   Nrz   r   rU   rZ   r   c                 p    |                      ||||          }|j        }|                     |          }|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)rz   r   rU   rZ   )r  r  r  )rA   rz   r   rU   rZ   text_outputsr  text_featuress           r)   get_text_featureszAltCLIPModel.get_text_features  sK    6 )%)	 ' 
 
 %2,,];;r+   Fr  r  c                 l    |                      ||          }|j        }|                     |          }|S )aQ  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```)r  r  )r  r  r  )rA   r  r  vision_outputsr  image_featuress         r)   get_image_featureszAltCLIPModel.get_image_features  sG    : **%%= + 
 
 '4//>>r+   return_lossr   r
  r  c           	         ||n| j         j        }||n| j         j        }|
|
n| j         j        }
|                     |||||||
          }|                     ||||	|
          }|d         }|                     |          }|d         }|                     |          }||                    ddd          z  }||                    ddd          z  }| j	        
                                }t          j        ||                                          |z  }|j        }d}|rt          |          }|
s||||||f}||f|z   n|S t!          |||||||	          S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```N)rz   r   rZ   rU   r   r
  r  r  r   r   rW   T)r"  r   keepdim)r4   r5   r6   r7   r8   r9   r:   )rs   r   r
  rW  r  r  r  r  normr  expr&   r   r.   Tr1   r3   )rA   rz   r  r   rU   rZ   r  r   r
  r  r  r  r  r8   r7   r  r6   r5   r4   r   s                       r)   r   zAltCLIPModel.forward  s   J 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]))%/!5# ' 
 
 **%/!5%=# + 
 
 &a(--l;;"1o**;77 $l&7&7!T&7&R&RR!K$4$4qb$$4$O$OO &**,,,{LNN4D4DEES*, 	._--D 	F&lT`bpqF)-)9TGf$$vE-+#%* .
 
 
 	
r+   )NNNrO  )
NNNNNNNNFN)rF   rG   rH   r   rK   r^   r   r   r&   r   r   rJ   r  r   r  
LongTensorr   rC   r3   r   r   r   s   @r)   r  r    sH        }      B %$&& 26/315" "<" !." u|,	"
 !." 
	" " " ^ '&"H %$&& */" "'" #'" 
		" " " ^ '&"H  1548153715&*,0/3).&*[
 [
E,-[
 u01[
 !.	[

 u/0[
 !.[
 d^[
 $D>[
 'tn[
 #'[
 d^[
 
um#	$[
 [
 [
 ^[
 [
 [
 [
 [
r+   r  c                     |                      |                                          }t          j        |d                              |          |z   |z  }|                                |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r   )ner   r&   cumsumtype_asrq   )rz   rP   r|   maskincremental_indicess        r)   rw   rw   \  sg     <<$$((**D <!444<<TBBE[[_cc##%%33r+   )r  r  r  r  )r  )r   )HrI   r   dataclassesr   typingr   r   r   r   r&   torch.nnr#   activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   configuration_altclipr   r   r   
get_loggerrF   loggerr   r*   r1   r3   r  rN   r   r   r   r   r   r   r   r   r  floatr'  r)  r@  rG  rQ  r]  r  r  r  r  r  r  rw   __all__rL   r+   r)   <module>r     s      ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1        ! ! ! ! ! ! 9 9 9 9 9 9            G F F F F F F F l l l l l l l l l l w w w w w w w w w w w w w w w w X X X X X X X X X X 
	H	%	%
`U\ `el ` ` ` `-%, -5< - - - -  
  
  
  
  
K  
  
   
HV= V= V= V= V=29 V= V= V=rS S S S Sbi S S Sn    29    $& "
* * * * *") * * *\    RY        ry   % % % % %0 % % %R.
 .
 .
 .
 .
	 .
 .
 .
d    ry   . % %I%<% 
% <	%
 U\*% % % % % %.F) F) F) F) F)ry F) F) F)T       / / / / /4 / / /dT
 T
 T
 T
 T
RY T
 T
 T
pP P P P Pbi P P Pf 1? 1? 1? 1? 1?_ 1? 1? 1?h1
 1
 1
 1
 1
ry 1
 1
 1
h2
 2
 2
 2
 2
/ 2
 2
 2
j   k
 k
 k
 k
 k
, k
 k
 k
\P
 P
 P
 P
 P
- P
 P
 P
fL
 L
 L
 L
 L
) L
 L
 L
`4 4 4 4  _
^
^r+   