
     `iYq                        d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	c m
Z d dlm	Z	 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZmZmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ee G d de                                  Z* ed           G d de	j+                              Z, G d de	j+                  Z- G d de	j+                  Z. G d de	j+                  Z/	 d=de	j+        dej0        d ej0        d!ej0        d"eej0                 d#e1d$e1fd%Z2 G d& d'e	j+                  Z3 G d( d)e          Z4 G d* d+e	j+                  Z5 G d, d-e	j+                  Z6e G d. d/e                      Z7 ed01           G d2 d3e7                      Z8 ed41           G d5 d6e7                      Z9d7ej0        d8ej0        fd9Z:e G d: d;e7                      Z;g d<Z<dS )>    N)	dataclass)AnyCallableOptional)nn   )ACT2FN)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplefilter_out_non_signature_kwargs)deprecate_kwarg)check_model_inputs   )Aimv2ConfigAimv2TextConfigAimv2VisionConfigc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeej                 ed<   dZeed<   dZeed	<   d
ee         fdZdS )Aimv2Outputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2VisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputreturnc                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))r$   r%   N)getattrto_tuple).0kselfs     |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/aimv2/modeling_aimv2.py	<genexpr>z'Aimv2Output.to_tuple.<locals>.<genexpr>K   sc       
 
  LLLDGGRYZ^`aRbRbRkRkRmRm
 
 
 
 
 
    )tuplekeysr-   s   `r.   r*   zAimv2Output.to_tupleJ   sC     
 
 
 
YY[[
 
 
 
 
 	
r0   )__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r    r!   r"   r#   r$   r   r%   r1   r   r*    r0   r.   r   r   ,   s          & )-D(5$
%,,,48hu0188837OXe/0777/3K%+,33304L(5,-4444818886:3:::
%* 
 
 
 
 
 
r0   r   RMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )Aimv2RMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z;
        Aimv2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parameterr8   onesweightvariance_epsilon)r-   hidden_sizeeps	__class__s      r.   rB   zAimv2RMSNorm.__init__S   sD     	l5:k#:#:;; #r0   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )N   T)keepdim)	dtypetor8   float32powmeanrsqrtrF   rE   )r-   hidden_statesinput_dtypevariances       r.   forwardzAimv2RMSNorm.forward[   s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r0   c                 H    t          | j        j                   d| j         S )Nz, eps=)r1   rE   shaperF   r3   s    r.   
extra_reprzAimv2RMSNorm.extra_reprb   s&    )**II$2GIIIr0   )r?   )r4   r5   r6   rB   rW   rZ   __classcell__rI   s   @r.   r>   r>   Q   sb        $ $ $ $ $ $; ; ;J J J J J J Jr0   r>   c                   $     e Zd Z fdZd Z xZS )Aimv2MLPc                    t                                                       || _        |j        | _        |j        | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _	        t          j        | j        | j        |j                  | _
        t          |j                 | _        d S )Nbias)rA   rB   configrG   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr-   rb   rI   s     r.   rB   zAimv2MLP.__init__g   s    !-!'!94#3T5KRXRabbby!143IPVP_```4#94;KRXRabbbV./r0   c                     |                      |                     |                     |                    |                     |          z            }|S N)rh   rj   rf   rg   )r-   xrh   s      r.   rW   zAimv2MLP.forwardq   sA    NN4;;t~~a/@/@#A#ADLLQROO#STT	r0   )r4   r5   r6   rB   rW   r[   r\   s   @r.   r^   r^   f   sG        0 0 0 0 0      r0   r^   c                        e Zd Zdef fdZedddej        fdej        fd            Z	dej        dej        fd	Z
 xZS )
Aimv2VisionEmbeddingsrb   c                    t                                                       || _        |j        | _        t	          j        |j        |j        |j        |j                  | _        t          |j        |j
                  | _        |j        |j        z  dz  }| j        j        st	          j        ||j                  | _        |                     dt#          j        |                              d          d           d S )N)kernel_sizestriderK   position_idsr   rL   F
persistent)rA   rB   rb   
patch_sizer   Conv2dnum_channelsrG   patch_embedr>   rms_norm_epsrms_norm
image_size	is_native	Embeddingposition_embeddingregister_bufferr8   arangeexpand)r-   rb   num_patchesrI   s      r.   rB   zAimv2VisionEmbeddings.__init__w   s     +9!3AR[a[l
 
 
 %V%79LMM(F,==!C{$ 	T&(l;@R&S&SD#^U\+-F-F-M-Mg-V-Vchiiiiir0      g     @cpur&   c                    t          j        t          |          ||          }t          j        t          |           ||          }t          j        ||d          \  }}|dz  }t          j        |||          |z  }	d||	z  z  }	|                                d         |	d d d f         z  }
|                                d         |	d d d f         z  }t          j        |
                                |
                                |                                |                                gd          d d d d d f         S )	NrN   devicexy)indexing   g      ?).Nr   dim)r8   r   intmeshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer   rN   grid_wgrid_hpos_dimomegaout_hout_ws               r.   "build_2d_sincos_position_embeddingz8Aimv2VisionEmbeddings.build_2d_sincos_position_embedding   s-    c%jjfEEEc&kkvFFFFFFq.WE&AAAGK{E)*  +eD!!!Gn<  +eD!!!Gn<|UYY[[%))++uyy{{EIIKKPVWXXXY]_`_`_`bcbcbcYcddr0   pixel_valuesc                    |                                 \  }}}}|                     |                              d                              dd          }|                     |          }| j        j        r?|                     || j        z  || j        z  | j        j	        |j
        |j                  }n|                     | j                  }||z   }|S )NrK   r   )r   r   rN   )sizer{   r   	transposer}   rb   r   r   rx   rG   r   rN   r   rt   )r-   r   _r   r   rT   	pos_embeds          r.   rW   zAimv2VisionEmbeddings.forward   s    *//111fe((66>>qAAKKAqQQm44;  		C??$/)(+1$+#) @  II //0ABBI%	1r0   )r4   r5   r6   r   rB   staticmethodr8   rP   Tensorr   rW   r[   r\   s   @r.   rp   rp   v   s        j0 j j j j j j !$'%u}e e	e e e \e EL U\        r0   rp   c            	            e Zd Zdef fdZ	 	 	 d	deej                 deej                 deej                 dej	        fdZ
 xZS )
Aimv2TextEmbeddingsrb   c                 V   t                                                       |j        }t          j        |j        |          | _        t          j        |j        |          | _        | 	                    dt          j        |j                                      d          d           d S )Nrt   ru   Frv   )rA   rB   rG   r   r   
vocab_sizetoken_embeddingmax_position_embeddingsr   r   r8   r   r   )r-   rb   r   rI   s      r.   rB   zAimv2TextEmbeddings.__init__   s    &	!|F,=yII"$,v/My"Y"Y 	EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
r0   N	input_idsrt   inputs_embedsr&   c                 .   ||j         d         n|j         d         }| j        j        j         d         }||k    rt          d| d|           || j        d d d |f         }||                     |          }|                     |          }||z   }|S )NrL   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rY   r   rE   
ValueErrorrt   r   )r-   r   rt   r   
seq_lengthmax_position_embeddingposition_embeddings
embeddingss           r.   rW   zAimv2TextEmbeddings.forward   s     -6,AY_R((}GZ[]G^
!%!8!?!Ea!H...VV V=SV V  
 ,QQQ^<L  00;;M"55lCC"%88
r0   NNN)r4   r5   r6   r   rB   r   r8   
LongTensorr9   r   rW   r[   r\   s   @r.   r   r      s        

 

 

 

 

 

 

 153759	 E,- u/0   12	
 
       r0   r           modulequerykeyvalueattention_maskscalingdropoutc                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )NrL   r   )r   rN   )ptrainingr   rK   )r8   matmulr   r   
functionalsoftmaxrP   rO   rN   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r.   eager_attention_forwardr      s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r0   c            
            e Zd ZdZ fdZ	 ddej        deej                 deej        eej                 f         fdZ	 xZ
S )	Aimv2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr`   )rA   rB   rb   rG   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   rd   qkv_biask_projv_projq_projout_projrk   s     r.   rB   zAimv2Attention.__init__   s0   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/iV_UUUiV_UUUiV_UUU	$.$.vWWWr0   NrT   r   r&   c           
         |j         \  }}}|                     |          }|                     |          }|                     |          }	|                    ||| j        | j                                      dd          }|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	t          }
| j	        j
        dk    rt          | j	        j
                 }
 |
| |||	|| j        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }||fS )z#Input shape: Batch x Time x Channelr   rK   eagerr   )r   r   r   )rY   r   r   r   viewr   r   r   r   rb   _attn_implementationr   r   r   r   r   reshaper   r   )r-   rT   r   r   
batch_sizer   r   queriesr2   valuesattention_interfacer   r   s                r.   rW   zAimv2Attention.forward   sy    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00L((r0   rm   )r4   r5   r6   r7   rB   r8   r   r   r1   rW   r[   r\   s   @r.   r   r      s        GGX X X X X, 26$) $)|$) !.$)
 
u|Xel33	4$) $) $) $) $) $) $) $)r0   r   c            	       v     e Zd Zdef fdZ	 d	dej        deej                 dee	         dej        fdZ
 xZS )
Aimv2EncoderLayerrb   c                    t                                                       t          |          | _        t	          |          | _        t          |j        |j                  | _	        t          |j        |j                  | _
        d S rm   )rA   rB   r   	attentionr^   ffnr>   rG   r|   	rms_norm1	rms_norm2rk   s     r.   rB   zAimv2EncoderLayer.__init__'  si    '//F##%f&8&:MNN%f&8&:MNNr0   NrT   r   r   r&   c                     |                      |          } | j        d||d|\  }}||z   }|                     |          }|                     |          }||z   }|S )N)rT   r   r;   )r   r   r   r   )r-   rT   r   r   norm_hidden_statesr   r   
mlp_outputs           r.   rW   zAimv2EncoderLayer.forward.  sy     "^^M::'r6HYgrrkqrrQ%3!^^M::XX011
%
2r0   rm   )r4   r5   r6   r   rB   r8   r   r   r   r   rW   r[   r\   s   @r.   r   r   &  s        O0 O O O O O O 26 | !. +,	
 
       r0   r   c                   r     e Zd ZdZdef fdZe	 d	deej	                 de
e         defd            Z xZS )
Aimv2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Aimv2EncoderLayer`].

    Args:
        config: Aimv2Config
    rb   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r;   )r   )r+   r   rb   s     r.   
<listcomp>z)Aimv2Encoder.__init__.<locals>.<listcomp>K  s"    $h$h$h1%6v%>%>$h$h$hr0   F)	rA   rB   rb   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingrk   s    `r.   rB   zAimv2Encoder.__init__H  s`    m$h$h$h$hfNfHgHg$h$h$hii&+###r0   Nr   r   r&   c                 N    |}| j         D ]} |||fi |}t          |          S )N)last_hidden_state)r   r   )r-   r   r   r   rT   encoder_layers         r.   rW   zAimv2Encoder.forwardO  sU     &![ 	 	M)M   MM ????r0   rm   )r4   r5   r6   r7   r   rB   r   r   r8   r   r   r   r   rW   r[   r\   s   @r.   r   r   ?  s         ,{ , , , , , ,  26@ @ !.@ +,	@
 
@ @ @ ^@ @ @ @ @r0   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )Aimv2AttentionPoolingHeadrb   c                    t                                                       |j        | _        |j        | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _	        t          j
        t          j        dd| j                            | _        t          j        | j        | j        d          | _        d S )Nr`   r   T)rA   rB   rG   r   r   r   rd   r   r   r   rC   r8   zeros	cls_tokenoutput_projrk   s     r.   rB   z"Aimv2AttentionPoolingHead.__init__b  s    !-3i 0$2BYYYi 0$2BYYYek!Q8H&I&IJJ9T%5t7GdSSSr0   rT   r&   c                    |j         \  }}}| j                            |dd          }|                     |                              ||| j        || j        z            }|                     |                              ||| j        || j        z            }|                    |d| j        || j        z            }|                    dddd          }|                    dddd          }|                    dddd          }t          j	        |||          }	|	
                    dd                              |d|          }	|	                    d          }	|                     |	          }
|
S )NrL   r   r   rK   r   r   )rY   r   r   r   r   r   r   permuteFscaled_dot_product_attentionr   rR   r   )r-   rT   r   seq_len
hidden_dimr   r   r   r   r   outputs              r.   rW   z!Aimv2AttentionPoolingHead.forwardm  s\   *7*='
GZN))*b"==	kk-((00WdnV`dhdrVrssM**22:wXbfjftXtuu!!*at~A]^^kk!Q1%%aAq))aAq))4UCGG!++Aq1199*aTT!&&1&--!!+..r0   )	r4   r5   r6   r   rB   r8   r   rW   r[   r\   s   @r.   r   r   a  sr        	T0 	T 	T 	T 	T 	T 	TU\ el        r0   r   c                   J     e Zd ZU dZeed<   dZdZg dZdZ	dZ
dZ fdZ xZS )Aimv2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    rb   aimv2T)r   r   rp   r   c                    t                                          |           t          |d          rTt          |j        t
          j                  r3|j        j                            t          j
        d                     d S d S t          |t                    r-|j        j                            d| j        j                   d S d S )Nlogit_scaleg$I$I,@r   )rR   std)rA   _init_weightshasattr
isinstancer  r   rC   datafill_mathlogr   r   normal_rb   initializer_range)r-   r   rI   s     r.   r  z"Aimv2PreTrainedModel._init_weights  s    f%%%6=)) 	W&,bl;; B"'--dhx.@.@AAAAAB B 9:: 	W!))s8U)VVVVV	W 	Wr0   )r4   r5   r6   r7   r   r:   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnr  r[   r\   s   @r.   r	  r	    s          
 &*#   NW W W W W W W W Wr0   r	  zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc            
            e Zd ZU eed<   dZeedZdef fdZ	de
j        fdZ edd	           ed
          e	 ddeej                 dee         defd                                    Z xZS )Aimv2VisionModelrb   r   rT   
attentionsc                 \   t                                          |           || _        t          |          | _        t          |          | _        t          |j        |j	                  | _
        |j        | _        | j        rt          |          | _        |                                  d S rm   )rA   rB   rb   rp   r   r   encoderr>   rG   r|   r}   use_headr   head	post_initrk   s     r.   rB   zAimv2VisionModel.__init__  s       /77#F++$V%79LMM= 	:1&99DIr0   r&   c                     | j         j        S rm   )r   r{   r3   s    r.   get_input_embeddingsz%Aimv2VisionModel.get_input_embeddings  s    **r0   r   zv4.58.0)versionFtie_last_hidden_statesNr   c                     |                      |          } | j        dd|i|}|j        }|                     |          }| j        r|                     |          nd}t          ||          S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```r   Nr   pooler_outputr;   )r   r#  r   r}   r$  r%  r   )r-   r   r   r   rT   encoder_outputsr   r.  s           r.   rW   zAimv2VisionModel.forward  s    : 55+74< ,
 ,
',
,
 ,

 ,= MM*;<<8<O		"34444)/'
 
 
 	
r0   rm   )r4   r5   r6   r   r:   main_input_namer   r   _can_record_outputsrB   r   Moduler(  r   r   r   r   r8   r   r   r   r   rW   r[   r\   s   @r.   r  r    s         $O*$ 
0      +bi + + + + _%y999u555 26)
 )
 !.)
 +,	)

 
$)
 )
 )
 ^ 65 :9)
 )
 )
 )
 )
r0   r  zJ
    The text model from AIMv2 without any head or projection on top.
    c            	            e Zd ZdZeedZdef fdZde	j
        fdZd Z ed	          e	 ddeej                 dee         defd                        Z xZS )Aimv2TextModelr   r   rb   c                 &   t                                          |           || _        t          |          | _        t          |          | _        t          |j        |j	                  | _
        |j        | _        |                                  d S rm   )rA   rB   rb   r   r   r   r#  r>   rG   r|   r}   eos_token_idr&  rk   s     r.   rB   zAimv2TextModel.__init__  sx       -f55#F++$V%79LMM"/r0   r&   c                     | j         j        S rm   r   r   r3   s    r.   r(  z#Aimv2TextModel.get_input_embeddings  s    ..r0   c                     || j         _        d S rm   r8  )r-   r   s     r.   set_input_embeddingsz#Aimv2TextModel.set_input_embeddings  s    */'''r0   Fr*  Nr   r   c                    |                      |          }|j        \  }}}t          j        |t          j        |j                  }|                    d                              |d          }	|t          | j	        ||	||d           } | j
        d	||d|}
|
j        }|                     |          }|t          j        |j        d         |j                  |                    t          j        |j                  | j        k                                                        d          f         }t#          ||          S )
Nr   r   rL   )rb   input_embedsrt   r   cache_positionpast_key_values)r   r   )r   r   r-  r;   )r   rY   r8   r   longr   	unsqueezer   r   rb   r#  r   r}   rO   r   r6  argmaxr   )r-   r   r   r   rT   r   r  r   r=  rt   r/  r   pooled_outputs                r.   rW   zAimv2TextModel.forward  sl    	22!.!4
GQgUZH\]]]%//2299*bII%/{*)-- $  N '$, 
')
 
 
 
 ,= MM*;<< *L*03<M<TUUU\\	2C2J\KKtO``eeggnnsunvvx

 */'
 
 
 	
r0   rm   )r4   r5   r6   r0  r   r   r1  r   rB   r   r2  r(  r:  r   r   r   r8   r   r   r   r   rW   r[   r\   s   @r.   r4  r4    s         "O +$ 
	 	 	 	 	 	 	/bi / / / /0 0 0 u555 26'
 '
 !.'
 +,	'

 
$'
 '
 '
 ^ 65'
 '
 '
 '
 '
r0   r4  tensorr&   c                     t          j        | d          }t          j        |dd          }t          j        |d          }|S )z
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    rK   rL   T)r   rM   g      ?)r8   rQ   sum)rC  square_tensor
sum_tensornormed_tensors       r.   _get_vector_normrI  7  sB    
 Ifa((M=b$???JIj#..Mr0   c                       e Zd ZU eed<   g dZdZdef fdZ e            e		 	 dde
j        dee
j                 dee
j                 d	e
j        fd
                        Z e            e		 dde
j        ded	e
j        fd                        Ze	e	 	 	 ddee
j                 dee
j                 dee
j                 dee         d	ef
d                        Z xZS )
Aimv2Modelrb   )r   r   rp   Tc                    t                                          |           |j        | _        |j        j        | _        |j        j        | _        t          	                    |j                  | _
        t          	                    |j                  | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        t%          j        | j        j                            | _        t/          j        |j                  | _        |                                  d S )NFr`   )rA   rB   projection_dimvision_configrG   vision_embed_dimtext_configtext_embed_dimr  _from_configvision_modelr4  
text_modelr   rd   visual_projectiontext_projectionrC   r8   rC  rb   logit_scale_init_valuer  r  r  max_logit_scalemax_log_logit_scaler&  rk   s     r.   rB   zAimv2Model.__init__H  s       $3 & 4 @$0<,99&:NOO(55f6HII!#4+@$BU\a!b!b!b!y)<d>QX]^^^<T[5W(X(XYY#'8F,B#C#C r0   Nr   r   rt   r&   c                 n    |                      |||          }|j        }|                     |          }|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`Aimv2TextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/aimv2-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)r   r   rt   )rT  r.  rV  )r-   r   r   rt   text_outputsrB  text_featuress          r.   get_text_featureszAimv2Model.get_text_featuresZ  sI    6 48??)% 4C 4
 4

 %2,,];;r0   Fr   interpolate_pos_encodingc                 l    |                      ||          }|j        }|                     |          }|S )an  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Aimv2VisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, Aimv2Model
        >>> from transformers.image_utils import load_image

        >>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/aimv2-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```)r   r^  )rS  r.  rU  )r-   r   r^  vision_outputsrB  image_featuress         r.   get_image_featureszAimv2Model.get_image_features  sH    < 6:5F5F%%= 6G 6
 6
 '4//>>r0   r   c                     | j         dd|i|} | j        d||d|}|j        }|                     |          }|j        }|                     |          }|t          |          z  }|t          |          z  }| j                            d| j                  	                                
                    |j                  }	|	|z  |                                z  }
|
                                }t          ||
||||          S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```r   )r   r   r   )r    r!   r"   r#   r$   r%   r;   )rS  rT  r.  rU  rV  rI  r  clamprY  exprO   r   tr   )r-   r   r   r   r   r`  r[  r#   r"   r  r!   r    s               r.   rW   zAimv2Model.forward  sK   > 6GT5F 6
 6
%6
6
 6

 4C4? 4
)4
 4
 4
 4
 &3--l;;"0**;77 $&6|&D&DD!$4[$A$AA&,,S$2JKKOOQQTTU`Ughh&48H8HH*,,..-+#%* .
 
 
 	
r0   )NN)Fr   )r4   r5   r6   r   r:   r  r  rB   r   r   r8   r   r   r9   r]  boolrb  r   r   r   r   r   rW   r[   r\   s   @r.   rK  rK  B  s        ]]]{      $ %$&& 26/3	! !<! !.! u|,	!
 
	! ! ! ^ '&!F %$&& */# #'# #'# 
		# # # ^ '&#J  154815	=
 =
E,-=
 u01=
 !.	=

 +,=
 
=
 =
 =
  ^=
 =
 =
 =
 =
r0   rK  )r  rK  r	  r4  )r   )=r  dataclassesr   typingr   r   r   r8   torch.nn.functionalr   r   r  activationsr	   integrationsr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr   utils.genericr   configuration_aimv2r   r   r   r   r2  r>   r^   rp   r   r   floatr   r   r   r   r   r	  r  r4  rI  rK  __all__r;   r0   r.   <module>rx     s  .  ! ! ! ! ! ! * * * * * * * * * *                 ! ! ! ! ! ! 7 7 7 7 7 7 / / / / / / 9 9 9 9 9 9 K K K K K K K K F F F F F F F F & & & & & & w w w w w w w w w w w w w w 0 0 0 0 0 0 / / / / / / P P P P P P P P P P  
  
  
  
  
+  
  
   
F Y''J J J J J29 J J ('J(    ry    1 1 1 1 1BI 1 1 1h% % % % %") % % %^ % %I%<% 
% <	%
 U\*% % % % % %.:) :) :) :) :)RY :) :) :)z    2   2@ @ @ @ @29 @ @ @D    	   D W W W W W? W W W8   
E
 E
 E
 E
 E
+ E
 E
 
E
P   
B
 B
 B
 B
 B
) B
 B
 
B
JU\ el     b
 b
 b
 b
 b
% b
 b
 b
J W
V
Vr0   