
     `in                     @   d Z ddlZddlmZ ddlZddlmc mZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*  G d de&          Z+ G d de%          Z, G d de$          Z- G d de*          Z. G d de"          Z/ G d de!          Z0 G d d ej1                  Z2 G d! d"e          Z3 G d# d$e(          Z4 G d% d&e          Z5 G d' d(e)          Z6 G d) d*ej1                  Z7e G d+ d,e                      Z8 ed-.           G d/ d0e8                      Z9 ed1.           G d2 d3e8                      Z:e G d4 d5e                      Z;g d6Z<dS )7z%Pytorch implementation of AIMv2 Model    N)Optional)nn   )create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )	CLIPModelCLIPTextEmbeddings_get_vector_norm)LlamaMLPLlamaRMSNorm)SiglipConfigSiglipTextConfigSiglipVisionConfig)SiglipAttentionSiglipEncoderSiglipOutputc                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddededededededededededededededef fdZ xZ	S ) Aimv2VisionConfiga  
    This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
    AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2816):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the Linear layers or Not.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the for initializing all weight matrices.
        use_head (`str`, *optional*, defaults to `True`):
            Whether to use Attention Pooling Head or Not.
        is_native (`str`, *optional*, defaults to `False`):
            Whether to use ckpt trained for image native resolution or not.
    Example:

    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel

    >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
    >>> configuration = Aimv2VisionConfig()

    >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
    >>> model = Aimv2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```            r         h㈵>        Fsilu{Gz?Thidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_channels
image_size
patch_sizerms_norm_epsattention_dropoutqkv_biasmlp_bias
hidden_actinitializer_rangeuse_head	is_nativec                      t                      j        d|||||||||
d	| || _        || _        |	| _        || _        |
| _        || _        || _        | `	d S )N)	r)   r*   r+   r,   r4   r-   r.   r/   r2    )
super__init__r6   r5   r1   r3   r2   r0   r7   layer_norm_eps)selfr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   kwargs	__class__s                    {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/aimv2/modular_aimv2.pyr;   zAimv2VisionConfig.__init__d   s    & 	 	
#// 3!%!!	
 	
 	
 	
 	
 !!2!2  ("    )r   r    r!   r"   r   r#   r$   r%   r&   FFr'   r(   TF)
__name__
__module____qualname____doc__intfloatboolstrr;   __classcell__r?   s   @r@   r   r   +   s       6 6t  !%!##$"#& #'!(  ( (  (  	( 
 !(  (  (  (  (  !(  (  (  (  !(  (   !(  (  (  (  (  (  (  (  (  ( rA   r   c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddededededededededededee         dee         dededef fdZ	 xZ
S ) Aimv2TextConfiga  
    This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
    AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`Aimv2Model`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the Linear layers or Not.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        pad_token_id (`int`, *optional*, defaults to 1):
            The id of the padding token in the vocabulary.
        bos_token_id (`int`, *optional*, defaults to 49406):
            The id of the beginning-of-sequence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 49407):
            The id of the end-of-sequence token in the vocabulary.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the for initializing all weight matrices.
                   r%   r&   Fr'   N  M   r(   
vocab_sizer)   r*   r+   r,   r0   r1   r2   r3   r4   pad_token_idbos_token_ideos_token_idmax_position_embeddingsr5   c                      t                      j        d||||||
||||d
| || _        || _        |	| _        || _        || _        | `| `| `	| `
d S )N)
rU   r)   r*   r+   r,   r4   rY   rV   rW   rX   r9   )r:   r;   r5   r1   r3   r2   r0   rW   rV   projection_sizer<   )r=   rU   r)   r*   r+   r,   r0   r1   r2   r3   r4   rV   rW   rX   rY   r5   r>   r?   s                    r@   r;   zAimv2TextConfig.__init__   s    & 	 	
!#// 3!$;%%%	
 	
 	
 	
 	
 "3!2  ( rA   )rN   rO   rP   rQ   rR   r%   r&   FFr'   NNrS   rT   r(   )rB   rC   rD   rE   rF   rG   rH   rI   r   r;   rJ   rK   s   @r@   rM   rM      s       + +^  !%!##$"#& &*&*!')"&!*  * *  *  	* 
 *  !*  *  !*  *  *  *  sm*  sm*  *  "%*    !*  *  *  *  *  *  *  *  *  * rA   rM   c                   &     e Zd ZdZ	 d fd	Z xZS )Aimv2Configa@  
    [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
    instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import Aimv2Config, Aimv2Model

    >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
    >>> configuration = Aimv2Config()

    >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
    >>> model = Aimv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
    >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

    >>> # Initializing a AIMv2Text and AIMv2Vision configuration
    >>> config_text = Aimv2TextConfig()
    >>> config_vision = Aimv2VisionConfig()

    >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
    ```N   /L
F@c                 l     t                      j        ||fi | || _        || _        d| _        | `d S )Ng      Y@)r:   r;   projection_dimlogit_scale_init_valuemax_logit_scaleinitializer_factor)r=   text_configvision_configra   rb   r>   r?   s         r@   r;   zAimv2Config.__init__  sJ     	m>>v>>>,&<#$###rA   )NNr^   r_   )rB   rC   rD   rE   r;   rJ   rK   s   @r@   r]   r]      sO        + +\ `f$ $ $ $ $ $ $ $ $ $rA   r]   c                       e Zd ZdS )Aimv2OutputNrB   rC   rD   r9   rA   r@   rh   rh   #          DrA   rh   c                       e Zd ZdS )Aimv2RMSNormNri   r9   rA   r@   rl   rl   '  rj   rA   rl   c                       e Zd ZdS )Aimv2MLPNri   r9   rA   r@   rn   rn   +  rj   rA   rn   c                        e Zd Zdef fdZedddej        fdej        fd            Z	dej        dej        fd	Z
 xZS )
Aimv2VisionEmbeddingsconfigc                    t                                                       || _        |j        | _        t	          j        |j        |j        |j        |j                  | _        t          |j        |j
                  | _        |j        |j        z  dz  }| j        j        st	          j        ||j                  | _        |                     dt#          j        |                              d          d           d S )N)kernel_sizestrider   position_ids)   F)
persistent)r:   r;   rq   r/   r   Conv2dr-   r)   patch_embedrl   r0   rms_normr.   r7   	Embeddingposition_embeddingregister_buffertorcharangeexpand)r=   rq   num_patchesr?   s      r@   r;   zAimv2VisionEmbeddings.__init__0  s     +9!3AR[a[l
 
 
 %V%79LMM(F,==!C{$ 	T&(l;@R&S&SD#^U\+-F-F-M-Mg-V-VchiiiiirA      g     @cpureturnc                    t          j        t          |          ||          }t          j        t          |           ||          }t          j        ||d          \  }}|dz  }t          j        |||          |z  }	d||	z  z  }	|                                d         |	d d d f         z  }
|                                d         |	d d d f         z  }t          j        |
                                |
                                |                                |                                gd          d d d d d f         S )	Ndtypedevicexy)indexing   g      ?).Nrv   dim)r   r   rF   meshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer   r   grid_wgrid_hpos_dimomegaout_hout_ws               r@   "build_2d_sincos_position_embeddingz8Aimv2VisionEmbeddings.build_2d_sincos_position_embedding>  s-    c%jjfEEEc&kkvFFFFFFq.WE&AAAGK{E)*  +eD!!!Gn<  +eD!!!Gn<|UYY[[%))++uyy{{EIIKKPVWXXXY]_`_`_`bcbcbcYcddrA   pixel_valuesc                    |                                 \  }}}}|                     |                              d                              dd          }|                     |          }| j        j        r?|                     || j        z  || j        z  | j        j	        |j
        |j                  }n|                     | j                  }||z   }|S )Nr   rv   )r   r   r   )sizerz   r   	transposer{   rq   r7   r   r/   r)   r   r   r}   ru   )r=   r   _r   r   hidden_states	pos_embeds          r@   forwardzAimv2VisionEmbeddings.forwardO  s    *//111fe((66>>qAAKKAqQQm44;  		C??$/)(+1$+#) @  II //0ABBI%	1rA   )rB   rC   rD   r   r;   staticmethodr   float32Tensorr   r   rJ   rK   s   @r@   rp   rp   /  s        j0 j j j j j j !$'%u}e e	e e e \e EL U\        rA   rp   c                       e Zd ZdS )Aimv2TextEmbeddingsNri   r9   rA   r@   r   r   c  rj   rA   r   c                        e Zd Z fdZ xZS )Aimv2Attentionc                    t                                          |           t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _	        d S )Nbias)
r:   r;   r   Linearr   r2   k_projv_projq_projout_projr=   rq   r?   s     r@   r;   zAimv2Attention.__init__h  s       iV_UUUiV_UUUiV_UUU	$.$.vWWWrA   )rB   rC   rD   r;   rJ   rK   s   @r@   r   r   g  sA        X X X X X X X X XrA   r   c            	       v     e Zd Zdef fdZ	 d	dej        deej                 dee	         dej        fdZ
 xZS )
Aimv2EncoderLayerrq   c                    t                                                       t          |          | _        t	          |          | _        t          |j        |j                  | _	        t          |j        |j                  | _
        d S N)r:   r;   r   	attentionrn   ffnrl   r)   r0   	rms_norm1	rms_norm2r   s     r@   r;   zAimv2EncoderLayer.__init__q  si    '//F##%f&8&:MNN%f&8&:MNNrA   Nr   attention_maskr>   r   c                     |                      |          } | j        d||d|\  }}||z   }|                     |          }|                     |          }||z   }|S )N)r   r   r9   )r   r   r   r   )r=   r   r   r>   norm_hidden_statesattn_outputr   
mlp_outputs           r@   r   zAimv2EncoderLayer.forwardx  sy     "^^M::'r6HYgrrkqrrQ%3!^^M::XX011
%
2rA   r   )rB   rC   rD   r   r;   r   r   r   r   r   r   rJ   rK   s   @r@   r   r   p  s        O0 O O O O O O 26 | !. +,	
 
       rA   r   c                       e Zd ZdS )Aimv2EncoderNri   r9   rA   r@   r   r     rj   rA   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )Aimv2AttentionPoolingHeadrq   c                    t                                                       |j        | _        |j        | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _	        t          j
        t          j        dd| j                            | _        t          j        | j        | j        d          | _        d S )Nr   rv   T)r:   r;   r)   r,   	num_headsr   r   r2   r   r   	Parameterr   zeros	cls_tokenoutput_projr   s     r@   r;   z"Aimv2AttentionPoolingHead.__init__  s    !-3i 0$2BYYYi 0$2BYYYek!Q8H&I&IJJ9T%5t7GdSSSrA   r   r   c                    |j         \  }}}| j                            |dd          }|                     |                              ||| j        || j        z            }|                     |                              ||| j        || j        z            }|                    |d| j        || j        z            }|                    dddd          }|                    dddd          }|                    dddd          }t          j	        |||          }	|	
                    dd                              |d|          }	|	                    d          }	|                     |	          }
|
S )Nrw   rv   r   r   r   r   )shaper   r   r   reshaper   r   permuteFscaled_dot_product_attentionr   meanr   )r=   r   
batch_sizeseq_len
hidden_dimr   keyvaluequeryr   outputs              r@   r   z!Aimv2AttentionPoolingHead.forward  s\   *7*='
GZN))*b"==	kk-((00WdnV`dhdrVrssM**22:wXbfjftXtuu!!*at~A]^^kk!Q1%%aAq))aAq))4UCGG!++Aq1199*aTT!&&1&--!!+..rA   )	rB   rC   rD   r   r;   r   r   r   rJ   rK   s   @r@   r   r     sr        	T0 	T 	T 	T 	T 	T 	TU\ el        rA   r   c                   J     e Zd ZU dZeed<   dZdZg dZdZ	dZ
dZ fdZ xZS )Aimv2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    rq   aimv2T)r   r   rp   r   c                    t                                          |           t          |d          rTt          |j        t
          j                  r3|j        j                            t          j
        d                     d S d S t          |t                    r-|j        j                            d| j        j                   d S d S )Nlogit_scaleg$I$I,@r&   )r   std)r:   _init_weightshasattr
isinstancer   r   r   datafill_mathlogr   r   normal_rq   r5   )r=   moduler?   s     r@   r   z"Aimv2PreTrainedModel._init_weights  s    f%%%6=)) 	W&,bl;; B"'--dhx.@.@AAAAAB B 9:: 	W!))s8U)VVVVV	W 	WrA   )rB   rC   rD   rE   r]   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnr   rJ   rK   s   @r@   r   r     s          
 &*#   NW W W W W W W W WrA   r   zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc            
            e Zd ZU eed<   dZeedZdef fdZ	de
j        fdZ edd	           ed
          e	 ddeej                 dee         defd                                    Z xZS )Aimv2VisionModelrq   r   r   
attentionsc                 \   t                                          |           || _        t          |          | _        t          |          | _        t          |j        |j	                  | _
        |j        | _        | j        rt          |          | _        |                                  d S r   )r:   r;   rq   rp   
embeddingsr   encoderrl   r)   r0   r{   r6   r   head	post_initr   s     r@   r;   zAimv2VisionModel.__init__  s       /77#F++$V%79LMM= 	:1&99DIrA   r   c                     | j         j        S r   )r   rz   r=   s    r@   get_input_embeddingsz%Aimv2VisionModel.get_input_embeddings  s    **rA   r   zv4.58.0)versionFtie_last_hidden_statesNr>   c                     |                      |          } | j        dd|i|}|j        }|                     |          }| j        r|                     |          nd}t          ||          S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```inputs_embedsNlast_hidden_statepooler_outputr9   )r   r   r  r{   r6   r   r	   )r=   r   r   r>   r   encoder_outputsr  r  s           r@   r   zAimv2VisionModel.forward  s    : 55+74< ,
 ,
',
,
 ,

 ,= MM*;<<8<O		"34444)/'
 
 
 	
rA   r   )rB   rC   rD   r   r   main_input_namer   r   _can_record_outputsr;   r   Moduler   r   r   r   r   r   r   r   r   r	   r   rJ   rK   s   @r@   r   r     s         $O*$ 
0      +bi + + + + _%y999u555 26)
 )
 !.)
 +,	)

 
$)
 )
 )
 ^ 65 :9)
 )
 )
 )
 )
rA   r   zJ
    The text model from AIMv2 without any head or projection on top.
    c            	            e Zd ZdZeedZdef fdZde	j
        fdZd Z ed	          e	 ddeej                 dee         defd                        Z xZS )Aimv2TextModel	input_idsr   rq   c                 &   t                                          |           || _        t          |          | _        t          |          | _        t          |j        |j	                  | _
        |j        | _        |                                  d S r   )r:   r;   rq   r   r   r   r   rl   r)   r0   r{   rX   r   r   s     r@   r;   zAimv2TextModel.__init__&  sx       -f55#F++$V%79LMM"/rA   r   c                     | j         j        S r   r   token_embeddingr   s    r@   r   z#Aimv2TextModel.get_input_embeddings1  s    ..rA   c                     || j         _        d S r   r  )r=   r   s     r@   set_input_embeddingsz#Aimv2TextModel.set_input_embeddings4  s    */'''rA   Fr   Nr   r>   c                    |                      |          }|j        \  }}}t          j        |t          j        |j                  }|                    d                              |d          }	|t          | j	        ||	||d           } | j
        d	||d|}
|
j        }|                     |          }|t          j        |j        d         |j                  |                    t          j        |j                  | j        k                                                        d          f         }t#          ||          S )
Nr   r   rw   )rq   input_embedsru   r   cache_positionpast_key_values)r  r   )r   r   r  r9   )r   r   r   r   longr   	unsqueezer   r   rq   r   r  r{   torF   rX   argmaxr	   )r=   r  r   r>   r   r   r   r   r  ru   r  r  pooled_outputs                r@   r   zAimv2TextModel.forward7  sl    	22!.!4
GQgUZH\]]]%//2299*bII%/{*)-- $  N '$, 
')
 
 
 
 ,= MM*;<< *L*03<M<TUUU\\	2C2J\KKtO``eeggnnsunvvx

 */'
 
 
 	
rA   r   )rB   rC   rD   r  r   r   r	  rM   r;   r   r
  r   r  r   r   r   r   r   r   r   r	   r   rJ   rK   s   @r@   r  r    s         "O +$ 
	 	 	 	 	 	 	/bi / / / /0 0 0 u555 26'
 '
 !.'
 +,	'

 
$'
 '
 '
 ^ 65'
 '
 '
 '
 '
rA   r  c                       e Zd ZdZdefdZee	 	 	 ddee	j
                 dee	j                 dee	j                 dee         d	ef
d
                        ZdS )
Aimv2ModelTrq   c                    t          j        | |           |j        | _        |j        j        | _        |j        j        | _        t          	                    |j                  | _
        t          	                    |j                  | _        t          j        | j        | j        d          | _        t          j        | j        | j        d          | _        t          j        t%          j        | j        j                            | _        t/          j        |j                  | _        |                                  d S )NFr   )r
   r;   ra   rf   r)   vision_embed_dimre   text_embed_dimr   _from_configvision_modelr  
text_modelr   r   visual_projectiontext_projectionr   r   tensorrq   rb   r   r   r   rc   max_log_logit_scaler   )r=   rq   s     r@   r;   zAimv2Model.__init__g  s     v...$3 & 4 @$0<,99&:NOO(55f6HII!#4+@$BU\a!b!b!b!y)<d>QX]^^^<T[5W(X(XYY#'8F,B#C#C rA   Nr  r   r   r>   r   c                     | j         dd|i|} | j        d||d|}|j        }|                     |          }|j        }|                     |          }|t          |          z  }|t          |          z  }| j                            d| j                  	                                
                    |j                  }	|	|z  |                                z  }
|
                                }t          ||
||||          S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```r   )r  r   r&   )logits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr9   )r#  r$  r  r%  r&  r   r   clampr(  expr  r   trh   )r=   r  r   r   r>   vision_outputstext_outputsr-  r,  r   r+  r*  s               r@   r   zAimv2Model.forwardy  sK   > 6GT5F 6
 6
%6
6
 6

 4C4? 4
)4
 4
 4
 4
 &3--l;;"0**;77 $&6|&D&DD!$4[$A$AA&,,S$2JKKOOQQTTU`Ughh&48H8HH*,,..-+#%* .
 
 
 	
rA   )NNN)rB   rC   rD   r   r]   r;   r   r   r   r   
LongTensorFloatTensorr   r   r   rh   r   r9   rA   r@   r  r  c  s        {    $  154815	=
 =
E,-=
 u01=
 !.	=

 +,=
 
=
 =
 =
  ^=
 =
 =
rA   r  )r]   r   rM   r   r  r   r  )=rE   r   typingr   r   torch.nn.functionalr   
functionalr   masking_utilsr   modeling_layersr   modeling_outputsr   r	   modeling_utilsr
   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   clip.modeling_clipr   r   r   llama.modeling_llamar   r   siglip.configuration_siglipr   r   r   siglip.modeling_siglipr   r   r   r   rM   r]   rh   rl   rn   r
  rp   r   r   r   r   r   r   r   r  r  __all__r9   rA   r@   <module>rG     s    , +                        / / / / / / 9 9 9 9 9 9 K K K K K K K K - - - - - - & & & & & &         
 1 0 0 0 0 0 / / / / / / P P P P P P P P P P 9 9 9 9 9 9 9 9 \ \ \ \ \ \ \ \ \ \ Q Q Q Q Q Q Q Q Q Qa  a  a  a  a * a  a  a HX  X  X  X  X & X  X  X v6$ 6$ 6$ 6$ 6$, 6$ 6$ 6$r	 	 	 	 	, 	 	 		 	 	 	 	< 	 	 		 	 	 	 	x 	 	 	1 1 1 1 1BI 1 1 1h	 	 	 	 	, 	 	 	X X X X X_ X X X    2   2	 	 	 	 	= 	 	 	    	   D W W W W W? W W W8   
E
 E
 E
 E
 E
+ E
 E
 
E
P   
B
 B
 B
 B
 B
) B
 B
 
B
J T
 T
 T
 T
 T
 T
 T
 T
n  rA   