
     `i߼                        d dl Z d dlZd dlmZ d dlmZmZmZmZ d dl	Z
d dlZd dlmZ d dlmc mZ d dlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- e e%d           G d de#                                  Z.e e%d           G d de#                                  Z/ee% G d de#                                  Z0 G d dej1                  Z2	 dRdej1        dej3        dej3        dej3        d eej3                 d!e4d"e4fd#Z5 G d$ d%ej1                  Z6 G d& d'ej1                  Z7 G d( d)e          Z8 G d* d+ej1                  Z9 G d, d-ej1                  Z:d. Z;	 dSd2ej3        d3e4d4e4d5e4d6e4d7ej3        fd8Z<dTd;Z=d< Z>d= Z?e% G d> d?e                      Z@ G d@ dAej1                  ZA G dB dCej1                  ZB e%dD           G dE dFe@                      ZC G dG dHej1                  ZD e%dI           G dJ dKe@                      ZEe% G dL dMe@                      ZF e%dN           G dO dPe@                      ZGg dQZHdS )U    N)	dataclass)AnyCallableOptionalUnion)_calculate_fan_in_and_fan_out   )ACT2FN)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplefilter_out_non_signature_kwargs)check_model_inputs   )Siglip2ConfigSiglip2TextConfigSiglip2VisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )Siglip2VisionOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r    r   torchFloatTensor__annotations__r!   r"   tupler#        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/siglip2/modeling_siglip2.pyr   r   +   s          
 15L(5,-44459x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>>>r-   r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )Siglip2TextOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr!   .r"   r#   )r$   r%   r&   r'   r1   r   r(   r)   r*   r!   r"   r+   r#   r,   r-   r.   r0   r0   =   s          
 04K%+,33359x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>>>r-   r0   c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeej                 ed<   dZeed<   dZeed	<   d
ee         fdZdS )Siglip2Outputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Siglip2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`Siglip2VisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Siglip2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Siglip2VisionModel`].
    Nlosslogits_per_imagelogits_per_textr1   r    text_model_outputvision_model_outputreturnc                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))r7   r8   N)getattrto_tuple).0kselfs     r.   	<genexpr>z)Siglip2Output.to_tuple.<locals>.<genexpr>n   sc       
 
  LLLDGGRYZ^`aRbRbRkRkRmRm
 
 
 
 
 
r-   )r+   keysr@   s   `r.   r=   zSiglip2Output.to_tuplem   sC     
 
 
 
YY[[
 
 
 
 
 	
r-   )r$   r%   r&   r'   r4   r   r(   r)   r*   r5   r6   r1   r    r7   r   r8   r+   r   r=   r,   r-   r.   r3   r3   O   s          & )-D(5$
%,,,48hu0188837OXe/0777/3K%+,33304L(5,-4444818886:3:::
%* 
 
 
 
 
 
r-   r3   c            	            e Zd Zdef fdZedej        dej        de	dej        fd            Z
dej        dej        dej        fd	Z xZS )
Siglip2VisionEmbeddingsconfigc                    t                                                       || _        |j        | _        |j        | _        t          j        |j        | j        z  | j        z  | j                  | _	        |j
        | _
        t          | j
        dz            | _        t          j        | j
        | j                  | _        d S )N)in_featuresout_featuresg      ?)super__init__rF   hidden_size	embed_dim
patch_sizennLinearnum_channelspatch_embeddingnum_patchesintposition_embedding_size	Embeddingposition_embeddingr@   rF   	__class__s     r.   rK   z Siglip2VisionEmbeddings.__init__u   s    + +!y+do=O 
  
  

 "-'*4+;S+@'A'A$"$,t/?"P"Pr-   positional_embeddingsspatial_shapes
max_lengthr9   c                 h   |j         d         }| j         d         }| j        }t          j        |||f| j        |          }|                     ddd                              d          } | j        j        dk    r|                     t          j	                  } t          |          D ]}||         \  }}	t          j        | ||	fddd	
          }
|
                    |||	z                                dd          }
|
                    |          }
|
||d||	z  f<   |
d         ||||	z  df<   |S )ac  
        Resize positional embeddings to image-specific size and pad to a fixed size.

        Args:
            positional_embeddings (`torch.Tensor`):
                Position embeddings of shape (height, width, embed_dim)
            spatial_shapes (`torch.LongTensor`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
            max_length (`int`):
                Maximum length of the positional embeddings to pad resized positional embeddings to

        Returns:
            `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
        r   )devicedtype   r   cpubilinearFT)sizemodealign_corners	antialiasN)shaper`   r(   emptyr_   permute	unsqueezetypetofloat32rangeFinterpolatereshape	transpose)rZ   r[   r\   
batch_sizerM   source_dtyperesulted_positional_embeddingsiheightwidthresized_embeddingss              r.   resize_positional_embeddingsz4Siglip2VisionEmbeddings.resize_positional_embeddings   s{   ( $)!,
)/3	,2).Y/(/*
 *
 *
& !6 = =aA F F P PQR S S !',55$9$<$<U]$K$K!z"" 	X 	XA*1-MFE!"%e_#" " " "4!;!;IvPU~!V!V!`!`abde!f!f "4!6!6|!D!DBT*1.>.>+>?BTUVBW*1fun.>.>+>??--r-   pixel_valuesc                     | j         j        j        }|                      |                    |                    }| j        j                            | j        | j        d          }|                     |||j        d                   }||z   }|S )aH  
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
            spatial_shapes (`list[tuple[int, int]]`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
        )r`   r^   r   )r\   )	rR   weightr`   rm   rW   rr   rU   r{   rh   )r@   r|   r[   target_dtypepatch_embedsrZ   resized_positional_embeddings
embeddingss           r.   forwardzSiglip2VisionEmbeddings.forward   s     +28++LOO,O,O,OPP !% 7 > F F($*F!
 !
 )-(I(I!>l>PQR>S )J )
 )
%
 "$AA
r-   )r$   r%   r&   r   rK   staticmethodr(   Tensor
LongTensorrT   r{   r)   r   __classcell__rY   s   @r.   rE   rE   t   s        Q2 Q Q Q Q Q Q 8.$|8.(8. 8. 
	8. 8. 8. \8.tE$5 uGW \a\h        r-   rE           modulequerykeyvalueattention_maskscalingdropoutc                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nr^   )dimr`   )ptrainingr   ra   )r(   matmulrs   rO   
functionalsoftmaxrn   rm   r`   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r.   eager_attention_forwardr      s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r-   c            
            e Zd ZdZ fdZ	 ddej        deej                 deej        eej                 f         fdZ	 xZ
S )	Siglip2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)rJ   rK   rF   rL   rM   num_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutr   	is_causalrO   rP   k_projv_projq_projout_projrX   s     r.   rK   zSiglip2Attention.__init__   s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr-   Nr"   r   r9   c           
         |j         \  }}}|                     |          }|                     |          }|                     |          }	|                    ||| j        | j                                      dd          }|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	t          }
| j	        j
        dk    rt          | j	        j
                 }
 |
| |||	|| j        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }||fS )z#Input shape: Batch x Time x Channelr   ra   eagerr   )r   r   r   )rh   r   r   r   viewr   r   rs   r   rF   _attn_implementationr   r   r   r   r   rr   r   r   )r@   r"   r   r   rt   
seq_lengthrM   queriesrB   valuesattention_interfacer   r   s                r.   r   zSiglip2Attention.forward  sy    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00L((r-   N)r$   r%   r&   r'   rK   r(   r   r   r+   r   r   r   s   @r.   r   r      s        GGB B B B B. 26$) $)|$) !.$)
 
u|Xel33	4$) $) $) $) $) $) $) $)r-   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )
Siglip2MLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S r   )rJ   rK   rF   r
   
hidden_actactivation_fnrO   rP   rL   intermediate_sizefc1fc2rX   s     r.   rK   zSiglip2MLP.__init__/  sf    #F$569V/1IJJ9V5v7IJJr-   r"   r9   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   )r@   r"   s     r.   r   zSiglip2MLP.forward6  s=    //**=99//r-   )r$   r%   r&   rK   r(   r   r   r   r   s   @r.   r   r   .  sc        K K K K KU\ el        r-   r   c            	            e Zd Zdeeef         f fdZedej	        dej	        de
e         dej        fd            Z xZS )Siglip2EncoderLayerrF   c                 D   t                                                       |j        | _        t	          j        | j        |j                  | _        t          |          | _	        t	          j        | j        |j                  | _
        t          |          | _        d S Neps)rJ   rK   rL   rM   rO   	LayerNormlayer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlprX   s     r.   rK   zSiglip2EncoderLayer.__init__>  s}    +<F<QRRR)&11<F<QRRRf%%r-   r"   r   r   r9   c                     |}|                      |          } | j        d||d|\  }}||z   }|}|                     |          }|                     |          }||z   }|S )N)r"   r   r,   )r   r   r   r   )r@   r"   r   r   residual_s         r.   r   zSiglip2EncoderLayer.forwardF  s     !((77)4> 
')
 
 
 
q
 !=0 ((77// =0r-   )r$   r%   r&   r   r   r   rK   r   r(   r   r   r   r)   r   r   r   s   @r.   r   r   =  s        &u%8:K%KL & & & & & & |  +,	
 
	   ^    r-   r   c                   r     e Zd ZdZdef fdZe	 d	deej	                 de
e         defd            Z xZS )
Siglip2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Siglip2EncoderLayer`].

    Args:
        config: Siglip2Config
    rF   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r,   )r   )r>   r   rF   s     r.   
<listcomp>z+Siglip2Encoder.__init__.<locals>.<listcomp>k  s"    $j$j$jQ%8%@%@$j$j$jr-   F)	rJ   rK   rF   rO   
ModuleListro   num_hidden_layerslayersgradient_checkpointingrX   s    `r.   rK   zSiglip2Encoder.__init__h  sa    m$j$j$j$j%PVPhJiJi$j$j$jkk&+###r-   Nr   r   r9   c                 N    |}| j         D ]} |||fi |}t          |          S )N)r!   )r   r   )r@   inputs_embedsr   r   r"   encoder_layers         r.   r   zSiglip2Encoder.forwardo  sU     &![ 	 	M)M   MM ????r-   r   )r$   r%   r&   r'   r   rK   r   r   r(   r   r   r   r   r   r   r   s   @r.   r   r   _  s         ,} , , , , , ,  26@ @ !.@ +,	@
 
@ @ @ ^@ @ @ @ @r-   r   c                        e Zd Zdef fdZe	 	 ddej        dej        dej	        de
e         de
e         d	efd
            Z xZS )Siglip2VisionTransformerrF   c                 j   t                                                       || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        t          |d          sdn|j        | _        | j        rt          |          | _        d S d S )Nr   vision_use_headT)rJ   rK   rF   rL   rE   r   r   encoderrO   r   r   post_layernormhasattrr   use_head$Siglip2MultiheadAttentionPoolingHeadheadr@   rF   rM   rY   s      r.   rK   z!Siglip2VisionTransformer.__init__  s    &	1&99%f-- l9&:OPPP$+F4E$F$FbFLb= 	E<VDDDIII	E 	Er-   Nr|   r   r[   output_attentionsoutput_hidden_statesr9   c                    ||n| j         j        }||n| j         j        }|                     ||          }|&| j         j        dk    rt          ||j                  }n|}|                     ||||          }|j        }	| 	                    |	          }	| j
        r|                     |	|          nd}
t          |	|
|j        |j                  S )z
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        Nflash_attention_2)r   r   r   r   )r!   pooler_outputr"   r#   )rF   r   r   r   r   r   r`   r   r!   r   r   r   r   r"   r#   )r@   r|   r   r[   r   r   r"   encoder_attention_maskencoder_outputsr!   r   s              r.   r   z Siglip2VisionTransformer.forward  s    2C1N--TXT_Tq$8$D  $+Jj 	 nEE%$+*JNa*a*a%?P]Pc%d%d""%3"+/<<'1/!5	 ,8 ,
 ,
 ,= //0ABBHL_		"3^DDD[_)/')7&1	
 
 
 	
r-   NN)r$   r%   r&   r   rK   r   r(   r)   r   r   r   boolr   r   r   r   s   @r.   r   r     s        
E2 
E 
E 
E 
E 
E 
E  -1/3*
 *
'*
 *
 (	*

 $D>*
 'tn*
 
$*
 *
 *
 ^*
 *
 *
 *
 *
r-   r   c                    d }||d|z  z
  k     s||d|z  z   k    rt          j        dd            |||z
  |z            } |||z
  |z            }|                     d|z  dz
  d|z  dz
             |                                  |                     |t          j        d          z             |                     |           |                     ||           d S )Nc                 `    dt          j        | t          j        d          z            z   dz  S )N      ?       @)matherfsqrt)xs    r.   norm_cdfz _trunc_normal_.<locals>.norm_cdf  s)    dhq49S>>1222c99r-   ra   zjmean is more than 2 std from [a, b] in nn.init.trunc_normal_. The distribution of values may be incorrect.)
stacklevelr   r   )minmax)	warningswarnuniform_erfinv_mul_r   r   add_clamp_)tensormeanstdabr   lus           r.   _trunc_normal_r    s   : : : 	q1s7{q1s7{ 2 2;	
 	
 	
 	
 	!d(c!""A!d(c!""A OOAEAIq1uqy))) NN KKdinn$%%%
KK MMaQMr-   r          r   r  r  r  r  r  r9   c                     t          j                    5  t          | dd||           |                     |                              |           ddd           dS # 1 swxY w Y   dS )an  Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(	ext{mean}, 	ext{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq 	ext{mean} \leq b`.

    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsequently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    r   r   N)r(   no_gradr  r   r   )r  r  r  r  r  s        r.   trunc_normal_tf_r    s    * 
 $ $vq#q!,,,Cd###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   <AA!$A!fan_innormalc                 l   t          |           \  }}|dk    r|}n|dk    r|}n|dk    r||z   dz  }||z  }|dk    r(t          | t          j        |          dz             d S |dk    rVt	          j                    5  |                     t          j        |                     d d d            d S # 1 swxY w Y   d S |d	k    r\t          j        d
|z            }t	          j                    5  |                     | |           d d d            d S # 1 swxY w Y   d S t          d|           )Nr  fan_outfan_avgra   truncated_normalg۶%?r  r  uniformr	   zinvalid distribution )	r   r  r   r   r(   r  normal_r   r   )	r  r   re   distributionr  r  denomvariancebounds	            r.   variance_scaling_r    s   3F;;OFGx						'!Q&u}H)))TYx%8%8;N%NOOOOOO		!	!]__ 	4 	4NNty22N333	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4		"	"	!h,'']__ 	+ 	+OOUFE***	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ ???@@@s$   ?)B55B9<B92DDDc                 *    t          | dd           d S )Nr  r  re   r  r  r  s    r.   lecun_normal_r    s    f8:LMMMMMMr-   c                 *    t          | dd           d S )Nr  r  r  r  r  s    r.   default_flax_embed_initr!    s    f8(CCCCCCr-   c                   J    e Zd ZU eed<   dZdZg dZdZdZ	dZ
dZeedZd ZdS )Siglip2PreTrainedModelrF   siglip2T)Siglip2TextEmbeddingsrE   r   r   )r"   r#   c                 2
   t          |t                    ryt          | j        t                    r| j        j        j        n| j        j        }t          j                            |j	        j
        dt          j        |          z             dS t          |t          j                  rt          |j
                   dS t          |t                    rJt          j                            |j        j
                   t          j                            |j        j
                   t          j                            |j        j
                   t          j                            |j        j
                   t          j                            |j        j                   t          j                            |j        j                   t          j                            |j        j                   t          j                            |j        j                   dS t          |t.                    rt          j                            |j        j
                   t          j                            |j        j
                   t          j                            |j        j        d           t          j                            |j        j        d           dS t          |t4                    rt          j                            |j        j                   t          j                            |j        j        j                   t          j                            |j        j        j                   dS t          |t@                    retC          j"        tC          j#        d                    }|j$        j        %                    |           |j&        j        '                                 dS t          |tP                    rLt          j                            |j)        j
        | j        j        j        dz  | j        j*        z             dS t          |t          j+        t          j,        f          rCt[          |j
                   |j        &t          j                            |j                   dS dS t          |t          j.                  r?|j        j        '                                 |j
        j        %                    d           dS dS )zInitialize the weightsr   r  gư>r   r   N)/
isinstancerE   rF   r   vision_configrL   rO   initr  rW   r~   npr   rV   r!  r   xavier_uniform_r   r   r   r   zeros_biasr   r   r   r   probedata	attentionin_proj_weightin_proj_biasSiglip2Modelr(   logr  logit_scalefill_
logit_biaszero_Siglip2ForImageClassification
classifierinitializer_factorrP   Conv2dr  r   )r@   r   ry   logit_scale_inits       r.   _init_weightsz$Siglip2PreTrainedModel._init_weights1  s   f566 *	* dk=99-)55[, 
 GOOF5<!bgennBTOUUUUU-- #	*#FM22222 011 !	*G##FM$8999G##FM$8999G##FM$8999G##FO$:;;;GNN6=-...GNN6=-...GNN6=-...GNN6?/00000
++ 	*G##FJ$5666G##FJ$5666GOOFJOO666GOOFJOO66666 DEE 	*G##FL$5666G##F$4$C$HIIIGNN6+8=>>>>>-- 	*$yc):):;;#))*:;;;"((***** =>> 	*GOO!(K-94?$+B``       BI 677 	*&-((({&v{+++++ '&-- 	*K""$$$M$$S)))))	* 	*r-   N)r$   r%   r&   r   r*   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr>  r,   r-   r.   r#  r#    s}         !&*#    N"& -& 
,* ,* ,* ,* ,*r-   r#  c            	            e Zd Zdef fdZ	 	 	 d	deej                 deej                 deej                 dej	        fdZ
 xZS )
r%  rF   c                 V   t                                                       |j        }t          j        |j        |          | _        t          j        |j        |          | _        | 	                    dt          j        |j                                      d          d           d S )Nposition_ids)r   r^   F)
persistent)rJ   rK   rL   rO   rV   
vocab_sizetoken_embeddingmax_position_embeddingsrW   register_bufferr(   arangeexpandr   s      r.   rK   zSiglip2TextEmbeddings.__init__a  s    &	!|F,=yII"$,v/My"Y"Y 	EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
r-   N	input_idsrI  r   r9   c                 .   ||j         d         n|j         d         }| j        j        j         d         }||k    rt          d| d|           || j        d d d |f         }||                     |          }|                     |          }||z   }|S )Nr^   r   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rh   rW   r~   r   rI  rL  )r@   rQ  rI  r   r   max_position_embeddingposition_embeddingsr   s           r.   r   zSiglip2TextEmbeddings.forwardm  s     -6,AY_R((}GZ[]G^
!%!8!?!Ea!H...VV V=SV V  
 ,QQQ^<L  00;;M"55lCC"%88
r-   NNN)r$   r%   r&   r   rK   r   r(   r   r)   r   r   r   r   s   @r.   r%  r%  `  s        

0 

 

 

 

 

 

 153759	 E,- u/0   12	
 
       r-   r%  c                        e Zd Zdef fdZee	 	 	 d
deej	                 deej	                 deej	                 de
e         def
d	                        Z xZS )Siglip2TextTransformerrF   c                 2   t                                                       || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        t          j        ||j                  | _        d S r   )rJ   rK   rF   rL   r%  r   r   r   rO   r   r   final_layer_normrP   projection_sizer   r   s      r.   rK   zSiglip2TextTransformer.__init__  sz    &	/77%f-- "YF<Q R R RIi)?@@			r-   NrQ  r   rI  r   r9   c                    |t          d          |                                }|                    d|d                   }|                     ||          }d| j        j        v }|rd }n||st          ||j                  } | j        d||d|}|j	        }	| 
                    |	          }	|	d d dd d f         }
|                     |
          }
t          |	|
          S )NzYou have to specify input_idsr^   )rQ  rI  flash)r   r   )r!   r   r,   )r   rd   r   r   rF   r   r   r`   r   r!   rY  r   r   )r@   rQ  r   rI  r   input_shaper"   uses_flash_attentionr   r!   pooled_outputs              r.   r   zSiglip2TextTransformer.forward  s)    <===nn&&NN2{277	),WW  '$+*JJ 	]!NN'0D'7H[\\N+74< ,
'),
 ,
 ,
 ,
 ,= 112CDD *!!!R(3		-00)/'
 
 
 	
r-   rU  )r$   r%   r&   r   rK   r   r   r   r(   r   r   r   r   r   r   r   s   @r.   rW  rW    s        A0 A A A A A A  -115/3	(
 (
EL)(
 !.(
 u|,	(

 +,(
 
$(
 (
 (
 ^ (
 (
 (
 (
 (
r-   rW  zL
    The text model from Siglip2 without any head or projection on top.
    c                        e Zd ZU eed<   def fdZdej        fdZd Z	 e
d          e	 	 	 dd	eej                 d
eej                 deej                 dee         def
d                        Z xZS )Siglip2TextModelrF   c                     t                                          |           t          |          | _        |                                  d S r   )rJ   rK   rW  
text_model	post_initrX   s     r.   rK   zSiglip2TextModel.__init__  s@       088r-   r9   c                 $    | j         j        j        S r   rc  r   rL  rC   s    r.   get_input_embeddingsz%Siglip2TextModel.get_input_embeddings  s    )99r-   c                 (    || j         j        _        d S r   rf  )r@   r   s     r.   set_input_embeddingsz%Siglip2TextModel.set_input_embeddings  s    5:"222r-   Ftie_last_hidden_statesNrQ  r   rI  r   c                 $     | j         d|||d|S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, Siglip2TextModel

        >>> model = Siglip2TextModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```rQ  r   rI  r,   )rc  )r@   rQ  r   rI  r   s        r.   r   zSiglip2TextModel.forward  s7    4 t 
)%
 
 	
 
 	
r-   rU  )r$   r%   r&   r   r*   rK   rO   Modulerg  ri  r   r   r   r(   r   r   r   r   r   r   r   s   @r.   ra  ra    s         0      :bi : : : :; ; ; u555 -115/3	
 
EL)
 !.
 u|,	

 +,
 
$
 
 
 ^ 65
 
 
 
 
r-   ra  c                   h     e Zd ZdZdef fdZd	dej        deej                 dej        fdZ	 xZ
S )
r   zMultihead Attention Pooling.rF   c                    t                                                       t          j        t	          j        dd|j                            | _        t          j                            |j        |j	        d          | _
        t          j        |j        |j                  | _        t          |          | _        |j	        | _        d S )Nr   T)batch_firstr   )rJ   rK   rO   	Parameterr(   randnrL   r.  MultiheadAttentionr   r0  r   r   	layernormr   r   r   rX   s     r.   rK   z-Siglip2MultiheadAttentionPoolingHead.__init__  s    \%+aF4F"G"GHH
44V5GIcqu4vvf&8f>STTTf%%3r-   Nhidden_stater   r9   c                    |j         d         }| j                            |dd          }|d|j         d         |j         d         }}t          ||j        |          }|                    d| j        |d          }|                    d||          }|                     ||||          d         }|}|                     |          }|| 	                    |          z   }|d d df         S )Nr   r   r^   )	attn_mask)
rh   r.  repeatr   r`   r   rr   r0  ru  r   )r@   rv  r   rt   r.  
target_len
source_lenr   s           r.   r   z,Siglip2MultiheadAttentionPoolingHead.forward  s    !'*

!!*a33%%*[^\5G5J
J7HZ\fggN+221dnjRSTTN+33B
JOON~~e\<Sa~bbcde~~l33$((<"8"88AAAqD!!r-   r   )r$   r%   r&   r'   r   rK   r(   r   r   r   r   r   s   @r.   r   r     s        &&42 4 4 4 4 4 4" "EL "(5<BX "didp " " " " " " " "r-   r   zN
    The vision model from Siglip2 without any head or projection on top.
    c                        e Zd ZU eed<   dZdef fdZdej        fdZ	 e
d          e	 	 ddej        d	ej        d
ej        dee         dee         defd                        Z xZS )Siglip2VisionModelrF   r|   c                     t                                          |           t          |          | _        |                                  d S r   )rJ   rK   r   vision_modelrd  rX   s     r.   rK   zSiglip2VisionModel.__init__  sC       4V<< 	r-   r9   c                 $    | j         j        j        S r   )r  r   rR   rC   s    r.   rg  z'Siglip2VisionModel.get_input_embeddings&  s     +;;r-   Frj  Npixel_attention_maskr[   r   r   c                 6    |                      |||||          S )a9  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```r|   r   r[   r   r   )r  )r@   r|   r  r[   r   r   s         r.   r   zSiglip2VisionModel.forward)  s1    F   %/)/!5 ! 
 
 	
r-   r   )r$   r%   r&   r   r*   main_input_namerK   rO   rn  rg  r   r   r(   r)   r   r   r   r   r   r   r   r   s   @r.   r}  r}    s          $O2      <bi < < < < u555 -1/3'
 '
''
 $l'
 (	'

 $D>'
 'tn'
 
$'
 '
 '
 ^ 65'
 '
 '
 '
 '
r-   r}  c                   f    e Zd ZU eed<   def fdZ e            e	 	 ddej	        de
ej	                 de
ej	                 dej        fd                        Z e            e	 	 	 dd	e
ej                 d
e
ej	                 de
ej                 dej        fd                        Zee	 	 	 	 	 	 	 	 	 dde
ej                 d	e
ej                 d
e
ej	                 de
ej                 de
ej	                 de
ej                 de
e         de
e         de
e         defd                        Z xZS )r3  rF   c                    t                                          |           t          |j        t                    s%t          dt          |j                   d          t          |j        t                    s%t          dt          |j                   d          |j        }|j        }t          
                    |          }t          
                    |          }|j        | _        |j        | _        t          j        t!          j        d                    | _        t          j        t!          j        d                    | _        |                                  d S )NzNconfig.text_config is expected to be of type Siglip2TextConfig but is of type .zRconfig.vision_config is expected to be of type Siglip2VisionConfig but is of type r   )rJ   rK   r'  text_configr   	TypeErrorrl   r(  r   ra  _from_configr}  rc  r  rO   rr  r(   rs  r5  r7  rd  )r@   rF   r  r(  rc  r  rY   s         r.   rK   zSiglip2Model.__init__Y  sM      &,.?@@ 	0+,,0 0 0  
 &.0CDD 	2-..2 2 2  
 (, &22;??
)66}EE %/(5<A77,u{1~~66 	r-   NrQ  r   rI  r9   c                 D    |                      |||          }|j        }|S )aM  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`Siglip2TextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```rm  )rc  r   )r@   rQ  r   rI  text_outputsr_  s         r.   get_text_featureszSiglip2Model.get_text_featuresy  s6    6 48??)% 4C 4
 4

 %2r-   r|   r  r[   c                 D    |                      |||          }|j        }|S )a  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Siglip2VisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```
        )r|   r   r[   )r  r   )r@   r|   r  r[   vision_outputsr_  s         r.   get_image_featureszSiglip2Model.get_image_features  s9    J 6:5F5F%/) 6G 6
 6

 '4r-   return_lossr   r   c
           	         ||n| j         j        }|	|	n| j         j        }	|                     |||||	          }
|                     |||||	          }|
j        }|j        }||                    ddd          z  }||                    ddd          z  }t          j        ||	                                
                    |j                            }| j        
                    |j                  | j        
                    |j                  }}||                                z  |z   }|	                                }d}|rt          j        |                    d          |j        	          }t          j        |           d|z  z   }t          j        j                            ||z            }t          j        |d
           }|                                }t/          |||||||
          S )a  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```
        Nr  )rQ  r   rI  r   r   ra   r^   T)r   r   keepdimr   )r_   r   )r4   r5   r6   r1   r    r7   r8   )rF   r   r   r  rc  r   normr(   r   trm   r_   r5  r7  expeyerd   	ones_likerO   r   
logsigmoidsumr  r3   )r@   rQ  r|   r  r[   r   rI  r  r   r   r  r  r    r1   r6   r5  r7  r5   r4   r  m1_diag1logliknlls                          r.   r   zSiglip2Model.forward  s"   d 2C1N--TXT_Tq$8$D  $+Jj 	 6:5F5F%/)/!5 6G 6
 6
 48??)%/!5 4C 4
 4
 &3"0 $l&7&7!T&7&R&RR!K$4$4qb$$4$O$OO  ,{LNN4D4D4G4GHZ4[4[\\"&"2"5"5k6H"I"I4?K]K]^i^pKqKqZ)KOO,=,==
J*,,.. 	)O0033O<RSSSC8881s7BHX(33H4NOOF9V,,,,C88::D-+#%* .
 
 
 	
r-   r   rU  )	NNNNNNNNN)r$   r%   r&   r   r*   rK   r   r   r(   r   r   r)   r  r   r  r   r   r3   r   r   r   s   @r.   r3  r3  U  sO        }      @ %$&& 26/3	   <  !.  u|,	 
 
	      ^ '& D %$&& 597;59	* *u01* 'u|4* !!12	*
 
	* * * ^ '&*Z  15487;591537&*,0/3e
 e
E,-e
 u01e
 'u|4	e

 !!12e
 !.e
 u/0e
 d^e
 $D>e
 'tne
 
e
 e
 e
 ^ e
 e
 e
 e
 e
r-   r3  z
    Siglip2 vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                        e Zd ZdZdeddf fdZee	 	 	 	 	 	 ddee	j
                 dee	j
                 dee	j                 dee	j
                 d	ee         d
ee         defd                        Z xZS )r9  r|   rF   r9   Nc                 n   t                                          |           |j        | _        t                              |j                  }|j        | _        |j        dk    r$t          j        |j        j	        |j                  nt          j
                    | _        |                                  d S )Nr   )rJ   rK   
num_labelsr}  r  r(  r  rO   rP   rL   Identityr:  rd  )r@   rF   r  rY   s      r.   rK   z&Siglip2ForImageClassification.__init__?  s        + *66v7KLL(5 OUN_bcNcNcBIf*68IJJJikitiviv 	
 	r-   r  r[   labelsr   r   c                    ||n| j         j        }||n| j         j        }|                     |||||          }|j        }|Q|d                             |j                  }	t          j        ||	z  d          t          j        |	d          z  }nt          j	        |d          }| 
                    |          }
d}||                     ||
| j                   }t          ||
|j        |j                  S )a  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Siglip2ForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a `Siglip2Model` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip2-base-patch16-224")
        >>> model = Siglip2ForImageClassification.from_pretrained("google/siglip2-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```
        N)r   r[   r   r   ).Nr   r  )r4   logitsr"   r#   )rF   r   r   r  r!   rm   r_   r(   r  r  r:  loss_functionr   r"   r#   )r@   r|   r  r[   r  r   r   outputssequence_output	pool_maskr  r4   s               r.   r   z%Siglip2ForImageClassification.forwardQ  s4   ^ 2C1N--TXT_Tq$8$D  $+Jj 	 /3.?.?/)/!5 /@ /
 /
 "3  +,Y7::?;QRRI#i)(CKKKeiXaghNiNiNiiOO#ja@@@O 11%%ffdkBBD$!/)	
 
 
 	
r-   )NNNNNN)r$   r%   r&   r  r   rK   r   r   r   r(   r   r   r   r   r   r   r   s   @r.   r9  r9  6  s        %O}       $  047;59)-,0/3O
 O
u|,O
 'u|4O
 !!12	O

 &O
 $D>O
 'tnO
 
O
 O
 O
 ^ O
 O
 O
 O
 O
r-   r9  )r3  r#  ra  r}  r9  )r   )r   r   r	  r   )r   r  r  )Ir   r   dataclassesr   typingr   r   r   r   numpyr*  r(   torch.nnrO   torch.nn.functionalr   rp   torch.nn.initr   activationsr
   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   configuration_siglip2r   r   r   r   r0   r3   rn  rE   r   floatr   r   r   r   r   r   r  r  r  r  r!  r#  r%  rW  ra  r   r}  r3  r9  __all__r,   r-   r.   <module>r     s  *   ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1                     7 7 7 7 7 7 ! ! ! ! ! ! B B B B B B 9 9 9 9 9 9 b b b b b b b b b b F F F F F F F F & & & & & & w w w w w w w w w w w w w w / / / / / / X X X X X X X X X X   
	? 	? 	? 	? 	?+ 	? 	?  	?   
	? 	? 	? 	? 	? 	? 	?  	?  
  
  
  
  
K  
  
   
Fb b b b bbi b b bX % %I%<% 
% <	%
 U\*% % % % % %.;) ;) ;) ;) ;)ry ;) ;) ;)|           4   D@ @ @ @ @RY @ @ @D8
 8
 8
 8
 8
ry 8
 8
 8
v!  !  ! J \_$ $L$ %$27$BG$SX$
\$ $ $ $4A A A A2N N ND D D A* A* A* A* A*_ A* A* A*H% % % % %BI % % %P5
 5
 5
 5
 5
RY 5
 5
 5
p   
.
 .
 .
 .
 .
- .
 .
 
.
b" " " " "29 " " ">   
8
 8
 8
 8
 8
/ 8
 8
 
8
v ]
 ]
 ]
 ]
 ]
) ]
 ]
 ]
@   f
 f
 f
 f
 f
$: f
 f
 f
R  r-   