
     `i]                        d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
ZddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ d Z,	 dTdej-        de.de.de.de.dej-        fdZ/dUd Z0d! Z1d" Z2e e"d#$           G d% d&e                                   Z3e e"d'$           G d( d)e                                   Z4ee" G d* d+e                                   Z5 G d, d-ej6                  Z7 G d. d/ej6                  Z8	 dVd0ej6        d1ej-        d2ej-        d3ej-        d4eej-                 d5e.d6e.fd7Z9 G d8 d9ej6                  Z: G d: d;ej6                  Z; G d< d=e          Z<e" G d> d?e                      Z= G d@ dAej6                  Z> G dB dCej6                  Z? e"dD$           G dE dFe=                      Z@ G dG dHej6                  ZA G dI dJej6                  ZB e"dK$           G dL dMe=                      ZCe" G dN dOe=                      ZD e"dP$           G dQ dRe=                      ZEg dSZFdS )WzPyTorch Siglip model.    N)	dataclass)AnyCallableOptionalUnion)nn)_calculate_fan_in_and_fan_out   )ACT2FN)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplefilter_out_non_signature_kwargs	torch_int)check_model_inputs   )SiglipConfigSiglipTextConfigSiglipVisionConfigc                    d }||d|z  z
  k     s||d|z  z   k    rt          j        dd            |||z
  |z            } |||z
  |z            }|                     d|z  dz
  d|z  dz
             |                                  |                     |t          j        d          z             |                     |           |                     ||           d S )Nc                 `    dt          j        | t          j        d          z            z   dz  S )N      ?       @)matherfsqrt)xs    ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/siglip/modeling_siglip.pynorm_cdfz _trunc_normal_.<locals>.norm_cdf0   s)    dhq49S>>1222c99       zjmean is more than 2 std from [a, b] in nn.init.trunc_normal_. The distribution of values may be incorrect.)
stacklevelr   r"   )minmax)	warningswarnuniform_erfinv_mul_r#   r%   add_clamp_)tensormeanstdabr(   lus           r'   _trunc_normal_r<   -   s   : : : 	q1s7{q1s7{ 2 2;	
 	
 	
 	
 	!d(c!""A!d(c!""A OOAEAIq1uqy))) NN KKdinn$%%%
KK MMaQMr)           r!          r"   r5   r6   r7   r8   r9   returnc                     t          j                    5  t          | dd||           |                     |                              |           ddd           dS # 1 swxY w Y   dS )an  Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(	ext{mean}, 	ext{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq 	ext{mean} \leq b`.

    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsequently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    r   r!   N)torchno_gradr<   r2   r3   )r5   r6   r7   r8   r9   s        r'   trunc_normal_tf_rC   Q   s    * 
 $ $vq#q!,,,Cd###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $s   <AA!$A!fan_innormalc                 l   t          |           \  }}|dk    r|}n|dk    r|}n|dk    r||z   dz  }||z  }|dk    r(t          | t          j        |          dz             d S |dk    rVt	          j                    5  |                     t          j        |                     d d d            d S # 1 swxY w Y   d S |d	k    r\t          j        d
|z            }t	          j                    5  |                     | |           d d d            d S # 1 swxY w Y   d S t          d|           )NrD   fan_outfan_avgr*   truncated_normalg۶%?r7   rE   uniformr
   zinvalid distribution )	r	   rC   r#   r%   rA   rB   normal_r0   
ValueError)	r5   scalemodedistributionrD   rG   denomvariancebounds	            r'   variance_scaling_rT   k   s   3F;;OFGx						'!Q&u}H)))TYx%8%8;N%NOOOOOO		!	!]__ 	4 	4NNty22N333	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4 	4		"	"	!h,'']__ 	+ 	+OOUFE***	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ ???@@@s$   ?)B55B9<B92DDDc                 *    t          | dd           d S )NrD   rI   rO   rP   rT   r5   s    r'   lecun_normal_rY      s    f8:LMMMMMMr)   c                 *    t          | dd           d S )NrD   rE   rV   rW   rX   s    r'   default_flax_embed_initr[      s    f8(CCCCCCr)   z}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )SiglipVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r_   r   rA   FloatTensor__annotations__r`   ra   tuplerb    r)   r'   r^   r^      s          
 15L(5,-44459x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>>>r)   r^   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )SiglipTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr`   .ra   rb   )rc   rd   re   rf   rm   r   rA   rg   rh   r`   ra   ri   rb   rj   r)   r'   rl   rl      s          
 04K%+,33359x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>>>r)   rl   c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeej                 ed<   dZeed<   dZeed	<   d
ee         fdZdS )SiglipOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipVisionModel`].
    Nlosslogits_per_imagelogits_per_textrm   r_   text_model_outputvision_model_outputr?   c                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))rs   rt   N)getattrto_tuple).0kselfs     r'   	<genexpr>z(SiglipOutput.to_tuple.<locals>.<genexpr>   sc       
 
  LLLDGGRYZ^`aRbRbRkRkRmRm
 
 
 
 
 
r)   )ri   keysr{   s   `r'   rx   zSiglipOutput.to_tuple   sC     
 
 
 
YY[[
 
 
 
 
 	
r)   )rc   rd   re   rf   rp   r   rA   rg   rh   rq   rr   rm   r_   rs   r   rt   ri   r   rx   rj   r)   r'   ro   ro      s          & )-D(5$
%,,,48hu0188837OXe/0777/3K%+,33304L(5,-4444818886:3:::
%* 
 
 
 
 
 
r)   ro   c                   v     e Zd Zdef fdZdej        dededej        fdZdd	ej	        dej        fd
Z
 xZS )SiglipVisionEmbeddingsconfigc                    t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        |j	        | j        | j        | j        d          | _
        | j        | j        z  dz  | _        | j        | _        t          j        | j        | j                  | _        |                     dt!          j        | j                                      d          d           d S )Nvalid)in_channelsout_channelskernel_sizestridepaddingr*   position_idsr   F
persistent)super__init__r   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferrA   arangeexpandr{   r   	__class__s     r'   r   zSiglipVisionEmbeddings.__init__   s    + + +!y+? 
  
  
 !Ot>1D!-"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr)   
embeddingsheightwidthr?   c                 ~   |j         d         }| j        j        j         d         }t          j                                        s&||k    r ||k    r|                     | j                  S | j        j                            d          }|j         d         }|| j        z  }|| j        z  }	t          |dz            }
|
                    d|
|
|          }|                    dddd          }t          j                            |||	fdd	          }|                    dddd                              dd|          }|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   r   g      ?r
   r*   bicubicF)sizerO   align_corners)shaper   weightrA   jit
is_tracingr   	unsqueezer   r   reshapepermuter   
functionalinterpolateview)r{   r   r   r   r   r   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r'   interpolate_pos_encodingz/SiglipVisionEmbeddings.interpolate_pos_encoding   sL    !&q)/6<Q? y##%% 	>+*F*F6UZ??**4+<===18BB1EEr"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNr)   Fpixel_valuesc                 X   |j         \  }}}}| j        j        j        }|                     |                    |                    }|                    d                              dd          }|r||                     |||          z   }n||                     | j	                  z   }|S )N)dtyper*   r   )
r   r   r   r   toflatten	transposer   r   r   )	r{   r   r   _r   r   target_dtypepatch_embedsr   s	            r'   forwardzSiglipVisionEmbeddings.forward  s    *01fe+28++LOO,O,O,OPP!))!,,66q!<<
# 	Q#d&C&CJPVX]&^&^^JJ#d&=&=d>O&P&PPJr)   F)rc   rd   re   r   r   rA   Tensorintr   rg   r   __classcell__r   s   @r'   r   r      s        q1 q q q q q q($5< $ $UX $]b]i $ $ $ $L
 
E$5 
Z_Zf 
 
 
 
 
 
 
 
r)   r   c            	            e Zd Zdef fdZ	 	 	 d	deej                 deej                 deej                 dej	        fdZ
 xZS )
SiglipTextEmbeddingsr   c                 V   t                                                       |j        }t          j        |j        |          | _        t          j        |j        |          | _        | 	                    dt          j        |j                                      d          d           d S )Nr   r   Fr   )r   r   r   r   r   
vocab_sizetoken_embeddingmax_position_embeddingsr   r   rA   r   r   r{   r   r   r   s      r'   r   zSiglipTextEmbeddings.__init__"  s    &	!|F,=yII"$,v/My"Y"Y 	EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
r)   N	input_idsr   inputs_embedsr?   c                 .   ||j         d         n|j         d         }| j        j        j         d         }||k    rt          d| d|           || j        d d d |f         }||                     |          }|                     |          }||z   }|S )Nr   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   r   r   rM   r   r   )r{   r   r   r   
seq_lengthmax_position_embeddingposition_embeddingsr   s           r'   r   zSiglipTextEmbeddings.forward.  s     -6,AY_R((}GZ[]G^
!%!8!?!Ea!H...VV V=SV V  
 ,QQQ^<L  00;;M"55lCC"%88
r)   NNN)rc   rd   re   r   r   r   rA   
LongTensorrg   r   r   r   r   s   @r'   r   r   !  s        

/ 

 

 

 

 

 

 153759	 E,- u/0   12	
 
       r)   r   modulequerykeyvalueattention_maskscalingdropoutc                    t          j        ||                    dd                    |z  }|||z   }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nr   r   )r   r   )ptrainingr   r*   )rA   matmulr   r   r   softmaxfloat32r   r   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r'   eager_attention_forwardr   I  s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K$$r)   c            
            e Zd ZdZ fdZ	 ddej        deej                 deej        eej                 f         fdZ	 xZ
S )	SiglipAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)r   r   r   r   r   num_attention_heads	num_headshead_dimrM   rN   attention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r'   r   zSiglipAttention.__init__c  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr)   Nra   r   r?   c           
         |j         \  }}}|                     |          }|                     |          }|                     |          }	|                    ||| j        | j                                      dd          }|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	t          }
| j	        j
        dk    rt          | j	        j
                 }
 |
| |||	|| j        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }||fS )z#Input shape: Batch x Time x Channelr   r*   eagerr=   )r   r   r   )r   r   r   r   r   r   r   r   r   r   _attn_implementationr   r   rN   r   r   r   r   r   )r{   ra   r   r   
batch_sizer   r   queriesr}   valuesattention_interfacer   r   s                r'   r   zSiglipAttention.forwardw  sy    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00L((r)   N)rc   rd   re   rf   r   rA   r   r   ri   r   r   r   s   @r'   r   r   `  s        GGB B B B B. 26$) $)|$) !.$)
 
u|Xel33	4$) $) $) $) $) $) $) $)r)   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )	SiglipMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S r   )r   r   r   r   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   s     r'   r   zSiglipMLP.__init__  sf    #F$569V/1IJJ9V5v7IJJr)   ra   r?   c                     |                      |          }|                     |          }|                     |          }|S r   )r  r  r  )r{   ra   s     r'   r   zSiglipMLP.forward  s=    //**=99//r)   )rc   rd   re   r   rA   r   r   r   r   s   @r'   r   r     sc        K K K K KU\ el        r)   r   c            	            e Zd Zdeeef         f fdZedej	        dej	        de
e         dej        fd            Z xZS )SiglipEncoderLayerr   c                 D   t                                                       |j        | _        t	          j        | j        |j                  | _        t          |          | _	        t	          j        | j        |j                  | _
        t          |          | _        d S Neps)r   r   r   r   r   	LayerNormlayer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr   s     r'   r   zSiglipEncoderLayer.__init__  s}    +<F<QRRR(00<F<QRRRV$$r)   ra   r   r   r?   c                     |}|                      |          } | j        d||d|\  }}||z   }|}|                     |          }|                     |          }||z   }|S )N)ra   r   rj   )r  r  r  r  )r{   ra   r   r   residualr   s         r'   r   zSiglipEncoderLayer.forward  s     !((77)4> 
')
 
 
 
q
 !=0 ((77// =0r)   )rc   rd   re   r   r   r   r   r   rA   r   r   r   rg   r   r   r   s   @r'   r  r    s        %u%79I%IJ % % % % % % |  +,	
 
	   ^    r)   r  c                   J    e Zd ZU eed<   dZdZg dZdZdZ	dZ
dZeedZd ZdS )SiglipPreTrainedModelr   siglipT)r   r   r  #SiglipMultiheadAttentionPoolingHead)ra   rb   c                 2
   t          |t                    ryt          | j        t                    r| j        j        j        n| j        j        }t          j                            |j	        j
        dt          j        |          z             dS t          |t          j                  rt          |j
                   dS t          |t                    rJt          j                            |j        j
                   t          j                            |j        j
                   t          j                            |j        j
                   t          j                            |j        j
                   t          j                            |j        j                   t          j                            |j        j                   t          j                            |j        j                   t          j                            |j        j                   dS t          |t.                    rt          j                            |j        j
                   t          j                            |j        j
                   t          j                            |j        j        d           t          j                            |j        j        d           dS t          |t4                    rt          j                            |j        j                   t          j                            |j        j        j                   t          j                            |j        j        j                   dS t          |t@                    retC          j"        tC          j#        d                    }|j$        j        %                    |           |j&        j        '                                 dS t          |tP                    rLt          j                            |j)        j
        | j        j        j        dz  | j        j*        z             dS t          |t          j+        t          j,        f          rCt[          |j
                   |j        &t          j                            |j                   dS dS t          |t          j.                  r?|j        j        '                                 |j
        j        %                    d           dS dS )zInitialize the weightsr   rJ   gư>r!   r   N)/
isinstancer   r   r   vision_configr   r   initrL   r   r   npr%   r   r[   r   xavier_uniform_r   r   r   r   zeros_biasr   r  r  r  probedata	attentionin_proj_weightin_proj_biasSiglipModelrA   logr5   logit_scalefill_
logit_biaszero_SiglipForImageClassification
classifierinitializer_factorr   r   rY   r  )r{   r   r   logit_scale_inits       r'   _init_weightsz#SiglipPreTrainedModel._init_weights  s   f455 *	* dk<88-)55[, 
 GOOF5<!bgennBTOUUUUU-- #	*#FM2222200 !	*G##FM$8999G##FM$8999G##FM$8999G##FO$:;;;GNN6=-...GNN6=-...GNN6=-...GNN6?/00000	** 	*G##FJ$5666G##FJ$5666GOOFJOO666GOOFJOO66666 CDD 	*G##FL$5666G##F$4$C$HIIIGNN6+8=>>>>>,, 	*$yc):):;;#))*:;;;"((***** <== 	*GOO!(K-94?$+B``       BI 677 	*&-((({&v{+++++ '&-- 	*K""$$$M$$S)))))	* 	*r)   N)rc   rd   re   r   rh   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr  r   _can_record_outputsr0  rj   r)   r'   r  r    s}          &*#    N"& ,% 
,* ,* ,* ,* ,*r)   r  c                   r     e Zd ZdZdef fdZe	 d	deej	                 de
e         defd            Z xZS )
SiglipEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    Args:
        config: SiglipConfig
    r   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rj   )r  )ry   r   r   s     r'   
<listcomp>z*SiglipEncoder.__init__.<locals>.<listcomp>"  s"    $i$i$iA%7%?%?$i$i$ir)   F)	r   r   r   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr   s    `r'   r   zSiglipEncoder.__init__  s`    m$i$i$i$ivOgIhIh$i$i$ijj&+###r)   Nr   r   r?   c                 N    |}| j         D ]} |||fi |}t          |          S )N)r`   )rA  r   )r{   r   r   r   ra   encoder_layers         r'   r   zSiglipEncoder.forward&  sU     &![ 	 	M)M   MM ????r)   r   )rc   rd   re   rf   r   r   r   r   rA   r   r   r   r   r   r   r   s   @r'   r:  r:    s         ,| , , , , , ,  26@ @ !.@ +,	@
 
@ @ @ ^@ @ @ @ @r)   r:  c                        e Zd Zdef fdZee	 	 	 d
deej	                 deej	                 deej	                 de
e         def
d	                        Z xZS )SiglipTextTransformerr   c                 2   t                                                       || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        t          j        ||j                  | _        d S r
  )r   r   r   r   r   r   r:  encoderr   r  r  final_layer_normr   projection_sizeheadr   s      r'   r   zSiglipTextTransformer.__init__9  sz    &	.v66$V,, "YF<Q R R RIi)?@@			r)   Nr   r   r   r   r?   c                    |t          d          |                                }|                    d|d                   }|                     ||          }d| j        j        v }|rd }n||st          ||j                  } | j        d||d|}|j	        }	| 
                    |	          }	|	d d dd d f         }
|                     |
          }
t          |	|
          S )NzYou have to specify input_idsr   )r   r   flash)r   r   r`   pooler_outputrj   )rM   r   r   r   r   r   r   r   rH  r`   rI  rK  r   )r{   r   r   r   r   input_shapera   uses_flash_attentionencoder_outputsr`   pooled_outputs              r'   r   zSiglipTextTransformer.forwardC  s)    <===nn&&NN2{277	),WW  '$+*JJ 	]!NN'0D'7H[\\N+74< ,
'),
 ,
 ,
 ,
 ,= 112CDD *!!!R(3		-00)/'
 
 
 	
r)   r   )rc   rd   re   r   r   r   r   r   rA   r   r   r   r   r   r   r   s   @r'   rF  rF  8  s        A/ A A A A A A  -115/3	(
 (
EL)(
 !.(
 u|,	(

 +,(
 
$(
 (
 (
 ^ (
 (
 (
 (
 (
r)   rF  zK
    The text model from SigLIP without any head or projection on top.
    c                        e Zd ZU eed<   def fdZdej        fdZd Z	 e
d          e	 	 	 dd	eej                 d
eej                 deej                 dee         def
d                        Z xZS )SiglipTextModelr   c                     t                                          |           t          |          | _        |                                  d S r   )r   r   rF  
text_model	post_initr   s     r'   r   zSiglipTextModel.__init__x  s@       /77r)   r?   c                 $    | j         j        j        S r   rW  r   r   r~   s    r'   get_input_embeddingsz$SiglipTextModel.get_input_embeddings~  s    )99r)   c                 (    || j         j        _        d S r   rZ  )r{   r   s     r'   set_input_embeddingsz$SiglipTextModel.set_input_embeddings  s    5:"222r)   Ftie_last_hidden_statesNr   r   r   r   c                 $     | j         d|||d|S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, SiglipTextModel

        >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   r   rj   )rW  )r{   r   r   r   r   s        r'   r   zSiglipTextModel.forward  s7    4 t 
)%
 
 	
 
 	
r)   r   )rc   rd   re   r   rh   r   r   Moduler[  r]  r   r   r   rA   r   r   r   r   r   r   r   s   @r'   rU  rU  p  s         /      :bi : : : :; ; ; u555 -115/3	
 
EL)
 !.
 u|,	

 +,
 
$
 
 
 ^ 65
 
 
 
 
r)   rU  c                   d     e Zd Zdef fdZe	 ddee         dee	         de
fd            Z xZS )	SiglipVisionTransformerr   c                 j   t                                                       || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        t          |d          sdn|j        | _        | j        rt          |          | _        d S d S )Nr  vision_use_headT)r   r   r   r   r   r   r:  rH  r   r  r  post_layernormhasattrrf  use_headr  rK  r   s      r'   r   z SiglipVisionTransformer.__init__  s    &	088$V,, l9&:OPPP$+F4E$F$FbFLb= 	D;FCCDIII	D 	Dr)   Fr   r   r?   c                     |                      ||          } | j        dd|i|}|j        }|                     |          }| j        r|                     |          nd }t          ||          S )N)r   r   rN  rj   )r   rH  r`   rg  ri  rK  r   )r{   r   r   r   ra   rR  r`   rO  s           r'   r   zSiglipVisionTransformer.forward  s     Oghh+74< ,
 ,
',
,
 ,

 ,= //0ABB8<O		"34444)/'
 
 
 	
r)   r   )rc   rd   re   r   r   r   r   boolr   r   r   r   r   r   s   @r'   rd  rd    s        
D1 
D 
D 
D 
D 
D 
D  49
 
 #+4.
 +,	

 
$
 
 
 ^
 
 
 
 
r)   rd  c                   .     e Zd ZdZdef fdZd Z xZS )r  zMultihead Attention Pooling.r   c                    t                                                       t          j        t	          j        dd|j                            | _        t          j                            |j        |j	        d          | _
        t          j        |j        |j                  | _        t          |          | _        d S )Nr   T)batch_firstr  )r   r   r   	ParameterrA   randnr   r!  MultiheadAttentionr   r#  r  r  	layernormr   r  r   s     r'   r   z,SiglipMultiheadAttentionPoolingHead.__init__  s    \%+aF4F"G"GHH
44V5GIcqu4vvf&8f>STTTV$$r)   c                    |j         d         }| j                            |dd          }|                     |||          d         }|}|                     |          }||                     |          z   }|d d df         S )Nr   r   )r   r!  repeatr#  rr  r  )r{   hidden_stater   r!  r  s        r'   r   z+SiglipMultiheadAttentionPoolingHead.forward  s    !'*

!!*a33~~e\<HHK~~l33$((<"8"88AAAqD!!r)   )rc   rd   re   rf   r   r   r   r   r   s   @r'   r  r    sZ        &&%1 % % % % % %
" 
" 
" 
" 
" 
" 
"r)   r  zM
    The vision model from SigLIP without any head or projection on top.
    c            	            e Zd ZU eed<   dZdef fdZdej        fdZ	 e
d          e	 dded	ee         defd
                        Z xZS )SiglipVisionModelr   r   c                     t                                          |           t          |          | _        |                                  d S r   )r   r   rd  vision_modelrX  r   s     r'   r   zSiglipVisionModel.__init__  sC       3F;; 	r)   r?   c                 $    | j         j        j        S r   )ry  r   r   r~   s    r'   r[  z&SiglipVisionModel.get_input_embeddings  s     +;;r)   Fr^  r   r   c                 "     | j         d||d|S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, SiglipVisionModel

        >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```r   r   rj   )ry  )r{   r   r   r   s       r'   r   zSiglipVisionModel.forward  s5    : !t  
%%=
 
 
 
 	
r)   r   )rc   rd   re   r   rh   main_input_namer   r   rb  r[  r   r   rk  r   r   r   r   r   r   s   @r'   rw  rw    s          $O1      <bi < < < < u555 */
 
 #'
 +,	

 
$
 
 
 ^ 65
 
 
 
 
r)   rw  c                       e Zd ZU eed<   def fdZ e            e	 	 ddej	        de
ej	                 de
ej	                 dej        fd                        Z e            e	 dd
ej        dedee         dej        fd                        Zee	 	 	 	 	 	 dde
ej                 d
e
ej                 de
ej	                 de
ej                 de
e         dedee         defd                        Z xZS )r&  r   c                    t                                          |           t          |j        t                    s%t          dt          |j                   d          t          |j        t                    s%t          dt          |j                   d          |j        }|j        }t          
                    |          }t          
                    |          }|j        | _        |j        | _        t          j        t!          j        d                    | _        t          j        t!          j        d                    | _        |                                  d S )NzMconfig.text_config is expected to be of type SiglipTextConfig but is of type .zQconfig.vision_config is expected to be of type SiglipVisionConfig but is of type r   )r   r   r  text_configr   	TypeErrortyper  r   rU  _from_configrw  rW  ry  r   ro  rA   rp  r(  r*  rX  )r{   r   r  r  rW  ry  r   s         r'   r   zSiglipModel.__init__   sM      &,.>?? 	0+,,0 0 0  
 &.0BCC 	2-..2 2 2  
 (, %11+>>
(55mDD %/(5<A77,u{1~~66 	r)   Nr   r   r   r?   c                 D    |                      |||          }|j        }|S )aJ  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`SiglipTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```ra  )rW  rO  )r{   r   r   r   text_outputsrS  s         r'   get_text_featureszSiglipModel.get_text_features@  s6    6 48??)% 4C 4
 4

 %2r)   Fr   r   r   c                 4     | j         d||d|}|j        }|S )ah  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`SiglipVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```r|  rj   )ry  rO  )r{   r   r   r   vision_outputsrS  s         r'   get_image_featureszSiglipModel.get_image_featuresd  sC    > 6GT5F 6
%%=6
 6
 6
 6

 '4r)   return_lossc           	          | j         d||d|} | j        d|||d|}	|j        }
|	j        }|
|
                    ddd          z  }
||                    ddd          z  }t	          j        ||
                                                    |j                            }| j	                            |j                  | j
                            |j                  }}||                                z  |z   }|                                }d}|rt	          j        |                    d          |j        	          }t	          j        |           d|z  z   }t          j        j                            ||z            }t	          j        |d
           }|                                }t)          |||||
|	|          S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```r|  ra  r*   r   T)r   r   keepdimNr   )devicer   )rp   rq   rr   rm   r_   rs   rt   rj   )ry  rW  rO  normrA   r   tr   r  r(  r*  expeyer   	ones_liker   r   
logsigmoidsumr6   ro   )r{   r   r   r   r   r  r   r   r  r  r_   rm   rr   r(  r*  rq   rp   r  m1_diag1logliknlls                        r'   r   zSiglipModel.forward  s   T 6GT5F 6
%%=6
 6
 6
 6
 4C4? 4
)%4
 4
 	4
 4
 &3"0 $l&7&7!T&7&R&RR!K$4$4qb$$4$O$OO  ,{LNN4D4D4G4GHZ4[4[\\"&"2"5"5k6H"I"I4?K]K]^i^pKqKqZ)KOO,=,==
J*,,.. 	)O0033O<RSSSC8881s7BHX(33H4NOOF9V,,,,C88::D-+#%* .
 
 
 	
r)   )NNr   )NNNNNF)rc   rd   re   r   rh   r   r   r   rA   r   r   rg   r  rk  r   r   r  r   r   ro   r   r   r   s   @r'   r&  r&    s        |      @ %$&& 26/3	   <  !.  u|,	 
 
	      ^ '& D %$&& */$ $'$ #'$ +,	$
 
	$ $ $ ^ '&$N  15481537&*).U
 U
E,-U
 u01U
 !.	U

 u/0U
 d^U
 #'U
 +,U
 
U
 U
 U
 ^ U
 U
 U
 U
 U
r)   r&  z
    SigLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                        e Zd ZdZdeddf fdZee	 	 	 ddee	j
                 dee	j
                 ded	ee         def
d
                        Z xZS )r,  r   r   r?   Nc                 n   t                                          |           |j        | _        t                              |j                  }|j        | _        |j        dk    r$t          j        |j        j	        |j                  nt          j
                    | _        |                                  d S )Nr   )r   r   
num_labelsrw  r  r  ry  r   r   r   Identityr-  rX  )r{   r   ry  r   s      r'   r   z%SiglipForImageClassification.__init__  s        + )55f6JKK(5 OUN_bcNcNcBIf*68IJJJikitiviv 	
 	r)   Flabelsr   r   c                      | j         |fd|i|}|j        }t          j        |d          }|                     |          }d}||                     ||| j                  }t          ||          S )a$  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, SiglipForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a `SiglipModel` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
        >>> model = SiglipForImageClassification.from_pretrained("google/siglip-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```r   r   r  N)rp   logits)ry  r`   rA   r6   r-  loss_functionr   r   )	r{   r   r  r   r   outputssequence_outputr  rp   s	            r'   r   z$SiglipForImageClassification.forward  s    P /@d.?/
 /
%=/
 /
 /
 "3  *_!<<<11%%ffdkBBD$
 
 
 	
r)   )NNF)rc   rd   re   r}  r   r   r   r   r   rA   r   rk  r   r   r   r   r   r   s   @r'   r,  r,    s         %O|       $  04)-).	:
 :
u|,:
 &:
 #'	:

 +,:
 
:
 :
 :
 ^ :
 :
 :
 :
 :
r)   r,  )r&  r  rU  rw  r,  )r=   r!   r>   r"   )r!   rD   rE   )r=   )Grf   r#   r.   dataclassesr   typingr   r   r   r   numpyr  rA   r   torch.nn.initr	   activationsr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   configuration_siglipr   r   r   r<   r   floatrC   rT   rY   r[   r^   rl   ro   rb  r   r   r   r   r   r  r  r:  rF  rU  rd  r  rw  r&  r,  __all__rj   r)   r'   <module>r     s       ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1            7 7 7 7 7 7 ! ! ! ! ! ! B B B B B B 9 9 9 9 9 9 b b b b b b b b b b F F F F F F F F & & & & & &                0 / / / / / T T T T T T T T T T!  !  ! J \_$ $L$ %$27$BG$SX$
\$ $ $ $4A A A A2N N ND D D   	? 	? 	? 	? 	?k 	? 	?  	?   	? 	? 	? 	? 	?K 	? 	?  	?  
  
  
  
  
;  
  
   
FE E E E ERY E E ER% % % % %29 % % %^ % %I%<% 
% <	%
 U\*% % % % % %.;) ;) ;) ;) ;)bi ;) ;) ;)~    	       3   D A* A* A* A* A*O A* A* A*J@ @ @ @ @BI @ @ @D5
 5
 5
 5
 5
BI 5
 5
 5
p   
.
 .
 .
 .
 .
+ .
 .
 
.
b#
 #
 #
 #
 #
bi #
 #
 #
L" " " " "") " " "0   
0
 0
 0
 0
 0
- 0
 0
 
0
f G
 G
 G
 G
 G
' G
 G
 G
T   Q
 Q
 Q
 Q
 Q
#8 Q
 Q
 Q
h  r)   