
     `i                     4   d Z ddlmZ ddlmZmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZmZmZmZmZ ddlm Z m!Z!m"Z"  ej#        e$          Z%dej&        dej&        fdZ'dej&        dej&        fdZ(dej&        dej&        fdZ)e ed           G d de                                  Z*e ed           G d de                                  Z+ee G d de                                  Z, G d d e	j-                  Z. G d! d"e	j-                  Z/	 	 dLd%e	j-        d&ej&        d'ej&        d(ej&        d)eej&                 d*e0d+e0d,e1fd-Z2 G d. d/e	j-                  Z3 G d0 d1e	j-                  Z4 G d2 d3e          Z5e G d4 d5e                      Z6 G d6 d7e	j-                  Z7 G d8 d9e	j-                  Z8 ed:           G d; d<e6                      Z9 G d= d>e	j-                  Z: ed?           G d@ dAe6                      Z;e G dB dCe6                      Z<e G dD dEe6                      Z=e G dF dGe6                      Z> edH           G dI dJe6                      Z?g dKZ@dS )MzPyTorch CLIP model.    )	dataclass)AnyCallableOptionalUnionN)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging	torch_int   )
CLIPConfigCLIPTextConfigCLIPVisionConfiglogitsreturnc                     t           j                            | t          j        t          |           | j                            S )Ndevice)r   
functionalcross_entropytorcharangelenr!   )r   s    z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/clip/modeling_clip.pycontrastive_lossr(   %   s3    =&&vu|CKKPVP]/^/^/^___    
similarityc                 r    t          |           }t          |                                           }||z   dz  S )Ng       @)r(   t)r*   caption_loss
image_losss      r'   	clip_lossr/   )   s4    #J//L!*,,..11J:%,,r)   tensorc                     t          j        | d          }t          j        |dd          }t          j        |d          }|S )z
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
       T)dimkeepdim      ?)r$   powsum)r0   square_tensor
sum_tensornormed_tensors       r'   _get_vector_normr<   /   sB    
 Ifa((M=b$???JIj#..Mr)   z}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )CLIPVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r@   r   r$   FloatTensor__annotations__rA   rB   tuplerC    r)   r'   r?   r?   :   s          
 15L(5,-44459x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>>>r)   r?   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej        df                  ed<   dZeeej        df                  ed<   dS )CLIPTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsrA   .rB   rC   )rD   rE   rF   rG   rN   r   r$   rH   rI   rA   rB   rJ   rC   rK   r)   r'   rM   rM   L   s          
 04K%+,33359x 12999=AM8E%"3S"89:AAA:>Ju0#567>>>>>r)   rM   c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeej                 ed<   dZeed<   dZeed	<   d
ee         fdZdS )
CLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_textrN   r@   text_model_outputvision_model_outputr   c                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))rT   rU   N)getattrto_tuple).0kselfs     r'   	<genexpr>z&CLIPOutput.to_tuple.<locals>.<genexpr>}   sc       
 
  LLLDGGRYZ^`aRbRbRkRkRmRm
 
 
 
 
 
r)   )rJ   keysr\   s   `r'   rY   zCLIPOutput.to_tuple|   sC     
 
 
 
YY[[
 
 
 
 
 	
r)   )rD   rE   rF   rG   rQ   r   r$   rH   rI   rR   rS   rN   r@   rT   r   rU   rJ   r   rY   rK   r)   r'   rP   rP   ^   s          & )-D(5$
%,,,48hu0188837OXe/0777/3K%+,33304L(5,-4444818886:3:::
%* 
 
 
 
 
 
r)   rP   c                   v     e Zd Zdef fdZdej        dededej        fdZdd	ej	        dej        fd
Z
 xZS )CLIPVisionEmbeddingsconfigc                 z   t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        | j                            | _        t          j        |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )NF)in_channelsout_channelskernel_sizestridebiasr2   r   position_idsr   r3   
persistent)super__init__rb   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr$   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr%   expandr\   rb   	__class__s     r'   rn   zCLIPVisionEmbeddings.__init__   s   + + +!|EK,G,GHH!y+? 
  
  
 !Ot>1D!-1"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr)   
embeddingsheightwidthr   c                    |j         d         dz
  }| j        j                            d          }|j         d         dz
  }t          j                                        s&||k    r ||k    r|                     | j                  S |ddddf         }|ddddf         }|j         d         }	|| j        z  }
|| j        z  }t          |dz            }|
                    d|||	          }|                    dddd          }t          j                            ||
|fdd	
          }|                    dddd                              dd|	          }t	          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr3   r6   r	   r2   bicubicF)sizemodealign_cornersr4   )shaper|   weight	unsqueezer$   jit
is_tracingri   rr   r   reshapepermuter   r"   interpolateviewcat)r\   r   r   r   ry   r|   rz   class_pos_embedpatch_pos_embedr4   
new_height	new_widthsqrt_num_positionss                r'   interpolate_pos_encodingz-CLIPVisionEmbeddings.interpolate_pos_encoding   s    !&q)A-!4;EEaHH*03a7 y##%% 	>+*F*F6UZ??**4+<===,QQQU3,QQQU3r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr)   Fpixel_valuesc                 <   |j         \  }}}}|s<|| j        k    s|| j        k    r&t          d| d| d| j         d| j         d	          | j        j        j        }|                     |                    |                    }|                    d                              dd          }| j	        
                    |dd          }	t          j        |	|gd	          }
|r|
|                     |
||          z   }
n|
|                     | j                  z   }
|
S )
NzInput image size (*z) doesn't match model ().)dtyper2   r   r3   r   )r   rq   
ValueErrorrx   r   r   toflatten	transposeru   r~   r$   r   r   r|   ri   )r\   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r'   forwardzCLIPVisionEmbeddings.forward   sD   '3'9$
Avu' 	Vt-F-F%SWSbJbJbqVqqeqqDOqq^b^mqqq   +28++LOO,O,O,OPP#++A..88A>>+22:q"EEYl;CCC
# 	Q#d&C&CJPVX]&^&^^JJ#d&=&=d>O&P&PPJr)   F)rD   rE   rF   r   rn   r$   Tensorintr   rH   r   __classcell__r   s   @r'   ra   ra      s        q/ q q q q q q,'D5< 'D 'DUX 'D]b]i 'D 'D 'D 'DR E$5 Z_Zf        r)   ra   c            	            e Zd Zdef fdZ	 	 	 d	deej                 deej                 deej                 dej	        fdZ
 xZS )
CLIPTextEmbeddingsrb   c                 V   t                                                       |j        }t          j        |j        |          | _        t          j        |j        |          | _        | 	                    dt          j        |j                                      d          d           d S )Nri   rj   Frk   )rm   rn   ro   r   r{   
vocab_sizetoken_embeddingmax_position_embeddingsr|   r}   r$   r%   r~   r\   rb   rp   r   s      r'   rn   zCLIPTextEmbeddings.__init__   s    &	!|F,=yII"$,v/My"Y"Y 	EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
r)   N	input_idsri   inputs_embedsr   c                 .   ||j         d         n|j         d         }| j        j        j         d         }||k    rt          d| d|           || j        d d d |f         }||                     |          }|                     |          }||z   }|S )Nr3   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   r|   r   r   ri   r   )r\   r   ri   r   
seq_lengthmax_position_embeddingposition_embeddingsr   s           r'   r   zCLIPTextEmbeddings.forward   s     -6,AY_R((}GZ[]G^
!%!8!?!Ea!H...VV V=SV V  
 ,QQQ^<L  00;;M"55lCC"%88
r)   )NNN)rD   rE   rF   r   rn   r   r$   
LongTensorrH   r   r   r   r   s   @r'   r   r      s        

~ 

 

 

 

 

 

 153759	 E,- u/0   12	
 
       r)   r           Tmodulequerykeyvalueattention_maskscalingdropoutoutput_attentionsc                    t          j        ||                    dd                    |z  }	||	|z   }	t          j                            |	dt           j                                      |j                  }	t          j        	                    |	|| j
                  }	t          j        |	|          }
|
                    dd                                          }
|sd }	|
|	fS )Nr3   r   )r4   r   )ptrainingr   r2   )r$   matmulr   r   r"   softmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              r'   eager_attention_forwardr      s     <s}}R'<'<==GL!#n4=((2U](SSVVW\WbccL=((6?([[L,|U33K''1--88::K $$r)   c                        e Zd ZdZdeeef         f fdZ	 	 	 ddej	        de
ej	                 de
ej	                 d	e
e         d
eej	        e
ej	                 f         f
dZ xZS )CLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrb   c                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)rm   rn   rb   ro   rp   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r'   rn   zCLIPAttention.__init__  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr)   NFrB   r   causal_attention_maskr   r   c                 R   |j         \  }}}|                     |          }|                     |          }	|                     |          }
|                    ||d| j                                      dd          }|	                    ||d| j                                      dd          }	|
                    ||d| j                                      dd          }
| j        j        dk    r
|du| _	        n||||z   }n||}t          }| j        j        dk    rt          | j        j                 } || ||	|
|| j	        | j        | j        sdn| j        |	  	        \  }}|                    |||                                          }|                     |          }|sd}||fS )	z#Input shape: Batch x Time x Channelr3   r   r2   flash_attention_2Neagerr   )r   r   r   r   )r   r   r   r   r   r   r   rb   _attn_implementationr   r   r   r   r   r   r   r   r   )r\   rB   r   r   r   r   r   rp   queriesr^   valuesattention_interfacer   r   s                 r'   r   zCLIPAttention.forward-  s    -:,?)
J	++m,,{{=))]++,,z:r4=IISSTUWXYYyyZT]CCMMaQRSSZRGGQQRSUVWW ;+/BBB2$>DNN).C.O!/2G!G&2!6(?;+w66"9$+:Z"[$7$7nJ#}>CC$,/
%
 
%
 
%
!\ "))*j)LLWWYYmmK00  	 LL((r)   )NNF)rD   rE   rF   rG   r   r   r   rn   r$   r   r   boolrJ   r   r   r   s   @r'   r   r     s        GGBu%5~%EF B B B B B B. 268<,11) 1)|1) !.1)  (5	1)
 $D>1) 
u|Xel33	41) 1) 1) 1) 1) 1) 1) 1)r)   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )CLIPMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S N)rm   rn   rb   r
   
hidden_actactivation_fnr   r   ro   intermediate_sizefc1fc2r   s     r'   rn   zCLIPMLP.__init__b  sf    #F$569V/1IJJ9V5v7IJJr)   rB   r   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   )r\   rB   s     r'   r   zCLIPMLP.forwardi  s=    //**=99//r)   )rD   rE   rF   rn   r$   r   r   r   r   s   @r'   r   r   a  sc        K K K K KU\ el        r)   r   c                        e Zd Zdeeef         f fdZ	 d
dej        dej        dej        de	e
         deej                 f
d	Z xZS )CLIPEncoderLayerrb   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S N)eps)rm   rn   ro   rp   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r'   rn   zCLIPEncoderLayer.__init__q  s}    +&v..<F<QRRR6??<F<QRRRr)   FrB   r   r   r   r   c                     |}|                      |          }|                     ||||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rB   r   r   r   )r   r   r   r   )r\   rB   r   r   r   residualr   outputss           r'   r   zCLIPEncoderLayer.forwardy  s    " !((77&*nn')"7/	 '5 '
 '
#| !=0 ((77// =0 " 	'&Gr)   r   )rD   rE   rF   r   r   r   rn   r$   r   r   r   rJ   rH   r   r   r   s   @r'   r   r   p  s        Su%5~%EF S S S S S S -2& &|& &  %|	&
 $D>& 
u 	!& & & & & & & &r)   r   c                   8    e Zd ZU eed<   dZdZdZdZdZ	dZ
d ZdS )CLIPPreTrainedModelrb   clipTc                 d
   | j         j        }t          |t                    rT|j        j        j                            d|dz             |j        j        j                            d|dz             n t          |t                    r| j         j        }t          j                            |j        d|j        dz  |z             t          j                            |j        j        |j         j        |z             t          j                            |j        j        |j         j        |z             n[t          |t                     r| j         j        }|j        dz  d|j         j        z  dz  z  |z  }|j        dz  |z  }t          j                            |j        j        |           t          j                            |j        j        |           t          j                            |j        j        |           t          j                            |j        j        |           n_t          |t,                    r| j         j        }|j         j        dz  d|j         j        z  dz  z  |z  }d|j         j        z  dz  |z  }t          j                            |j        j        |           t          j                            |j        j        |           nt          |t4                    rt          j                            |j        j        |j        dz  | j         j        z             t          j                            |j        j        |j        dz  | j         j        z             nt          |t>                    rFt          j                            |j        j        | j         j        dz  | j         j        z             nt          |t@                    rFt          j                            |j        j        | j         j        dz  | j         j        z             n_t          |tB                    rJt          j                            |j"        j        | j         j#        j        dz  | j         j        z             t          |t          j$                  r=|j%        j        &                                 |j        j        '                    d           t          |t          j(                  r'|j%        "|j%        j        &                                 dS dS dS )	zInitialize the weightsr   g{Gz?)meanstdr   )r  r2   g      ?N))rb   initializer_factor
isinstancer   r   r   datanormal_r|   ra   r   initru   rp   rx   initializer_ranger   num_hidden_layersr   r   r   r   r   ro   r   r   	CLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimCLIPVisionModelWithProjectionCLIPTextModelWithProjectionCLIPForImageClassification
classifiervision_configr   rh   zero_fill_r   )r\   r   factorin_proj_stdout_proj_stdfc_stds         r'   _init_weightsz!CLIPPreTrainedModel._init_weights  s   /f011 -	").66CVd]6SSS%,199sQU9VVVV 455 *	[3FGOOF2&BRTXBX[aBaObbbGOOF29v}?^ag?gOhhhGOOF5<&-BadjBjOkkkk.. %	[3F!+T1q6=;Z7Z_c6cdgmmK",d2f<LGOOFM0kOBBBGOOFM0kOBBBGOOFM0kOBBBGOOFO2OEEEE(( 	[3F!=4d:FMDc@chl?lmpvvK&-33<vEFGOOFJ-6O:::GOOFJ-;O????	** 	GOO&-)4/$+2PP     GOO(/+T1DK4RR       =>> 	GOO(/K+T1DK4RR       ;<< 		GOO&-K+T1DK4RR       :;; 	GOO!(K-94?$+B``    
 fbl++ 	*K""$$$M$$S)))fbi(( 	%V[-DK""$$$$$	% 	%-D-Dr)   N)rD   rE   rF   r   rI   base_model_prefixsupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr  rK   r)   r'   r  r    sT         &*#N"&6% 6% 6% 6% 6%r)   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddeej                 deej                 dee	         dee	         d	e
f
d
Z xZS )CLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    rb   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rK   )r   )rZ   r   rb   s     r'   
<listcomp>z(CLIPEncoder.__init__.<locals>.<listcomp>  s"    $g$g$g!%5f%=%=$g$g$gr)   F)	rm   rn   rb   r   
ModuleListranger  layersgradient_checkpointingr   s    `r'   rn   zCLIPEncoder.__init__  s`    m$g$g$g$guVMeGfGf$g$g$ghh&+###r)   Nr   r   r   output_hidden_statesr   c                     ||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}t          | j                  D ]2\  }	}
|r||fz   } |
||||          }|d         }|r||d         fz   }3|r||fz   }t          |||          S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrK   )r   r   r   )rA   rB   rC   )rb   r   r.  	enumerater,  r   )r\   r   r   r   r   r.  encoder_statesall_attentionsrB   idxencoder_layerlayer_outputss               r'   r   zCLIPEncoder.forward  s   J 2C1N--TXT_Tq$8$D  $+Jj 	  4=0:d%"+DK"8"8 	F 	FC# C!/=2B!B)M%"3	  M *!,M  F!/=3C2E!E 	?+}.>>N+(%
 
 
 	
r)   NNNN)rD   rE   rF   rG   r   rn   r   r$   r   r   r   r   r   r   s   @r'   r&  r&    s         ,z , , , , , , 268<,0/3D
 D
 !.D
  (5	D

 $D>D
 'tnD
 
D
 D
 D
 D
 D
 D
 D
 D
r)   r&  c                        e Zd Zdef fdZe	 	 	 	 	 ddeej                 deej                 deej                 dee	         dee	         d	e
fd
            Z xZS )CLIPTextTransformerrb   c                    t                                                       || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        |j        | _        d S r   )rm   rn   rb   ro   r   r   r&  encoderr   r   r   final_layer_normeos_token_idr   s      r'   rn   zCLIPTextTransformer.__init__<  ss    &	,V44"6** "YF<Q R R R #/r)   Nr   r   ri   r   r.  r   c                    ||n| j         j        }||n| j         j        }|t          d          |                                }|                    d|d                   }|                     ||          }t          ||j        |j	                  }|%| j         j
        dk    rt          ||j                  }|                     |||||          }	|	j        }
|                     |
          }
| j        dk    rg|
t!          j        |
j        d         |
j	                  |                    t           j        |
j	        	                              d
          f         }n|
t!          j        |
j        d         |
j	                  |                    t           j        |
j	        	          | j        k                                                        d
          f         }t-          |
||	j        |	j                  S )NzYou have to specify input_idsr3   )r   ri   r    r   )r   r   r   r   r.  r2   r   )r   r!   r   rA   pooler_outputrB   rC   )rb   r   r.  r   r   r   r   r   r   r!   r   r   r:  rA   r;  r<  r$   r%   r   r   r   argmaxr   rB   rC   )r\   r   r   ri   r   r.  input_shaperB   r   encoder_outputsrA   pooled_outputs               r'   r   zCLIPTextTransformer.forwardG  s    2C1N--TXT_Tq$8$D  $+Jj 	 <===nn&&NN2{277	),WW !A,]5I!
 !
 !

 %$+*JNa*a*a7H[\\N+/<<')"7/!5 ,8 ,
 ,
 ,= 112CDD!! ..4Q7@Q@XYYY595F5MNNUUZ\U]]_MM ..4Q7@Q@XYYY EI6G6NOOSWSddB!M */')7&1	
 
 
 	
r)   NNNNN)rD   rE   rF   r   rn   r   r   r$   r   r   r   r   r   r   s   @r'   r8  r8  ;  s        	0~ 	0 	0 	0 	0 	0 	0  -115/3,0/3F
 F
EL)F
 !.F
 u|,	F

 $D>F
 'tnF
 
$F
 F
 F
 ^F
 F
 F
 F
 F
r)   r8  zI
    The text model from CLIP without any head or projection on top.
    c                        e Zd ZU eed<   ddgZdZdef fdZdej	        fdZ
d Zee	 	 	 	 	 dd
eej                 deej                 deej                 dee         dee         defd                        Z xZS )CLIPTextModelrb   r   r   Fc                     t                                          |           t          |          | _        |                                  d S r   )rm   rn   r8  
text_model	post_initr   s     r'   rn   zCLIPTextModel.__init__  s@       -f55r)   r   c                 $    | j         j        j        S r   rH  r   r   r_   s    r'   get_input_embeddingsz"CLIPTextModel.get_input_embeddings      )99r)   c                 (    || j         j        _        d S r   rK  r\   r   s     r'   set_input_embeddingsz"CLIPTextModel.set_input_embeddings      5:"222r)   Nr   r   ri   r   r.  c                 6    |                      |||||          S )a9  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   ri   r   r.  )rH  )r\   r   r   ri   r   r.  s         r'   r   zCLIPTextModel.forward  s.    4 )%/!5  
 
 	
r)   rD  )rD   rE   rF   r   rI   _no_split_modulesr"  rn   r   ModulerL  rP  r   r   r   r$   r   r   r   r   r   r   s   @r'   rF  rF    s*         -/AB ~      :bi : : : :; ; ;  -115/3,0/3
 
EL)
 !.
 u|,	

 $D>
 'tn
 
$
 
 
 ^ 
 
 
 
 
r)   rF  c                        e Zd Zdef fdZe	 	 	 	 ddeej                 dee	         dee	         dee	         d	e
f
d
            Z xZS )CLIPVisionTransformerrb   c                 4   t                                                       || _        |j        }t	          |          | _        t          j        ||j                  | _	        t          |          | _        t          j        ||j                  | _        d S r   )rm   rn   rb   ro   ra   r   r   r   r   pre_layrnormr&  r:  post_layernormr   s      r'   rn   zCLIPVisionTransformer.__init__  s    &	.v66L8MNNN"6** l9&:OPPPr)   NFr   r   r.  r   r   c                 |   ||n| j         j        }||n| j         j        }|t          d          |                     ||          }|                     |          }|                     |||          }|j        }|d d dd d f         }|                     |          }t          |||j
        |j                  S )Nz You have to specify pixel_values)r   )r   r   r.  r   r>  )rb   r   r.  r   r   rY  r:  rA   rZ  r   rB   rC   )	r\   r   r   r.  r   rB   rB  rA   rC  s	            r'   r   zCLIPVisionTransformer.forward  s     2C1N--TXT_Tq$8$D  $+Jj 	 ?@@@Oghh))-88+/<<'/!5 ,8 ,
 ,
 ,=)!!!Q'2++M::)/')7&1	
 
 
 	
r)   NNNF)rD   rE   rF   r   rn   r   r   r$   rH   r   r   r   r   r   s   @r'   rW  rW    s        Q/ Q Q Q Q Q Q  59,0/338!
 !
u01!
 $D>!
 'tn	!

 #+4.!
 
$!
 !
 !
 ^!
 !
 !
 !
 !
r)   rW  zK
    The vision model from CLIP without any head or projection on top.
    c                        e Zd ZU eed<   dZdgZdef fdZdej	        fdZ
ee	 	 	 	 ddeej                 d	ee         d
ee         dedef
d                        Z xZS )CLIPVisionModelrb   r   r   c                     t                                          |           t          |          | _        |                                  d S r   )rm   rn   rW  vision_modelrI  r   s     r'   rn   zCLIPVisionModel.__init__  sA       1&99r)   r   c                 $    | j         j        j        S r   r`  r   rx   r_   s    r'   rL  z$CLIPVisionModel.get_input_embeddings       +;;r)   NFr   r.  r   c                 4    |                      ||||          S )a  
        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r.  r   )r`  )r\   r   r   r.  r   s        r'   r   zCLIPVisionModel.forward  s-    <   %/!5%=	 ! 
 
 	
r)   r\  )rD   rE   rF   r   rI   main_input_namerT  rn   r   rU  rL  r   r   r   r$   rH   r   r   r   r   r   s   @r'   r^  r^    s          $O+,/      <bi < < < <  59,0/3).!
 !
u01!
 $D>!
 'tn	!

 #'!
 
$!
 !
 !
 ^ !
 !
 !
 !
 !
r)   r^  c                        e Zd ZU eed<   g dZdZdef fdZ e            e		 	 dde
j        dee
j                 dee
j                 d	e
j        fd
                        Z e            e		 dde
j        ded	e
j        fd                        Zee		 	 	 	 	 	 	 	 ddee
j                 dee
j                 dee
j                 dee
j                 dee         dee         dee         ded	efd                        Z xZS )r  rb   )r   r   ra   Fc                 l   t                                          |           t          |j        t                    s%t          dt          |j                   d          t          |j        t                    s%t          dt          |j                   d          |j        }|j        }|j	        | _	        |j
        | _        |j
        | _        t                              |          }|j        | _        t                               |          }|j        | _        t%          j        | j        | j	        d          | _        t%          j        | j        | j	        d          | _        t%          j        t/          j        | j        j                            | _        |                                  d S )NzKconfig.text_config is expected to be of type CLIPTextConfig but is of type .zOconfig.vision_config is expected to be of type CLIPVisionConfig but is of type Frh   )rm   rn   r  text_configr   	TypeErrortyper  r   projection_dimro   r  r  rF  _from_configrH  r^  r`  r   r   r  r  rs   r$   r0   rb   logit_scale_init_valuelogit_scalerI  )r\   rb   rk  r  rH  r`  r   s         r'   rn   zCLIPModel.__init__:  s      &,n== 	0+,,0 0 0  
 &.0@AA 	2-..2 2 2  
 (,$3)5 - 9"//<<
$/&33MBB(5!#4+@$BU\a!b!b!b!y)<d>QX]^^^<T[5W(X(XYY 	r)   Nr   r   ri   r   c                 n    |                      |||          }|j        }|                     |          }|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)r   r   ri   )rH  r?  r  )r\   r   r   ri   text_outputsrC  text_featuress          r'   get_text_featureszCLIPModel.get_text_features]  sI    6 48??)% 4C 4
 4

 %2,,];;r)   r   r   c                 l    |                      ||          }|j        }|                     |          }|S )ai  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```)r   r   )r`  r?  r  )r\   r   r   vision_outputsrC  image_featuress         r'   get_image_featureszCLIPModel.get_image_features  sH    < 6:5F5F%%= 6G 6
 6
 '4//>>r)   return_lossr   r.  c	           	         ||n| j         j        }||n| j         j        }|                     ||||          }	|                     |||||          }
|	j        }|                     |          }|
j        }|                     |          }|t          |          z  }|t          |          z  }t          j
        ||                                                    |j                            }|| j                                                            |j                  z  }|                                }d}|rt!          |          }t#          ||||||
|	          S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nre  rS  )rQ   rR   rS   rN   r@   rT   rU   )rb   r   r.  r`  rH  r?  r  r  r<   r$   r   r,   r   r!   rq  expr/   rP   )r\   r   r   r   ri   rz  r   r.  r   rw  rs  r@   rN   rS   rR   rQ   s                   r'   r   zCLIPModel.forward  s   P 2C1N--TXT_Tq$8$D  $+Jj 	 6:5F5F%/!5%=	 6G 6
 6
 48??)%/!5 4C 4
 4
 &3--l;;"0**;77 $&6|&D&DD!$4[$A$AA  ,{LNN4D4D4G4GHZ4[4[\\)D,<,@,@,B,B,E,EkFX,Y,YY*,,.. 	._--D-+#%* .
 
 
 	
r)   )NNr   )NNNNNNNF)rD   rE   rF   r   rI   rT  r"  rn   r   r   r$   r   r   rH   ru  r   ry  r   r   rP   r   r   r   s   @r'   r  r  4  s         ZZZ !z ! ! ! ! ! !F %$&& 26/3	! !<! !.! u|,	!
 
	! ! ! ^ '&!F %$&& */# #'# #'# 
		# # # ^ '&#J  15481537&*,0/3).V
 V
E,-V
 u01V
 !.	V

 u/0V
 d^V
 $D>V
 'tnV
 #'V
 
V
 V
 V
 ^ V
 V
 V
 V
 V
r)   r  c                        e Zd ZU eed<   dZddgZdef fdZdej	        fdZ
d Zee	 	 	 	 	 dd
eej                 deej                 deej                 dee         dee         defd                        Z xZS )r  rb   Fr   r   c                 
   t                                          |           t                              |          }|j        | _        t          j        |j        |j        d          | _	        | 
                                 d S NFrj  )rm   rn   rF  ro  rH  r   r   ro   rn  r  rI  )r\   rb   rH  r   s      r'   rn   z$CLIPTextModelWithProjection.__init__  sp       "//77
$/!y);V=RY^___ 	r)   r   c                 $    | j         j        j        S r   rK  r_   s    r'   rL  z0CLIPTextModelWithProjection.get_input_embeddings  rM  r)   c                 (    || j         j        _        d S r   rK  rO  s     r'   rP  z0CLIPTextModelWithProjection.set_input_embeddings  rQ  r)   Nr   r   ri   r   r.  c                     |                      |||||          }|j        }|                     |          }t          ||j        |j        |j                  S )a@  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```rS  )rN   rA   rB   rC   )rH  r?  r  rM   rA   rB   rC   )	r\   r   r   ri   r   r.  rs  rC  rN   s	            r'   r   z#CLIPTextModelWithProjection.forward  su    6 48??)%/!5 4C 4
 4
 %2**=99"#*<&4#.	
 
 
 	
r)   rD  )rD   rE   rF   r   rI   r"  rT  rn   r   rU  rL  rP  r   r   r   r$   r   r   rM   r   r   r   s   @r'   r  r    s(         -/AB	~ 	 	 	 	 	 	:bi : : : :; ; ;  -115/3,0/3(
 (
EL)(
 !.(
 u|,	(

 $D>(
 'tn(
 
(
 (
 (
 ^ (
 (
 (
 (
 (
r)   r  c                        e Zd ZU eed<   dZdef fdZdej        fdZ	e
e	 	 	 	 ddeej                 dee         d	ee         d
edef
d                        Z xZS )r  rb   r   c                 
   t                                          |           t                              |          }|j        | _        t          j        |j        |j        d          | _	        | 
                                 d S r  )rm   rn   r^  ro  r`  r   r   ro   rn  r  rI  r\   rb   r`  r   s      r'   rn   z&CLIPVisionModelWithProjection.__init__N  sq       &33F;;(5!#6+=v?T[`!a!a!a 	r)   r   c                 $    | j         j        j        S r   rb  r_   s    r'   rL  z2CLIPVisionModelWithProjection.get_input_embeddingsY  rc  r)   NFr   r.  r   c                     |                      ||||          }|j        }|                     |          }t          ||j        |j        |j                  S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
        >>> from transformers.image_utils import load_image

        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```re  )r@   rA   rB   rC   )r`  r?  r  r?   rA   rB   rC   )r\   r   r   r.  r   rw  rC  r@   s           r'   r   z%CLIPVisionModelWithProjection.forward\  st    < 6:5F5F%/!5%=	 6G 6
 6
 '4--m<<$%,>(6%0	
 
 
 	
r)   r\  )rD   rE   rF   r   rI   rf  rn   r   rU  rL  r   r   r   r$   rH   r   r?   r   r   r   s   @r'   r  r  I  s         $O	/ 	 	 	 	 	 	<bi < < < <  59,0/3).*
 *
u01*
 $D>*
 'tn	*

 #'*
 
*
 *
 *
 ^ *
 *
 *
 *
 *
r)   r  z
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                        e Zd ZdZdeddf fdZee	 	 	 	 d
dee	j
                 dee	j
                 dee         dee         def
d	                        Z xZS )r  r   rb   r   Nc                 n   t                                          |           |j        | _        t                              |j                  }|j        | _        |j        dk    r$t          j        |j        j	        |j                  nt          j
                    | _        |                                  d S )Nr   )rm   rn   
num_labelsr^  ro  r  r`  r   r   ro   Identityr  rI  r  s      r'   rn   z#CLIPForImageClassification.__init__  s        +&33F4HII(5 OUN_bcNcNcBIf*68IJJJikitiviv 	
 	r)   labelsr   r.  c                 n   ||n| j         j        }||n| j         j        }|                     |||          }|j        }t          j        |ddddddf         d          }|                     |          }d}||                     ||| j                   }t          |||j
        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r.  r   r   )rQ   r   rB   rC   )rb   r   r.  r`  rA   r$   r  r  loss_functionr   rB   rC   )	r\   r   r  r   r.  r   sequence_outputr   rQ   s	            r'   r   z"CLIPForImageClassification.forward  s     2C1N--TXT_Tq$8$D  $+Jj 	 /3.?.?/!5 /@ /
 /
 "3  *_QQQAAAX%>AFFF11%%ffdkBBD$!/)	
 
 
 	
r)   r6  )rD   rE   rF   rf  r   rn   r   r   r   r$   r   r   r   r   r   r   s   @r'   r  r    s         %Oz d        04)-,0/3(
 (
u|,(
 &(
 $D>	(

 'tn(
 
(
 (
 (
 ^ (
 (
 (
 (
 (
r)   r  )r  r  rF  r  r^  r  r  )r   T)ArG   dataclassesr   typingr   r   r   r   r$   r   activationsr
   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   utilsr   r   r   r   r   r   configuration_clipr   r   r   
get_loggerrD   loggerr   r(   r/   r<   r?   rM   rP   rU  ra   r   floatr   r   r   r   r   r  r&  r8  rF  rW  r^  r  r  r  r  __all__rK   r)   r'   <module>r     s     ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1        ! ! ! ! ! ! d d d d d d d d 9 9 9 9 9 9 b b b b b b b b b b F F F F F F F F w w w w w w w w w w w w w w w w L L L L L L L L L L 
	H	%	%
`U\ `el ` ` ` `-%, -5< - - - -U\ el       
	? 	? 	? 	? 	?K 	? 	?  	?   
	? 	? 	? 	? 	?+ 	? 	?  	?  
  
  
  
  
  
  
   
FP P P P P29 P P Pf% % % % % % % %^ "% %I%<% 
% <	%
 U\*% % % % % % %0H) H) H) H) H)BI H) H) H)V    bi   / / / / /1 / / /d ?% ?% ?% ?% ?%/ ?% ?% ?%DS
 S
 S
 S
 S
") S
 S
 S
lS
 S
 S
 S
 S
") S
 S
 S
l   
2
 2
 2
 2
 2
' 2
 2
 
2
j-
 -
 -
 -
 -
BI -
 -
 -
`   
1
 1
 1
 1
 1
) 1
 1
 
1
h L
 L
 L
 L
 L
# L
 L
 L
^ A
 A
 A
 A
 A
"5 A
 A
 A
H >
 >
 >
 >
 >
$7 >
 >
 >
B   <
 <
 <
 <
 <
!4 <
 <
 <
~  r)   