
     `i_                   |   d Z ddlmZ ddlZddlZddlmZ ddlm	Z	 ddl
ZddlZddlmZ ddlmZmZ dd	lmZmZmZmZmZmZ dd
lmZmZmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z(  e#j)        e*          Z+ e"            rB	 ddl,Z-e-j.        /                    dd          Z0nM# e1$ r e+2                    d           Y n1w xY w	 ddl,Z-e-j.        /                    dd          Z0n# e1$ r Y nw xY wdZ3dZ4dddedZ5dfdZ6dgdZ7dhd Z8didjd'Z9dkdld,Z:dmd0Z;e G d1 d2e                      Z< G d3 d4ej=        j>                  Z? G d5 d6ej=        j>                  Z@ G d7 d8ej=        j>                  ZA G d9 d:ej=        j>                  ZB G d; d<ej=        j>                  ZC G d= d>ej=        j>                  ZD G d? d@ej=        j>                  ZE G dA dBej=        j>                  ZF G dC dDeF          ZG G dE dFej=        j>                  ZH G dG dHej=        j>                  ZI G dI dJej=        j>                  ZJ G dK dLej=        j>                  ZK G dM dNej=        j>                  ZL G dO dPej=        j>                  ZMe G dQ dRej=        j>                              ZNe G dS dTej=        j>                              ZOe G dU dVej=        j>                              ZP G dW dXe          ZQdYZRdZZSd[ZTd\ZU G d] d^eQ          ZV G d_ d`eQ          ZW e eR           G da dbeQ                      ZXg dcZYdS )nzTF 2.0 GroupViT model.    )annotationsN)	dataclass)Any   )get_tf_activation)TFBaseModelOutputTFBaseModelOutputWithPooling)TFModelInputTypeTFPreTrainedModelget_initializerkeraskeras_serializableunpack_inputs)check_embeddings_within_bounds
shape_liststable_softmax)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forward#is_tensorflow_probability_availableloggingreplace_return_docstrings   )GroupViTConfigGroupViTTextConfigGroupViTVisionConfig              ?)locscalea  GroupViT models are not usable since `tensorflow_probability` can't be loaded. It seems you have `tensorflow_probability` installed with the wrong tensorflow version.Please try to reinstall it following the instructions here: https://github.com/tensorflow/probability.znvidia/groupvit-gcc-yfccg    חmask	tf.Tensortgt_len
int | Nonec                    t          |           d         }||n|}t          j        d          }t          j        | |j                  } t          j        | ddddddf         dd|df          }||z
  t          z  S )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    r   Nr   dtype)r   tfconstantcastr'   tileLARGE_NEGATIVE)r!   r#   src_lenone_cstexpanded_masks        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/groupvit/modeling_tf_groupvit.py_expand_maskr1   R   s     q!G ,gg'Gk#G74w}---DGDD$!12Q7A4FGGMm#~55    logitsreturnc           	         t           j                            t          j                            t          j        t          |           d                   | d                    S )Nr   T)y_truey_predfrom_logits)r(   mathreduce_meanr   metricssparse_categorical_crossentropyranger   )r3   s    r0   contrastive_lossr>   a   sT    7558Jv..q1226t 	6 	
 	
  r2   
similarityc                r    t          |           }t          t          j        |                     }||z   dz  S )Ng       @)r>   r(   	transpose)r?   caption_loss
image_losss      r0   groupvit_lossrD   j   s6    #J//L!",z":":;;J:%,,r2   dimintc                >   t          | |          }t          j        ||          }t          j        |t	          |           |         t          t          t	          |                               |         |j                  }|t          j        |          z
  |z   }|S )Ndepthaxisr'   )	r   r(   argmaxone_hotr   r=   lenr'   stop_gradient)r3   rE   y_softindexy_hardrets         r0   hard_softmaxrS   p   s    FC((FIfc""EZ  % 3z&))**++C0l  F 2#F++
+f
4CJr2   Ftaufloathardboolc                   t           j                            dd          }|                    t	          j        |           | j                  }| |z   |z  }t          ||          }|rt	          j        ||          }t	          j	        |t          |           |         t          t          t          |                               |         |j                  }|t	          j        |          z
  |z   }	n|}	|	S )Nr   r   r&   rH   )tfpdistributionsGumbelsampler(   shaper'   r   rK   rL   r   r=   rM   rN   )
r3   rU   rW   rE   gumbel_distgumbelsrO   rP   rQ   rR   s
             r0   gumbel_softmaxra      s    #**344K  &!1!1 FFG3&GGS))F 	&#&&V$$S) s:f--..//4,
 
 
 r'///&8 Jr2   
attentionsheightwidthalign_cornersc                   ||z  | j         d         z  dz  }||k    r=t          t          j        ||z                      }t	          |           d         |z  }n<t          t          j        ||z                      }t	          |           d         |z  }t	          |           d         }t	          |           d         }t          j        | ||||f          } t          j        | d          } |r0t
          j        j	        j
                            | ||fd|          } n$t
          j
                            | ||fd	          } t          j        | d
          } | S )a  
    Args:
        attentions (`tf.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
        height (`int`): height of the output attention map
        width (`int`): width of the output attention map
        align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.

    Returns:
        `tf.Tensor`: resized attention map of shape [batch_size, groups, height, width]
       g      ?r   r   r   rg   r   r   permbilinear)sizemethodre   )rl   rm   )r   r   r   rg   )r^   rF   nproundr   r(   reshaperA   compatv1imageresize)	rb   rc   rd   re   r    
feat_widthfeat_height
batch_sizegroupss	            r0   resize_attention_mapry      sU    e^z/22s:E~~%%-0011
 ,,Q/:="(6E>2233
++A.+=
J''*J
##A&FJV[*(UVVJj|<<<J ZY\'..%'	 / 
 


 X__Zvuoj_YY
j|<<<Jr2   tuple[tf.Tensor]hw_shape
tuple[int]c                   g }d}| D ]h}t          j        |d          }||}nt          j        ||          }t          t          j        |d          g|R  }|                    |           i|d         }t          j        |          S )a(  
    Args:
        attentions (`tuple(tf.Tensor)`: tuple of attention maps returned by `TFGroupViTVisionTransformer`
        hw_shape (`tuple(int)`): height and width of the output attention map
    Returns:
        `tf.Tensor`: the attention map of shape [batch_size, groups, height, width]
    Nr   rg   r   ri   rT   )r(   rA   matmulry   appendrN   )rb   r{   	attn_mapsprev_attn_masks
attn_maskscur_attn_mapfinal_groupings          r0   get_grouping_from_attentionsr      s     IO  	' 	'
\*9===
"(OO iDDO+BLy,Y,Y,Ye\deee&&&& r]NN+++r2   c                      e Zd ZU dZdZded<   dZded<   dZded<   dZded<   dZ	ded<   dZ
ded	<   dZd
ed<   dZd
ed<   ddZdS )TFGroupViTModelOutputa8  
    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        segmentation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        text_embeds (`tf.Tensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`TFGroupViTTextModel`].
        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`TFGroupViTVisionModel`].
        text_model_output (`TFBaseModelOutputWithPooling`):
            The output of the [`TFGroupViTTextModel`].
        vision_model_output (`TFBaseModelOutputWithPooling`):
            The output of the [`TFGroupViTVisionModel`].
    Ntf.Tensor | Nonelosslogits_per_imagelogits_per_textsegmentation_logitstext_embedsimage_embedsr	   text_model_outputvision_model_outputr4   
tuple[Any]c                ^     t           fd                                 D                       S )Nc              3  t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))r   r   N)getattrto_tuple).0kselfs     r0   	<genexpr>z1TFGroupViTModelOutput.to_tuple.<locals>.<genexpr>	  sc       
 
  LLLDGGRYZ^`aRbRbRkRkRmRm
 
 
 
 
 
r2   )tuplekeysr   s   `r0   r   zTFGroupViTModelOutput.to_tuple  sC     
 
 
 
YY[[
 
 
 
 
 	
r2   )r4   r   )__name__
__module____qualname____doc__r   __annotations__r   r   r   r   r   r   r   r    r2   r0   r   r      s          B "D!!!!)-----(,O,,,,,00000$(K((((%)L))))6:::::8<<<<<
 
 
 
 
 
r2   r   c                  2     e Zd Zd fdZdddZddZ xZS )TFGroupViTCrossAttentionLayerconfigr   c                L    t                      j        di | t          |d          | _        t          j                            |j        d          | _        t          |d          | _
        t          j                            |j        d          | _        || _        d S )Nattnnamenorm2epsilonr   mlp	norm_postr   )super__init__TFGroupViTAttentionr   r   layersLayerNormalizationlayer_norm_epsr   TFGroupViTMLPr   r   r   r   r   kwargs	__class__s      r0   r   z&TFGroupViTCrossAttentionLayer.__init__  s    ""6"""'V<<<	\44V=RY`4aa
 e44488AV]h8iir2   Fqueryr"   keytrainingrX   r4   c                    |}||                      ||          d         z   }||                     |                     |                    z   }|                     |          }|S )N)encoder_hidden_statesr   )r   r   r   r   )r   r   r   r   xs        r0   callz"TFGroupViTCrossAttentionLayer.call  s\    		%s	;;A>>A'''NN1r2   Nc                h   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j	        g           d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j
        j                  5  | j
                            d            d d d            n# 1 swxY w Y   t          | dd           `t          j        | j        j                  5  | j                            d d | j        j	        g           d d d            d S # 1 swxY w Y   d S d S )NTr   r   r   r   )builtr   r(   
name_scoper   r   buildr   r   hidden_sizer   r   r   input_shapes     r0   r   z#TFGroupViTCrossAttentionLayer.build  s   : 	F
4&&2ty~.. & &	%%%& & & & & & & & & & & & & & &4$''3tz// H H
  $dk.E!FGGGH H H H H H H H H H H H H H H4%%1tx}-- % %t$$$% % % % % % % % % % % % % % %4d++7t~233 L L$$dD$+2I%JKKKL L L L L L L L L L L L L L L L L L 87sH    A''A+.A+!(CCCD66D:=D:0(F%%F),F)r   r   F)r   r"   r   r"   r   rX   r4   r"   Nr   r   r   r   r   r   __classcell__r   s   @r0   r   r     sr                 L L L L L L L Lr2   r   c                  <     e Zd Zd fdZdddZdddZddZ xZS )TFGroupViTAssignAttentionr   r   c                    t                      j        di | |j        dz  | _        t          j                            |j        d          | _        t          j                            |j        d          | _        t          j                            |j        d          | _	        t          j                            |j        d          | _
        |j        | _        || _        d S )N      q_projr   k_projv_projprojr   )r   r   r   r    r   r   Denser   r   r   r   
assign_epsr   r   s      r0   r   z"TFGroupViTAssignAttention.__init__2  s    ""6"""'-
l(();((KKl(();((KKl(();((KKL&&v'9&GG	 +r2   TFr   r"   gumbelrX   rW   r   r4   c                ~    |r|rt          |d|          }n%|rt          |d          }nt          |d          }|S )N)rE   rW   )rE   rJ   )ra   rS   r   )r   r   r   rW   r   s        r0   get_attnz"TFGroupViTAssignAttention.get_attn=  sZ     	5h 	5!$BT:::DD 5#Db111%d444r2   r   r   c                   |}|                      |          }|                     |          }|                     |          }t          j        ||d          | j        z  }|                     ||          }|                     ||dd          }|t          j                            |dd          | j	        z   z  }t          j        ||          }| 
                    |          }||fS )NTtranspose_b)r   F)r   r   rW   rT   rJ   keepdims)r   r   r   r(   r   r    r   r9   
reduce_sumr   r   )	r   r   r   r   valueraw_attnr   	soft_attnouts	            r0   r   zTFGroupViTAssignAttention.callH  s    E"" kk# E"" 9UCT:::TZG}}X}99MM(XeRWMXX	rw))$R$)GG$/YZie$$iinnI~r2   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j	        j                  5  | j	                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j
        j                  5  | j
                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           `t          j        | j        j                  5  | j                            d d | j        j        g           d d d            d S # 1 swxY w Y   d S d S )NTr   r   r   r   )r   r   r(   r   r   r   r   r   r   r   r   r   r   s     r0   r   zTFGroupViTAssignAttention.builda  s   : 	F
44((4t{/00 I I!!4t{/F"GHHHI I I I I I I I I I I I I I I44((4t{/00 I I!!4t{/F"GHHHI I I I I I I I I I I I I I I44((4t{/00 I I!!4t{/F"GHHHI I I I I I I I I I I I I I I4&&2ty~.. G G	tT[-D EFFFG G G G G G G G G G G G G G G G G G 32sH    (A44A8;A8.(C""C&)C&(EEE
(F??GGr   )TTF)
r   r"   r   rX   rW   rX   r   rX   r4   r"   r   )r   r"   r   r"   r   rX   r   )r   r   r   r   r   r   r   r   r   s   @r0   r   r   1  s        	 	 	 	 	 		 	 	 	 	    2G G G G G G G Gr2   r   c                  :     e Zd Zd fdZdd
ZdddZddZ xZS )TFGroupViTTokenAssignr   r   num_group_tokenrF   num_output_groupc                    t                      j        di | || _        t          j                            j        d          | _        t          j	        t          j        j                  rj	        nj	        j	        f}fd|D             \  }}t          |||d          | _        t          j                            j        d          | _        t          j                            j        d          | _        t#          d          | _        t'          d	          | _        t          j                            j        d
          | _        t-          j        |j        d          | _        | _        d S )Nnorm_tokensr   c                >    g | ]}t          |j        z            S r   )rF   r   )r   r   r   s     r0   
<listcomp>z2TFGroupViTTokenAssign.__init__.<locals>.<listcomp>~  s)    #Z#Z#ZACF,>(>$?$?#Z#Z#Zr2   	mlp_interr   norm_post_tokensnorm_xpre_assign_attnassign
norm_new_xmlp_channelsr   )r   r   r   r   r   r   r   r   
isinstanceassign_mlp_ratiocollectionsabcIterableTFGroupViTMixerMLPr   r   r   r   r   r   r   r   r   r   r   r   )	r   r   r   r   r   r   
tokens_dimchannels_dimr   s	    `      r0   r   zTFGroupViTTokenAssign.__init__t  s   ""6""" 0 <::6CX_l:mm &1;?3KLLDF##)6+BC 	
 $[#Z#Z#ZIY#Z#Z#Z 
L+FOZQahsttt % ? ?H]dv ? w wl55f>SZb5cc<VJ[\\\/XFFF,99&BW^j9kk)F&f6H~
 
 
 r2   group_tokensr"   r4   c                Z    |                      |          }|                     |          }|S )z
        Args:
            group_tokens (tf.Tensor): group tokens, [batch_size, num_group_tokens, channels]

        Returns:
            projected_group_tokens (tf.Tensor): [batch_size, num_output_groups, channels]
        )r   r   )r   r   projected_group_tokenss      r0   project_group_tokenz)TFGroupViTTokenAssign.project_group_token  s1     "&!=!=!%!6!67M!N!N%%r2   Fimage_tokensr   rX   c                F   |                      |          }|                     |          }|                     |          }|                     ||          }|                     ||          \  }}||z  }||                     |                     |                    z   }||fS )z
        Args:
            image_tokens (`tf.Tensor`): image tokens, of shape [batch_size, input_length, channels]
            group_tokens (`tf.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
        )r   r   r  r   r   r   r   )r   r  r   r   r  new_image_tokens	attentions          r0   r   zTFGroupViTTokenAssign.call  s     ''55{{<00!%!9!9,!G!G!%!5!56Ll![![&*kk2H,&W&W#)22+d.?.?P`@a@a.b.bb**r2   Nc                   | j         rd S d| _         t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j	        j                  5  | j	                            d            d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j
        j                  5  | j
                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           ]t          j        | j        j                  5  | j                            d d | j        j        g           d d d            n# 1 swxY w Y   t          | d	d           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )
NTr   r   r   r   r   r   r   r   )r   r   r(   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s     r0   r   zTFGroupViTTokenAssign.build  s=   : 	F
4--9t/455 N N &&dDK4K'LMMMN N N N N N N N N N N N N N N4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4+T22>t49:: S S%++T49P,QRRRS S S S S S S S S S S S S S S44((4t{/00 I I!!4t{/F"GHHHI I I I I I I I I I I I I I I4*D11=t3899 1 1$**40001 1 1 1 1 1 1 1 1 1 1 1 1 1 144((4t{/00 ( (!!$'''( ( ( ( ( ( ( ( ( ( ( ( ( ( (4t,,8t344 M M%%tT4;3J&KLLLM M M M M M M M M M M M M M M4..:t0566 . .!''---. . . . . . . . . . . . . . . . . . ;:s    (A44A8;A8.CCC(EE
E=(F11F58F5+HHHI33I7:I7-(K!!K%(K%MM
M)r   r   r   rF   r   rF   )r   r"   r4   r"   r   )r  r"   r   r"   r   rX   r   )r   r   r   r   r  r   r   r   r   s   @r0   r   r   s  s~             0& & & &+ + + + +&. . . . . . . .r2   r   c                  8     e Zd ZdZd fdZ	 dddZddZ xZS )TFGroupViTPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    r   r   c                .    t                      j        d	i | |j        |j        }}|j        }|j        | _        t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _
        || _        || _        t          j                            | j        ||dddt          | j        j                  dd	  	        | _        d S )
Nr   r   validchannels_lastTzeros
projection)	filterskernel_sizestridespaddingdata_formatuse_biaskernel_initializerbias_initializerr   r   )r   r   
image_size
patch_sizenum_channelsr   r   r   r   r   num_patchesr   r   r   Conv2Dr   initializer_ranger  )r   r   r   r  r  r  r  r   s          r0   r   z"TFGroupViTPatchEmbeddings.__init__  s!   ""6"""!'!2F4EJ
*!-#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$&(,--$"'.t{/LMM$ . 

 

r2   Fpixel_valuesr"   interpolate_pos_encodingrX   r   r4   c                .   t          |          \  }}}}t          j                    r|| j        k    rt	          d          |sgt          j                    rT|| j        d         k    s|| j        d         k    r2t	          d| d| d| j        d          d| j        d          d	          t          j        |d	          }|                     |          }|| j        d         z  || j        d         z  z  }	t          j	        |||	| j
        f
          }
|
S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model ().rh   ri   tensorr^   )r   r(   executing_eagerlyr  
ValueErrorr  rA   r  r  rp   r   )r   r  r  r   rw   r  rc   rd   r  r  
embeddingss              r0   r   zTFGroupViTPatchEmbeddings.call  sJ    3=\2J2J/
L&%!! 	ld6G&G&Gw   )	$&&	 4?1---$/!:L1L1LwVwwewwDO\]L^wwaeapqraswww   |L|DDD__\22
  22vQRAS7ST Zz*kSWSc9deee
r2   Nc                    | j         rd S d| _         t          | dd           \t          j        | j        j                  5  | j                            d d d | j        g           d d d            d S # 1 swxY w Y   d S d S )NTr  )r   r   r(   r   r  r   r   r  r   s     r0   r   zTFGroupViTPatchEmbeddings.build  s    : 	F
4t,,8t344 M M%%tT49J&KLLLM M M M M M M M M M M M M M M M M M 98s    $A11A58A5r   r   FFr  r"   r  rX   r   rX   r4   r"   r   )r   r   r   r   r   r   r   r   r   s   @r0   r	  r	    s         
 
 
 
 
 
: af         DM M M M M M M Mr2   r	  c                  @     e Zd ZdZd fdZddZdd	Z	 dddZ xZS )TFGroupViTVisionEmbeddingsz7
    Construct the position and patch embeddings.

    r   r   c                     t                      j        di | t          |d          | _        t          j                            |j        d          | _        t          j                            |j	        d          | _
        || _        d S )Npatch_embeddingsr   dropout)rater   	layernormr   r   )r   r   r	  r.  r   r   Dropoutr/  r   r   r1  r   r   s      r0   r   z#TFGroupViTVisionEmbeddings.__init__   s    ""6""" 9&GY Z Z Z|++i+PP88AV]h8iir2   Nc                   | j         j        }|                     d|| j        j        fddd          | _        | j        rd S d| _        t          | dd           Pt          j	        | j         j
                  5  | j                             d            d d d            n# 1 swxY w Y   t          | dd           Pt          j	        | j        j
                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           `t          j	        | j        j
                  5  | j                            d d | j        j        g           d d d            d S # 1 swxY w Y   d S d S )	Nr   r  Tposition_embeddingsr^   initializer	trainabler   r.  r/  r1  )r.  r  
add_weightr   r   r4  r   r   r(   r   r   r   r/  r1  )r   r   r  s      r0   r   z TFGroupViTVisionEmbeddings.build(  sH   +7#'??k4;#:;&	 $3 $
 $
  : 	F
4+T22>t49:: 2 2%++D1112 2 2 2 2 2 2 2 2 2 2 2 2 2 24D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )4d++7t~233 L L$$dD$+2I%JKKKL L L L L L L L L L L L L L L L L L 87s6   7BB"%B"C??DD9(E..E25E2r4   r"   c                   t          |          \  }}}t          | j                  d         }||k    r||k    r| j        S | j        }|| j        j        z  }	|| j        j        z  }
t          j                            t	          j        |dt          t          j
        |                    t          t          j
        |                    |f          |	|
fd          }t	          j        |dd|f          }|S )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        r   r^   bicubic)imagesrl   rm   rT   r"  )r   r4  r   r  r(   rs   rt   rp   rF   r9   sqrt)r   r&  rc   rd   rw   r  rE   num_positionspatch_pos_embedh0w0s              r0   r  z3TFGroupViTVisionEmbeddings.interpolate_pos_encoding>  s     (2*'='=$
K"4#;<<Q?-''FeOO++2t{--dk,,(//:3ty/G/G+H+H#diXeNfNfJgJgil'm   b * 
 
 *OAr3<PPPr2   Fr  r  rX   r   c                    t          |          \  }}}}|                     ||          }|                     |          }|r||                     |||          z   }n
|| j        z   }|                     |          }|S )N)r  )r   r.  r1  r  r4  r/  )r   r  r  r   _rc   rd   r&  s           r0   r   zTFGroupViTVisionEmbeddings.callY  s     )661fe**<Rj*kk
^^J//
 $ 	?#d&C&CJPVX]&^&^^JJ#d&>>J\\*--
r2   r   r   )r4   r"   r)  r*  )	r   r   r   r   r   r   r  r   r   r   s   @r0   r,  r,    s         
     L L L L,   8 af        r2   r,  c                  >     e Zd Zd fdZdd fdZ	 	 	 dddZ xZS )TFGroupViTTextEmbeddingsr   r   c                `     t                      j        di | |j        | _        || _        d S )Nr   )r   r   r   	embed_dimr   r   s      r0   r   z!TFGroupViTTextEmbeddings.__init__m  s5    ""6"""+r2   Nr   tf.TensorShapec                F   t          j        d          5  |                     | j        j        | j        ft          | j        j        | j        j        z            dd          | _	        d d d            n# 1 swxY w Y   t          j        d          5  |                     | j        j
        | j        ft          | j        j        | j        j        z            dd          | _        d d d            n# 1 swxY w Y   t                                          |           d S )Ntoken_embeddingTweightr5  position_embeddingr&  )r(   r   r8  r   
vocab_sizerG  r   initializer_factorr  rK  max_position_embeddingsrL  r   r   )r   r   r   s     r0   r   zTFGroupViTTextEmbeddings.buildt  s   ],-- 	 	//{-t~>+DK,JT[Mj,jkk	 *  DK	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ]/00 	 	&*oo{:DNK+DK,JT[Mj,jkk!	 '6 ' 'D#	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	k"""""s%   AA66A:=A:AC55C9<C9	input_idsr   position_idsinputs_embedsr4   r"   c                   ||t          d          |5t          || j        j                   t	          j        | j        |          }t          |          dd         }|0t	          j        t	          j	        d|d                   d          }t	          j        | j
        |          }t	          j        ||d         ddf	          }||z   }|S )
z
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        Nz5You have to specify either input_ids or inputs_embedsparamsindicesrT   r   )startlimitr   r   )input	multiples)r%  r   r   rM  r(   gatherrK  r   expand_dimsr=   rL  r+   )r   rP  rQ  rR  r   position_embedsfinal_embeddingss          r0   r   zTFGroupViTTextEmbeddings.call  s     !6TUUU *9dk6LMMMIT[)LLLM //4>"(+b/*R*R*RYZ[[[L)4+BLYYY'KPQNTUWXCYZZZ(?:r2   r   r   r   )r   rH  NNN)rP  r   rQ  r   rR  r   r4   r"   )r   r   r   r   r   r   r   r   s   @r0   rE  rE  l  s             # # # # # # #* '+)-*.	                 r2   rE  c                  d     e Zd ZdZd fd	ZddZed             ZddZdddZ		 	 	 d d!dZ
 xZS )"TFGroupViTStagezMThis corresponds to the `GroupingLayer` class in the GroupViT implementation.r   r   rI   rF   num_prev_group_tokenr   r   c                    t                      j        d
i | | _        || _        || _        fdt          |          D             | _        |dk    rt          ||d          | _        nd | _        |dk    rO|dk    rIt          j        
                    j        d          t          |j        dz  |d	          g| _        d S d | _        d S )Nc                8    g | ]}t          d |           S z	layers_._r   TFGroupViTEncoderLayerr   ir   s     r0   r   z,TFGroupViTStage.__init__.<locals>.<listcomp>  s.    bbbPQ-f?q??KKKbbbr2   r   
downsample)r   r   r   r   zgroup_projector.0r   rg   zgroup_projector.1r   r   )r   r   r   rI   r   r=   r   r   rk  r   r   r   r   r   group_projector)r   r   rI   rc  r   r   r   r   s    `     r0   r   zTFGroupViTStage.__init__  s    	""6"""
.bbbbUZ[`UaUabbbQ3 /!1!	  DOO #DO!##!(;(;//8MTg/hh"0&2D2I?at  $D    $(D   r2   Nc                   | j         dk    r1|                     d| j         | j        j        fddd          | _        nd | _        | j        rd S d| _        t          | dd           Pt          j        | j	        j
                  5  | j	                            d            d d d            n# 1 swxY w Y   t          | dd           P| j        D ]H}t          j        |j
                  5  |                    d            d d d            n# 1 swxY w Y   It          | d	d           t          j        | j        d         j
                  5  | j        d                             d d | j        j        g           d d d            n# 1 swxY w Y   t          j        | j        d         j
                  5  | j        d                             d            d d d            d S # 1 swxY w Y   d S d S )
Nr   r   r  Tgroup_tokenr5  rk  r   rl  )r   r8  r   r   rn  r   r   r(   r   rk  r   r   r   rl  r   r   layers      r0   r   zTFGroupViTStage.build  s   !###$.0GH#"	  /    D  $D: 	F
4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , ,44((4 & &]5:.. & &KK%%%& & & & & & & & & & & & & & &4*D11=t3A6;<< U U$Q'--tT4;;R.STTTU U U U U U U U U U U U U U Ut3A6;<< 4 4$Q'--d3334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 >=sH   B**B.1B.(D

D	D	.FF	F	4!G""G&)G&c                    | j         d uS r   )rn  r   s    r0   with_group_tokenz TFGroupViTStage.with_group_token  s    t++r2   r   r"   r4   c                h    | j         r(|d d d | j         f         |d d | j         d f         fS |d fS r   )rr  r   )r   r   s     r0   split_xzTFGroupViTStage.split_x  sU      	QQQ/4////0!AAA8L7L7N7N4N2OOOd7Nr2   rn  r   c                :    ||S t          j        ||gd          S )Nr   r   )r(   concat)r   r   rn  s      r0   concat_xzTFGroupViTStage.concat_x  s'    Hy![)2222r2   Fhidden_statesprev_group_tokenoutput_attentionsrX   r   rz   c                   | j         rSt          j        | j        t	          |          d         ddf          }| j        | j        D ]} ||          }||z   }nd}|}|                     ||          }| j        D ]} ||ddd          }	|	d         }|                     |          \  }}d}
| j	        | 	                    ||          \  }}
||f}|r||
fz   }|S )a  
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the grouping tensors of Grouping block.
        r   r   )rZ  N)attention_maskcausal_attention_maskrz  )
rr  r(   r+   rn  r   rl  rw  r   rt  rk  )r   rx  ry  rz  r   rn  rp  r   cat_x	layer_outr  outputss               r0   r   zTFGroupViTStage.call  s1       	'$"2z-?X?XYZ?[]^`a>bcccK#/!1 ? ?E',u-='>'>$$),<<Ka--[ 	! 	!E#&*"&	  I aLEEe,,;	?&??1k::LAyk" 	-,Gr2   )
r   r   rI   rF   rc  rF   r   rF   r   rF   r   )r   r"   r4   r"   )r   r"   rn  r   r4   r"   )NFF)
rx  r"   ry  r   rz  rX   r   rX   r4   rz   )r   r   r   r   r   r   propertyrr  rt  rw  r   r   r   s   @r0   rb  rb    s        WW!( !( !( !( !( !(F4 4 4 46 , , X,   3 3 3 3 3 .2"'/ / / / / / / / /r2   rb  c                  :     e Zd Z	 	 	 dd fdZdddZddZ xZS )r   Nr   r   r   r$   intermediate_sizeoutput_sizec                f    t                      j        di | || _        t          |j                  | _        ||n|j        }||n|j        }||n|}t          j	        
                    |d          | _        t          j	        
                    |d          | _        || _        || _        d S )Nfc1r   fc2r   )r   r   r   r   
hidden_actactivation_fnr   r  r   r   r   r  r  )r   r   r   r  r  r   r   s         r0   r   zTFGroupViTMLP.__init__)  s     	""6""".v/@AA%0%<kk&BT1B1N--TZTl%0%<kk+<%%&7e%DD<%%k%>>!2&r2   Frx  r"   r   rX   r4   c                    |                      |          }|                     |          }|                     |          }|S r   )r  r  r  )r   rx  r   s      r0   r   zTFGroupViTMLP.call<  s=    //**=99//r2   c                   | j         rd S d| _         t          | dd           Xt          j        | j        j                  5  | j                            d d | j        g           d d d            n# 1 swxY w Y   t          | dd           [t          j        | j        j                  5  | j                            d d | j	        g           d d d            d S # 1 swxY w Y   d S d S )NTr  r  )
r   r   r(   r   r  r   r   r   r  r  r   s     r0   r   zTFGroupViTMLP.buildB  su   : 	F
4%%1tx}-- ? ?dD,<=>>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ?4%%1tx}-- E EdD,BCDDDE E E E E E E E E E E E E E E E E E 21s$    #A//A36A3)#CC Cr`  )r   r   r   r$   r  r$   r  r$   r   )rx  r"   r   rX   r4   r"   r   r   r   s   @r0   r   r   (  s         #'(,"&' ' ' ' ' ' '&    	E 	E 	E 	E 	E 	E 	E 	Er2   r   c                  "     e Zd Zdd fdZ xZS )r   Fr   rX   c                    t                                          t          j        |d                    }t          j        |d          S )Nr~   ri   rx  )r   r   r(   rA   )r   r   r   r   s      r0   r   zTFGroupViTMixerMLP.callO  s>    GGLLr|AI'F'F'FLGG|AI....r2   r   )r   rX   )r   r   r   r   r   r   s   @r0   r   r   N  sB        / / / / / / / / / / /r2   r   c                  H     e Zd ZdZd fdZdd
Z	 	 	 	 	 dddZddZ xZS )r   z=Multi-headed attention from 'Attention Is All You Need' paperr   r   c                    t                      j        di | |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          |j        }| j        dz  d|j        z  dz  z  |z  }| j        dz  |z  }t          j
        | j                  | _        t          j                            | j        t          |          d          | _        t          j                            | j        t          |          d          | _        t          j                            | j        t          |          d	          | _        t          j                            |j        
          | _        t          j                            | j        t          |          d          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r!  r   rg   r   )unitsr  r   r   r   )r0  out_projr   )r   r   r   rG  num_attention_headsattention_head_sizer%  rN  num_hidden_layersr9   r=  sqrt_att_head_sizer   r   r   r   r   r   r   r2  attention_dropoutr/  r  )r   r   r   factorin_proj_stdout_proj_stdr   s         r0   r   zTFGroupViTAttention.__init__X  s   ""6"""+#)#= #'>T5M#M #d&>>$.PP1dn 1 1,1 1 1  
 *~t+V5M1MRV0VWZ``,6"&)D,D"E"El((._[5Q5QX` ) 
 
 l((._[5Q5QX` ) 
 
 l((._[5Q5QX` ) 
 
 |++1I+JJ**._\5R5RYc + 
 
r2   r#  r"   rw   rF   r4   c                z    t          j        ||d| j        | j        f          }t          j        |g d          S )NrT   r"  r   rg   r   r   ri   )r(   rp   r  r  rA   )r   r#  rw   s      r0   transpose_for_scoresz(TFGroupViTAttention.transpose_for_scores{  sA    6*b$BZ\`\t1uvvv |F6666r2   NFrx  r|  r   r}  rz  bool | Noner   r   rX   rz   c                   t          |          d         }|du}|                     |          }	|r-|                     |          }
|                     |          }n,|                     |          }
|                     |          }|                     |	|          }|                     |
|          }|                     ||          }t          j        ||d          }t          j        | j        |j	                  }t          j
        ||          }|t          j        ||          }|t          j        ||          }t          |d          }|                     |          }t          j        ||          }t          j        |g d	
          }t          j        ||d| j        f          }|                     |          }|r||fn|f}|S )z#Input shape: Batch x Time x Channelr   NinputsTr   r&   rT   )r3   rJ   r  ri   r"  )r   r   r   r   r  r(   r   r*   r  r'   divideaddr   r/  rA   rp   rG  r  )r   rx  r|  r}  rz  r   r   rw   is_cross_attentionmixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresdk_attention_probsattention_probsattention_outputr  s                        r0   r   zTFGroupViTAttention.call  s     ..q1
2$> KK}K== 	B"kk1FkGGO $3H I I"kkk??O $= A A//0A:NN--ozJJ	//0A:NN 9[)NNNWT,4D4JKKK9%5r:: !,!v&68MNN%!v&6GG *1AKKK ,,.>,??9_kBB<(8|||LLL :-=jRTVZVdEefff==)9:: ;Ld#%566RbQdr2   c                t   | j         rd S d| _         t          | dd           Xt          j        | j        j                  5  | j                            d d | j        g           d d d            n# 1 swxY w Y   t          | dd           Xt          j        | j        j                  5  | j                            d d | j        g           d d d            n# 1 swxY w Y   t          | dd           Xt          j        | j	        j                  5  | j	                            d d | j        g           d d d            n# 1 swxY w Y   t          | dd           [t          j        | j
        j                  5  | j
                            d d | j        g           d d d            d S # 1 swxY w Y   d S d S )NTr   r   r   r  )r   r   r(   r   r   r   r   rG  r   r   r  r   s     r0   r   zTFGroupViTAttention.build  s   : 	F
44((4t{/00 @ @!!4t~">???@ @ @ @ @ @ @ @ @ @ @ @ @ @ @44((4t{/00 @ @!!4t~">???@ @ @ @ @ @ @ @ @ @ @ @ @ @ @44((4t{/00 @ @!!4t~">???@ @ @ @ @ @ @ @ @ @ @ @ @ @ @4T**6t}122 B B##T4$@AAAB B B B B B B B B B B B B B B B B B 76sH    #A//A36A3)#CCC#EEE;#F++F/2F/r(  )r#  r"   rw   rF   r4   r"   NNNNF)rx  r"   r|  r   r}  r   rz  r  r   r   r   rX   r4   rz   r   )	r   r   r   r   r   r  r   r   r   r   s   @r0   r   r   U  s        GG 
  
  
  
  
  
F7 7 7 7 ,026)-26; ; ; ; ;zB B B B B B B Br2   r   c                  4     e Zd Zd fdZ	 dddZddZ xZS )rh  r   r   c                V    t                      j        di | |j        | _        t	          |d          | _        t          j                            |j	        d          | _
        t          |d          | _        t          j                            |j	        d          | _        d S )N	self_attnr   layer_norm1r   r   layer_norm2r   )r   r   r   rG  r   r  r   r   r   r   r  r   r   r  r   s      r0   r   zTFGroupViTEncoderLayer.__init__  s    ""6"""+,V+FFF <::6CX_l:mm e444 <::6CX_l:mmr2   Frx  r"   r|  r}  rz  rX   r   r4   rz   c                   |}|                      |          }|                     |||||          }|d         }||z   }|}|                     |          }|                     |          }||z   }|f|dd         z   }|S )a  
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`tf.Tensor`): causal attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`):
                Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned
                tensors for more detail.
        r  )rx  r|  r}  rz  r   r   r  r   N)r  r  r  r   )	r   rx  r|  r}  rz  r   residualattention_outputsr  s	            r0   r   zTFGroupViTEncoderLayer.call  s    & !(((>> NN')"7/ + 
 
 *!, =0 (((>>}== =0 "%6qrr%::r2   Nc                T   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Xt          j        | j        j                  5  | j                            d d | j        g           d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j	        j                  5  | j	                            d            d d d            n# 1 swxY w Y   t          | dd           [t          j        | j
        j                  5  | j
                            d d | j        g           d d d            d S # 1 swxY w Y   d S d S )NTr  r  r   r  )r   r   r(   r   r  r   r   r  rG  r   r  r   s     r0   r   zTFGroupViTEncoderLayer.build  s   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4--9t/455 E E &&dDN'CDDDE E E E E E E E E E E E E E E4%%1tx}-- % %t$$$% % % % % % % % % % % % % % %4--9t/455 E E &&dDN'CDDDE E E E E E E E E E E E E E E E E E :9sH    A''A+.A+!#CCC
D11D58D5+#FF"Fr(  r   )rx  r"   r|  r"   r}  r"   rz  rX   r   rX   r4   rz   r   r   r   s   @r0   rh  rh    s~        n n n n n n ' ' ' ' 'RE E E E E E E Er2   rh  c                  4     e Zd Zd fdZ	 dddZddZ xZS )TFGroupViTTextEncoderr   r   c                     t                      j        di | fdt          j                  D             | _        d S )Nc                8    g | ]}t          d |           S rf  rg  ri  s     r0   r   z2TFGroupViTTextEncoder.__init__.<locals>.<listcomp>  s.    uuuPQ-f?q??KKKuuur2   r   )r   r   r=   r  r   r   s    ` r0   r   zTFGroupViTTextEncoder.__init__  sM    ""6"""uuuuUZ[a[sUtUtuuur2   Fr|  r"   r}  rz  rX   output_hidden_statesreturn_dictr   r4   tuple | TFBaseModelOutputc                   |rdnd }|rdnd }	t          | j                  D ]2\  }
}|r||fz   } |||||          }|d         }|r|	|d         fz   }	3|r||fz   }|st          d |||	fD                       S t          |||	          S )Nr   )rz  r   r   c              3     K   | ]}||V  	d S r   r   r   vs     r0   r   z-TFGroupViTTextEncoder.call.<locals>.<genexpr>>  s(      eeqWXWdWdWdWdWdeer2   last_hidden_staterx  rb   )	enumerater   r   r   )r   rx  r|  r}  rz  r  r  r   encoder_statesall_attentionsidxencoder_layerlayer_outputss                r0   r   zTFGroupViTTextEncoder.call  s     4=0:d"+DK"8"8 	F 	FC# C!/=2B!B)M%"3	  M *!,M  F!/=3C2E!E 	?+}.>>N 	fee]NN$Seeeeee +>Vd
 
 
 	
r2   Nc                    | j         rd S d| _         t          | dd           P| j        D ]J}t          j        |j                  5  |                    d            d d d            n# 1 swxY w Y   Id S d S )NTr   )r   r   r   r(   r   r   r   ro  s      r0   r   zTFGroupViTTextEncoder.buildC      : 	F
44((4 & &]5:.. & &KK%%%& & & & & & & & & & & & & & & 54& &   A&&A*	-A*	r_  r   )r|  r"   r}  r"   rz  rX   r  rX   r  rX   r   rX   r4   r  r   r   r   s   @r0   r  r    sv        v v v v v v #
 #
 #
 #
 #
J& & & & & & & &r2   r  c                  4     e Zd Zd fdZ	 dddZddZ xZS )TFGroupViTVisionEncoderr   r   r4   Nonec                     t                      j        di | fdt          t          j                            D             | _        d S )Nc                    g | ]Q}t          j        |         j        |         j        |         |d k    rj        |dz
           nd d|           RS )r   r   z	stages_._)r   rI   r   r   rc  r   )rb  depthsnum_group_tokensnum_output_groupsri  s     r0   r   z4TFGroupViTVisionEncoder.__init__.<locals>.<listcomp>Q  s     

 

 

  mA& & 7 :!'!9!!<HIAV%=a!e%D%DST$__  

 

 

r2   r   )r   r   r=   rM   r  stagesr   s    ` r0   r   z TFGroupViTVisionEncoder.__init__N  sc    ""6"""

 

 

 

 3v}--..

 

 

r2   Frx  r"   r  rX   rz  r  r   r  c                   |rdnd }|rdnd }d }| j         D ]=}	|r||fz   } |	|||          }
|
d         }|
d         }|r|
d         ||
d         fz   }>|r||fz   }|st          d |||fD                       S t          |||          S )Nr   r   r   rg   c              3     K   | ]}||V  	d S r   r   r  s     r0   r   z/TFGroupViTVisionEncoder.call.<locals>.<genexpr>z  s(      ggqYZYfYfYfYfYfggr2   r  )r  r   r   )r   rx  r  rz  r  r   all_hidden_statesall_groupingsr   stager  s              r0   r   zTFGroupViTVisionEncoder.call]  s    #7@BBD/9T[ 
	D 
	DE# I$58H$H!!E-?PQQM)!,M(+L  D]1%5%A -q1A0C C 	E 1]4D D 	hgg]4E}$Ugggggg +;LYf
 
 
 	
r2   Nc                    | j         rd S d| _         t          | dd           P| j        D ]J}t          j        |j                  5  |                    d            d d d            n# 1 swxY w Y   Id S d S )NTr  )r   r   r  r(   r   r   r   ro  s      r0   r   zTFGroupViTVisionEncoder.build  r  r  )r   r   r4   r  r   )rx  r"   r  rX   rz  rX   r  rX   r   rX   r4   r  r   r   r   s   @r0   r  r  M  sp        
 
 
 
 
 
*  
  
  
  
  
D& & & & & & & &r2   r  c                  H     e Zd Zd fdZ	 dddZej        fdZddZ xZ	S )TFGroupViTTextTransformerr   r   c                    t                      j        di | t          |d          | _        t	          |d          | _        t          j                            |j	        d          | _
        |j        | _        |j        | _        d S )Nr&  r   encoderfinal_layer_normr   r   )r   r   rE  r&  r  r  r   r   r   r   r  eos_token_idr   rG  r   s      r0   r   z"TFGroupViTTextTransformer.__init__  s    ""6"""26MMM,V)DDD % ? ?H]dv ? w w #/+r2   FrP  r
   r|  r"   rQ  rz  rX   r  r  r   r4   /TFBaseModelOutputWithPooling | tuple[tf.Tensor]c                   t          |          }|                     ||          }	|\  }
}|                     |
||	j                  }t	          |          }|                     |	||||||          }|d         }|                     |          }| j        dk    rpt          j	        |t          j
        t          j        |d         t          j                  t          j                            |d          fd	
                    }nt          j	        |t          j
        t          j        |d         t          j                  t          j                            t          j        || j        k    t          j                  d          fd	
                    }|s||f|d	d          z   S t#          |||j        |j                  S )N)rP  rQ  r&   )rx  r|  r}  rz  r  r  r   r   r  rg   rT   r   r   )valuesrJ   rT  r  pooler_outputrx  rb   )r   r&  _build_causal_attention_maskr'   r1   r  r  r  r(   	gather_ndstackr=   int64r9   rK   r*   int8r	   rx  rb   )r   rP  r|  rQ  rz  r  r  r   r   embedding_outputrw   
seq_lengthr}  encoder_outputssequence_outputpooled_outputs                   r0   r   zTFGroupViTTextTransformer.call  s    !++??Y\?ZZ!,
J !% A A*j`p`v A w w &n55,,*)"7/!5# ' 
 
 *!,///GG!! L&H[^28DDDbgnnU^egnFhFhipq    MM L&Qrx@@@rwyD<M/MUWU\']']']dfgg   	 	 	M  	J#]3oabb6III+-')7&1	
 
 
 	
r2   c                `   t          j        t          j        |fd          |          }t          j        t          j        ||fd          |          }t           j                            |dd          }t           j                            ||          }t          j        ||d||f          S )Nr   g     r   rT   )diagonalr   )rY  r^   )r(   r*   filllinalg	band_partset_diagbroadcast_to)r   rw   r  r'   diagto_masks         r0   r  z6TFGroupViTTextTransformer._build_causal_attention_mask  s    
 wrw
}c22E:: '"':z":HEEuMM )%%gq"55)$$Wt$<<WZJPZ4[\\\\r2   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           [t          j        | j        j                  5  | j                            d d | j	        g           d d d            d S # 1 swxY w Y   d S d S )NTr&  r  r  )
r   r   r(   r   r&  r   r   r  r  rG  r   s     r0   r   zTFGroupViTTextTransformer.build  s   : 	F
4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , ,4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )4+T22>t49:: J J%++T4,HIIIJ J J J J J J J J J J J J J J J J J ?>6    A''A+.A+!CCC#D22D69D6r_  r   )rP  r
   r|  r"   rQ  r"   rz  rX   r  rX   r  rX   r   rX   r4   r  r   )
r   r   r   r   r   r(   float32r  r   r   r   s   @r0   r  r    s        	, 	, 	, 	, 	, 	,& E
 E
 E
 E
 E
N JL ] ] ] ]$J J J J J J J Jr2   r  c                  4     e Zd Zd fdZ	 dddZddZ xZS )TFGroupViTVisionTransformerr   r   c                     t                      j        di | t          |d          | _        t	          |d          | _        t          j                            |j	        d          | _
        |j        | _        d S )Nr&  r   r  r1  r   r   )r   r   r,  r&  r  r  r   r   r   r   r1  r   rG  r   s      r0   r   z$TFGroupViTVisionTransformer.__init__   sv    ""6"""4V,OOO.vIFFF88AV]h8ii+r2   Fr  r
   rz  rX   r  r  r   r4   $tuple | TFBaseModelOutputWithPoolingc                6   |                      |          }|                     ||||          }|d         }|                     |          }t          j                            |d          }	|s||	f|dd          z   S t          ||	|j        |j                  S )N)rx  r  rz  r  r   r   r   r  )	r&  r  r1  r(   r9   r:   r	   rx  rb   )
r   r  rz  r  r  r   r  r  r  r  s
             r0   r   z TFGroupViTVisionTransformer.call  s      ??<88,,*!5/#	 ' 
 
 ,A. !NN+<==++,=A+FF 	L%}58KKK+/')7&1	
 
 
 	
r2   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           [t          j        | j        j                  5  | j                            d d | j	        g           d d d            d S # 1 swxY w Y   d S d S )NTr&  r  r1  )
r   r   r(   r   r&  r   r   r  r1  rG  r   s     r0   r   z!TFGroupViTVisionTransformer.build)  s   : 	F
4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , ,4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )4d++7t~233 C C$$dD$.%ABBBC C C C C C C C C C C C C C C C C C 87r  r   r   )r  r
   rz  rX   r  rX   r  rX   r   rX   r4   r  r   r   r   s   @r0   r  r    sx        , , , , , , 
 
 
 
 
BC C C C C C C Cr2   r  c                  d     e Zd ZeZd fdZddZdd	Ze	 	 	 	 	 	 	 ddd            Z	ddZ
 xZS ) TFGroupViTTextMainLayerr   r   c                t     t                      j        di | || _        t          |d          | _        d S )N
text_modelr   r   )r   r   r   r  r
  r   s      r0   r   z TFGroupViTTextMainLayer.__init__=  s?    ""6"""3FNNNr2   r4   keras.layers.Layerc                    | j         j        S r   )r
  r&  r   s    r0   get_input_embeddingsz,TFGroupViTTextMainLayer.get_input_embeddingsB  s    ))r2   r   tf.Variablec                p    || j         j        _        t          |          d         | j         j        _        d S )Nr   )r
  r&  rK  r   rM  )r   r   s     r0   set_input_embeddingsz,TFGroupViTTextMainLayer.set_input_embeddingsE  s/    ,1")0:50A0A!0D"---r2   NFrP  TFModelInputType | Noner|  np.ndarray | tf.Tensor | NonerQ  rz  r  r  r  r   rX   r  c           	         |t          d          t          |          }|t          j        |d          }|                     |||||||          }	|	S )NzYou have to specify input_idsr   dimsr   rP  r|  rQ  rz  r  r  r   )r%  r   r(   r  r
  )
r   rP  r|  rQ  rz  r  r  r   r   text_model_outputss
             r0   r   zTFGroupViTTextMainLayer.callI  sv     <=== ++!W+Q???N!__)%/!5# - 
 
 "!r2   c                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr
  )r   r   r(   r   r
  r   r   r   s     r0   r   zTFGroupViTTextMainLayer.buildh  s    : 	F
4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , , , , , 98    A((A,/A,r_  r4   r  )r   r  NNNNNNFrP  r  r|  r  rQ  r  rz  r  r  r  r  r  r   rX   r4   r  r   )r   r   r   r   config_classr   r  r  r   r   r   r   r   s   @r0   r  r  8  s         &LO O O O O O
* * * *E E E E  .28<6:)-,0#'" " " " ]"<, , , , , , , ,r2   r  c                  X     e Zd ZeZd fdZddZe	 	 	 	 	 ddd            ZddZ	 xZ
S )TFGroupViTVisionMainLayerr   r   c                t     t                      j        di | || _        t          |d          | _        d S )Nvision_modelr   r   )r   r   r   r  r!  r   s      r0   r   z"TFGroupViTVisionMainLayer.__init__v  sB    ""6"""7^TTTr2   r4   r  c                    | j         j        S r   )r!  r&  r   s    r0   r  z.TFGroupViTVisionMainLayer.get_input_embeddings{  s     ++r2   NFr  r  rz  r  r  r  r   rX   r  c                \    |t          d          |                     |||||          }|S )N You have to specify pixel_valuesr  rz  r  r  r   )r%  r!  )r   r  rz  r  r  r   vision_model_outputss          r0   r   zTFGroupViTVisionMainLayer.call~  sK     ?@@@#00%/!5#  1  
  
 $#r2   c                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr!  )r   r   r(   r   r!  r   r   r   s     r0   r   zTFGroupViTVisionMainLayer.build  s    : 	F
4..:t0566 . .!''---. . . . . . . . . . . . . . . . . . ;:r  r   r  r  r  r  rz  r  r  r  r  r  r   rX   r4   r  r   )r   r   r   r   r  r   r  r   r   r   r   r   s   @r0   r  r  q  s         (LU U U U U U
, , , ,  15)-,0#'$ $ $ $ ]$*. . . . . . . .r2   r  c                       e Zd ZeZd fdZddZe	 	 	 	 	 	 	 ddd            Ze	 	 	 	 	 dd d            Z	e	 	 	 	 	 	 	 	 	 	 d!d"d            Z
 xZS )#TFGroupViTMainLayerr   r   c                \    t                      j        di | t          |j        t                    s%t          dt          |j                   d          t          |j        t                    s%t          dt          |j                   d          || _	        |j        }|j        }|j
        | _
        |j        | _        |j        | _        |j        | _        t          |d          | _        t#          |d          | _        t&          j                            | j        d          t&          j                            dd	d
          t&          j                            d          t&          j                            | j
        d          g| _        t&          j                            | j        d          t&          j                            dd	d
          t&          j                            d          t&          j                            | j
        d          g| _        d S )NzOconfig.text_config is expected to be of type GroupViTTextConfig but is of type .zSconfig.vision_config is expected to be of type GroupViTVisionConfig but is of type r
  r   r!  zvisual_projection.0zvisual_projection.1g?gh㈵>)r   momentumr   zvisual_projection.2zvisual_projection.3ztext_projection.0ztext_projection.1ztext_projection.2ztext_projection.3r   )r   r   r   text_configr   	TypeErrortypevision_configr   r   projection_dimprojection_intermediate_dimr   text_embed_dimvision_embed_dimr  r
  r  r!  r   r   r   BatchNormalizationReLUvisual_projectiontext_projection)r   r   r   r.  r1  r   s        r0   r   zTFGroupViTMainLayer.__init__  s   ""6"""&,.@AA 	0+,,0 0 0  
 &.0DEE 	2-..2 2 2  
 (,$3+1+M()5 - 93KlSSS7N[[[ Lt?F[\\L++1FQT^b+ccL#899Lt29NOO	"
 Lt?FYZZL++1Ds\`+aaL#677Lt29LMM	 
r2   Nc                6   |                      dt          j                            | j        j                  dd          | _        | j        rd S d| _        t          | dd           Pt          j
        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j
        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           -t          j
        | j        d         j                  5  | j        d                             d d d | j        g           d d d            n# 1 swxY w Y   t          j
        | j        d	         j                  5  | j        d	                             d | j        f           d d d            n# 1 swxY w Y   t          j
        | j        d
         j                  5  | j        d
                             d d d | j        g           d d d            n# 1 swxY w Y   t          | dd           0t          j
        | j        d         j                  5  | j        d                             d d d | j        g           d d d            n# 1 swxY w Y   t          j
        | j        d	         j                  5  | j        d	                             d | j        f           d d d            n# 1 swxY w Y   t          j
        | j        d
         j                  5  | j        d
                             d d d | j        g           d d d            d S # 1 swxY w Y   d S d S )N)r   Tlogit_scaler5  r
  r!  r8  r   r   r   r9  )r8  r   initializersConstantr   logit_scale_init_valuer;  r   r   r(   r   r
  r   r   r!  r8  r5  r3  r9  r4  r   s     r0   r   zTFGroupViTMainLayer.build  s   ??*33DK4VWW	 + 
 
 : 	F
4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , ,4..:t0566 . .!''---. . . . . . . . . . . . . . .4,d33?t5a8=>> [ [&q)//tT4CX0YZZZ[ [ [ [ [ [ [ [ [ [ [ [ [ [ [t5a8=>> Z Z&q)//t7W0XYYYZ Z Z Z Z Z Z Z Z Z Z Z Z Z Zt5a8=>> f f&q)//tT4Cc0deeef f f f f f f f f f f f f f f4*D11=t3A6;<< W W$Q'--tT4AT.UVVVW W W W W W W W W W W W W W Wt3A6;<< X X$Q'--tT5U.VWWWX X X X X X X X X X X X X X Xt3A6;<< d d$Q'--tT4Aa.bcccd d d d d d d d d d d d d d d d d d >=s   B,,B03B0&DDD*FFF3(G''G+.G+*III*KK
K2(L&&L*-L**NNNFrP  r  r|  r  rQ  rz  r  r  r  r   rX   r4   r"   c           	         |t          d          t          |          }|t          j        |d          }|                     |||||||          }	|	d         }
| j        D ]} ||
          }
|
}|S )N$You have to specify either input_idsr   r  r  )r%  r   r(   r  r
  r9  )r   rP  r|  rQ  rz  r  r  r   r   text_outputsr  rp  text_featuress                r0   get_text_featuresz%TFGroupViTMainLayer.get_text_features  s     CDDD ++!W+Q???N)%/!5# ' 
 
 %Q) 	1 	1E!E-00MM%r2   r  c                    |t          d          |                     |||||          }|d         }| j        D ]} ||          }|}	|	S )Nr$  r%  r   )r%  r!  r8  )
r   r  rz  r  r  r   vision_outputsr  rp  image_featuress
             r0   get_image_featuresz&TFGroupViTMainLayer.get_image_features  s|     ?@@@**%/!5# + 
 
 'q)+ 	1 	1E!E-00MM&r2   return_lossoutput_segmentation(TFGroupViTModelOutput | tuple[tf.Tensor]c           
     P   |t          d          |t          d          t          |          }|t          j        |d          }|rd}|                     ||||	|
          }|                     ||||||	|
          }|d         }| j        D ]} ||          }|d         }| j        D ]} ||          }|t          j        |dd	          z  }|t          j        |dd	          z  }t          j	        
                    | j                  }t          j        ||d
          |z  }t          j        |          }d }|r|d         }t          j        |dt          |          d         f          }| j        D ]} ||          }|r	|d         }n|d         }t          ||j        dd                    }|t          j        |ddd          z  }t          j        ||d
          |z  }t          j        ||j        d         d|j        d         f          }t          j        |d          }t          j        |t          |          d         t          |          d         df          }t          j        ||          |z  }t          j        ||j        d         |j        d         |j        d         |j        d         f          }d }|rt#          |          d         }|	s|
|||||||f}n||||||f}||f|z   n|S t%          ||||||||          S )Nr@  r$  r   r  Tr%  r  rT   r   r   r   r:  r   rg   	euclidean)r#  ordrJ   r   r~   ri   )N.)r   r   r   r   r   r   r   r   )r%  r   r(   r  r!  r
  r8  r9  normr9   expr;  r   rA   rp   r   r^   rD   r   )r   rP  r  r|  rQ  rH  rz  r  rI  r  r   r   rE  rA  r   rp  r   r;  r   r   
seg_logitsimage_group_embedsrb   groupinglogits_per_image_groupflatten_groupingr   outputs                               r0   r   zTFGroupViTMainLayer.call*  s    CDDD?@@@ ++!W+Q???N 	% $**%/!5# + 
 
 )%/!5# ' 
 
 &a(+ 	/ 	/E 5..LL"1o) 	- 	-E%,,KK $bgld&S&S&SS!BGKb4$P$P$PP gkk$"233)K4PPPS^^<88
 "	 "0!2!#,>r:VhKiKijlKmFn!o!o!o/ ? ?%*U+=%>%>""# /+A.

+A.
3J@RSTSUSU@VWWH "4bg){d7 7 7 " &(Y/A;\`%a%a%ado%o"%'Z&|/A!/Db+J[\]J^._& & &" &(\2Hy%Y%Y%Y"  "z(:h;O;OPQ;RT^_gThThijTkmo:pqqq #9;KLL{ZJ:#3A#6
8H8KX^\]M^`h`nop`q"r  J  	= 11)<D 	F%$#  " +O[,Xdftu)-)9TGf$$vE$-+ *#%* .	
 	
 	
 		
r2   r(  r   r  rP  r  r|  r  rQ  r  rz  r  r  r  r  r  r   rX   r4   r"   r  r  r  rz  r  r  r  r  r  r   rX   r4   r"   
NNNNNNNNNFrP  r  r  r  r|  r  rQ  r  rH  r  rz  r  r  r  rI  r  r  r  r   rX   r4   rJ  )r   r   r   r   r  r   r   r   rC  rG  r   r   r   s   @r0   r*  r*    s        "L'
 '
 '
 '
 '
 '
Rd d d d@  .28<6:)-,0#'! ! ! ! ]!F  15)-,0#'    ]4  .2048<6:#')-,0+/#'|
 |
 |
 |
 ]|
 |
 |
 |
 |
r2   r*  c                      e Zd ZdZeZdZdS )TFGroupViTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    groupvitN)r   r   r   r   r   r  base_model_prefixr   r2   r0   r[  r[    s'         
 "L"r2   r[  aB  
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TF 2.0 models accepts two formats as inputs:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional arguments.

    This second option is useful when using [`keras.Model.fit`] method which currently requires having all the
    tensors in the first argument of the model call function: `model(inputs)`.

    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the
    first positional argument :

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
      `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
      `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    </Tip>

    Args:
        config ([`GroupViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False``):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
a  
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]`, `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False``):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
al
  
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details.
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False``):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
c                       e Zd ZeZdZd fdZe ee	
                    d                     eee          	 	 	 	 	 	 	 ddd                                    ZddZ xZS )TFGroupViTTextModelrP  r   r   c                n     t                      j        |g|R i | t          |d          | _        d S Nr\  r   )r   r   r  r\  r   r   r  r   r   s       r0   r   zTFGroupViTTextModel.__init__A  sB    3&333F333/ZHHHr2   batch_size, sequence_lengthoutput_typer  NFr  r|  r  rQ  rz  r  r  r  r   rX   r4   r  c           	     >    |                      |||||||          }|S )aO  
        Returns:

        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, TFGroupViTTextModel

        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = TFGroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r  r\  )	r   rP  r|  rQ  rz  r  r  r   r  s	            r0   r   zTFGroupViTTextModel.callF  s8    > --)%/!5#   
 
 r2   c                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S NTr\  r   r   r(   r   r\  r   r   r   s     r0   r   zTFGroupViTTextModel.buildq      : 	F
4T**6t}122 * *##D)))* * * * * * * * * * * * * * * * * * 76r  r_  r  r  r   )r   r   r   r   r  main_input_namer   r   r   GROUPVIT_TEXT_INPUTS_DOCSTRINGformatr   r	   r   r   r   r   s   @r0   r_  r_  =  s        %L!OI I I I I I
 **+I+P+PQn+o+opp+GVhiii .28<6:)-,0#'& & & & ji qp ]&P* * * * * * * *r2   r_  c                       e Zd ZeZdZd fdZe ee	           e
ee          	 	 	 	 	 ddd                                    ZddZ xZS )TFGroupViTVisionModelr  r   r   c                n     t                      j        |g|R i | t          |d          | _        d S ra  )r   r   r  r\  rb  s       r0   r   zTFGroupViTVisionModel.__init__~  sB    3&333F3331&zJJJr2   rd  NFr  rz  r  r  r  r   rX   r4   r  c                :    |                      |||||          }|S )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFGroupViTVisionModel

        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = TFGroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="tf")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r%  rg  )r   r  rz  r  r  r   r  s          r0   r   zTFGroupViTVisionModel.call  s3    D --%/!5#   
 
 r2   c                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S ri  rj  r   s     r0   r   zTFGroupViTVisionModel.build  rk  r  r   r  r(  r   )r   r   r   r   r  rl  r   r   r    GROUPVIT_VISION_INPUTS_DOCSTRINGr   r	   r   r   r   r   s   @r0   rp  rp  z  s        'L$OK K K K K K
 **+KLL+GVjkkk 15)-,0#'' ' ' ' lk ML ]'R* * * * * * * *r2   rp  c                      e Zd ZeZd  fdZe ee	                    d                    	 	 	 	 	 	 	 d!d"d                        Z
e ee          	 	 	 	 	 d#d$d                        Ze ee	                    d                     eee          	 	 	 	 	 	 	 	 	 	 d%d&d                                    Zd'dZd(dZ xZS ))TFGroupViTModelr   r   c                n     t                      j        |g|R i | t          |d          | _        d S ra  )r   r   r*  r\  rb  s       r0   r   zTFGroupViTModel.__init__  sB    3&333F333+FDDDr2   rc  NFrP  r  r|  r  rQ  rz  r  r  r  r   rX   r4   r"   c           	     H    | j                             |||||||          }|S )a  
        Returns:
            text_features (`tf.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying
            the projection layer to the pooled output of [`TFGroupViTTextModel`].

        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, TFGroupViTModel

        >>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
        >>> text_features = model.get_text_features(**inputs)
        ```r  )r\  rC  )	r   rP  r|  rQ  rz  r  r  r   rB  s	            r0   rC  z!TFGroupViTModel.get_text_features  s=    : 77)%/!5# 8 
 
 r2   r  c                D    | j                             |||||          }|S )aF  
        Returns:
            image_features (`tf.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying
            the projection layer to the pooled output of [`TFGroupViTVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFGroupViTModel

        >>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="tf")

        >>> image_features = model.get_image_features(**inputs)
        ```r%  )r\  rG  )r   r  rz  r  r  r   rF  s          r0   rG  z"TFGroupViTModel.get_image_features  s8    B 99%/!5# : 
 
 r2   rd  rH  rI  rJ  c                D    |                      |||||||||	|

  
        }|S )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFGroupViTModel
        >>> import tensorflow as tf

        >>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = tf.math.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
        ```)
rP  r  r|  rQ  rH  rz  r  rI  r  r   rg  )r   rP  r  r|  rQ  rH  rz  r  rI  r  r   r  s               r0   r   zTFGroupViTModel.call  sB    T --%)%#/!5 3#   
 
 r2   rU  r   c                    |S r   r   )r   rU  s     r0   serving_outputzTFGroupViTModel.serving_outputN  s	     r2   c                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S ri  rj  r   s     r0   r   zTFGroupViTModel.buildT  rk  r  r(  r  rV  r  rW  rX  rY  )rU  r   r4   r   r   )r   r   r   r   r  r   r   r   rm  rn  rC  rt  rG  GROUPVIT_INPUTS_DOCSTRINGr   r   r   r|  r   r   r   s   @r0   rv  rv    s       !LE E E E E E
 **+I+P+PQn+o+opp .28<6:)-,0#'% % % % qp ]%N **+KLL 15)-,0#'' ' ' ' ML ]'R **+D+K+KLi+j+jkk+@~^^^ .2048<6:#')-,0+/#'4 4 4 4 _^ lk ]4l   * * * * * * * *r2   rv  )rv  r[  r_  rp  r   )r!   r"   r#   r$   )r3   r"   r4   r"   )r?   r"   r4   r"   )r3   r"   rE   rF   r4   r"   )r   FrT   )
r3   r"   rU   rV   rW   rX   rE   rF   r4   r"   r   )
rb   r"   rc   rF   rd   rF   re   rX   r4   r"   )rb   rz   r{   r|   r4   r"   )Zr   
__future__r   collections.abcr   r9   dataclassesr   typingr   numpyrn   
tensorflowr(   activations_tfr   modeling_tf_outputsr   r	   modeling_tf_utilsr
   r   r   r   r   r   tf_utilsr   r   r   utilsr   r   r   r   r   r   configuration_groupvitr   r   r   
get_loggerr   loggertensorflow_probabilityrZ   r[   NormalrC  ImportErrorerror_CHECKPOINT_FOR_DOCr,   r1   r>   rD   rS   ra   ry   r   r   r   Layerr   r   r   r	  r,  rE  rb  r   r   r   rh  r  r  r  r  r  r  r*  r[  GROUPVIT_START_DOCSTRINGrm  rt  r~  r_  rp  rv  __all__r   r2   r0   <module>r     sL     " " " " " "      ! ! ! ! ! !               / / / / / / R R R R R R R R                S R R R R R R R R R                ] \ \ \ \ \ \ \ \ \ 
	H	%	% '&(( 
,,,, $$C$88 
 
 
u	
 	
 	
 	
 	

,,,, $$C$88    1  
6 
6 
6 
6 
6   - - - -   "    2# # # # #L, , , ,8 /
 /
 /
 /
 /
K /
 /
 /
dL L L L LEL$6 L L LD?G ?G ?G ?G ?G 2 ?G ?G ?GDT. T. T. T. T.EL. T. T. T.pKM KM KM KM KM 2 KM KM KM^N N N N N!3 N N Nd7  7  7  7  7 u|1 7  7  7 t    el(   D#E #E #E #E #EEL& #E #E #EL/ / / / / / / /yB yB yB yB yB%,, yB yB yBzBE BE BE BE BEU\/ BE BE BEL2& 2& 2& 2& 2&EL. 2& 2& 2&j9& 9& 9& 9& 9&el0 9& 9& 9&zqJ qJ qJ qJ qJ 2 qJ qJ qJj6C 6C 6C 6C 6C%,"4 6C 6C 6Cr 4, 4, 4, 4, 4,el0 4, 4, 4,n '. '. '. '. '. 2 '. '. '.T H
 H
 H
 H
 H
%,, H
 H
 H
V# # # # # 1 # # #" H#" J$  *( V:* :* :* :* :*3 :* :* :*z;* ;* ;* ;* ;*5 ;* ;* ;*| .//a* a* a* a* a*/ a* a* 0/a*H k
j
js$    B# #C ?C  C% %C-,C-