
     `i                    x   d Z ddlmZ ddlZddlmZ ddlZddl	m
Z
 ddlmZmZmZmZmZmZmZ ddlmZmZ dd	lmZmZmZmZmZ d
dlmZ  ej        e          Z dZ!e G d de                      Z" G d dej#        j$                  Z% G d dej#        j$                  Z& G d dej#        j$                  Z' G d dej#        j$                  Z( G d dej#        j$                  Z) G d dej#        j$                  Z* G d dej#        j$                  Z+ G d dej#        j$                  Z, G d d ej#        j$                  Z- G d! d"ej#        j$                  Z. G d# d$ej#        j$                  Z/ G d% d&ej#        j$                  Z0 G d' d(ej#        j$                  Z1 G d) d*ej#        j$                  Z2e G d+ d,ej#        j$                              Z3 G d- d.e          Z4d/Z5d0Z6 ed1e5           G d2 d3e4                      Z7 ed4e5           G d5 d6e4e                      Z8g d7Z9dS )8zTF 2.0 Cvt model.    )annotationsN)	dataclass   )&TFImageClassifierOutputWithNoAttention)TFModelInputTypeTFPreTrainedModelTFSequenceClassificationLossget_initializerkeraskeras_serializableunpack_inputs)
shape_liststable_softmax)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )	CvtConfigr   c                  >    e Zd ZU dZdZded<   dZded<   dZded<   dS )TFBaseModelOutputWithCLSTokena2  
    Base class for model's outputs.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
            Classification token at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
            the initial embedding outputs.
    Ntf.Tensor | Nonelast_hidden_statecls_token_valueztuple[tf.Tensor, ...] | Nonehidden_states)__name__
__module____qualname____doc__r   __annotations__r   r        {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/cvt/modeling_tf_cvt.pyr   r   3   sR           +/....(,O,,,,26M666666r#   r   c                  .     e Zd ZdZd	 fdZd
ddZ xZS )TFCvtDropPathzDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    References:
        (1) github.com:rwightman/pytorch-image-models
    	drop_probfloatc                H     t                      j        di | || _        d S )Nr"   )super__init__r'   )selfr'   kwargs	__class__s      r$   r+   zTFCvtDropPath.__init__N   s+    ""6""""r#   Nx	tf.Tensorc                H   | j         dk    s|s|S d| j         z
  }t          j        |          d         fdt          t          j        |                    dz
  z  z   }|t          j                            |dd| j                  z   }t          j        |          }||z  |z  S )N        r   r   )r   )dtype)r'   tfshapelenrandomuniformcompute_dtypefloor)r,   r/   training	keep_probr5   random_tensors         r$   callzTFCvtDropPath.callR   s    >S   H&	!Q!DC,<,<q,@$AA!BI$5$5eQI[$5$\$\\//I..r#   )r'   r(   N)r/   r0   )r   r   r   r    r+   r>   __classcell__r.   s   @r$   r&   r&   H   s`         
# # # # # #/ / / / / / / / /r#   r&   c                  6     e Zd ZdZd fdZdddZddZ xZS )TFCvtEmbeddingsz-Construct the Convolutional Token Embeddings.configr   
patch_sizeintnum_channels	embed_dimstridepaddingdropout_rater(   c           	          t                      j        di | t          ||||||d          | _        t          j                            |          | _        d S )Nconvolution_embeddings)rE   rG   rH   rI   rJ   namer"   )r*   r+   TFCvtConvEmbeddingsrM   r   layersDropoutdropout)
r,   rD   rE   rG   rH   rI   rJ   rK   r-   r.   s
            r$   r+   zTFCvtEmbeddings.__init___   sk     	""6"""&9!%)'
 '
 '
# |++L99r#   Fpixel_valuesr0   r;   boolreturnc                ^    |                      |          }|                     ||          }|S Nr;   )rM   rR   )r,   rS   r;   hidden_states       r$   r>   zTFCvtEmbeddings.callv   s0    22<@@||L8|DDr#   Nc                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTrM   )builtgetattrr4   
name_scoperM   rN   buildr,   input_shapes     r$   r^   zTFCvtEmbeddings.build{       : 	F
41488Dt:?@@ 8 8+11$7778 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 ED    A((A,/A,)rD   r   rE   rF   rG   rF   rH   rF   rI   rF   rJ   rF   rK   r(   F)rS   r0   r;   rT   rU   r0   r?   r   r   r   r    r+   r>   r^   r@   rA   s   @r$   rC   rC   \   sp        77: : : : : :.    
8 8 8 8 8 8 8 8r#   rC   c                  4     e Zd ZdZd fd
ZddZddZ xZS )rO   zcImage to Convolution Embeddings. This convolutional operation aims to model local spatial contexts.rD   r   rE   rF   rG   rH   rI   rJ   c           
         t                      j        d	i | t          j                            |          | _        t          |t          j        j	                  r|n||f| _
        t          j                            |||ddt          |j                  d          | _        t          j                            dd          | _        || _        || _        d S )
NrJ   validchannels_last
projection)filterskernel_sizestridesrJ   data_formatkernel_initializerrN   h㈵>normalizationepsilonrN   r"   )r*   r+   r   rP   ZeroPadding2DrJ   
isinstancecollectionsabcIterablerE   Conv2Dr
   initializer_rangerj   LayerNormalizationrq   rG   rH   )	r,   rD   rE   rG   rH   rI   rJ   r-   r.   s	           r$   r+   zTFCvtConvEmbeddings.__init__   s     	""6"""|11'1BB(2:{?W(X(Xv**_iku^v,--"'.v/GHH . 
 
 #\<<TP_<``("r#   rS   r0   rU   c                R   t          |t                    r|d         }|                     |                     |                    }t	          |          \  }}}}||z  }t          j        ||||f          }|                     |          }t          j        |||||f          }|S )NrS   r5   )ru   dictrj   rJ   r   r4   reshaperq   )r,   rS   
batch_sizeheightwidthrG   hidden_sizes          r$   r>   zTFCvtConvEmbeddings.call   s    lD)) 	8'7Lt||L'A'ABB 3=\2J2J/
FE<unz,z;P\6]^^^)),77 z,z65R^6_```r#   Nc                   | j         rd S d| _         t          | dd           Yt          j        | j        j                  5  | j                            d d d | j        g           d d d            n# 1 swxY w Y   t          | dd           [t          j        | j        j                  5  | j                            d d | j	        g           d d d            d S # 1 swxY w Y   d S d S )NTrj   rq   )
r[   r\   r4   r]   rj   rN   r^   rG   rq   rH   r_   s     r$   r^   zTFCvtConvEmbeddings.build   s   : 	F
4t,,8t344 M M%%tT49J&KLLLM M M M M M M M M M M M M M M4$//;t1677 G G"(($dn)EFFFG G G G G G G G G G G G G G G G G G <;s$    $A00A47A4*#CC!C)rD   r   rE   rF   rG   rF   rH   rF   rI   rF   rJ   rF   )rS   r0   rU   r0   r?   rd   rA   s   @r$   rO   rO      ss        mm# # # # # #6    	G 	G 	G 	G 	G 	G 	G 	Gr#   rO   c                  6     e Zd ZdZd fd	ZdddZddZ xZS ) TFCvtSelfAttentionConvProjectionzConvolutional projection layer.rD   r   rH   rF   rl   rI   rJ   c           
     \    t                      j        d
i | t          j                            |          | _        t          j                            ||t          |j                  d|dd|          | _	        t          j        
                    ddd	          | _        || _        d S )Nrg   rh   Fconvolution)rk   rl   ro   rJ   rm   use_biasrN   groupsrp   g?rq   )rs   momentumrN   r"   )r*   r+   r   rP   rt   rJ   ry   r
   rz   r   BatchNormalizationrq   rH   )r,   rD   rH   rl   rI   rJ   r-   r.   s          r$   r+   z)TFCvtSelfAttentionConvProjection.__init__   s    ""6"""|11'1BB <..#.v/GHH / 	
 	
 #\<<TTW^m<nn"r#   FrY   r0   r;   rT   rU   c                    |                      |                     |                    }|                     ||          }|S rW   )r   rJ   rq   r,   rY   r;   s      r$   r>   z%TFCvtSelfAttentionConvProjection.call   s?    ''\(B(BCC)),)JJr#   Nc                   | j         rd S d| _         t          | dd           Yt          j        | j        j                  5  | j                            d d d | j        g           d d d            n# 1 swxY w Y   t          | dd           \t          j        | j        j                  5  | j                            d d d | j        g           d d d            d S # 1 swxY w Y   d S d S )NTr   rq   )	r[   r\   r4   r]   r   rN   r^   rH   rq   r_   s     r$   r^   z&TFCvtSelfAttentionConvProjection.build   s   : 	F
4--9t/455 K K &&dD$.'IJJJK K K K K K K K K K K K K K K4$//;t1677 M M"(($dDN)KLLLM M M M M M M M M M M M M M M M M M <;s$    $A00A47A4*$CC"C)
rD   r   rH   rF   rl   rF   rI   rF   rJ   rF   rc   rY   r0   r;   rT   rU   r0   r?   rd   rA   s   @r$   r   r      sx        ))# # # # # #"    
	M 	M 	M 	M 	M 	M 	M 	Mr#   r   c                      e Zd ZdZddZdS )"TFCvtSelfAttentionLinearProjectionz7Linear projection layer used to flatten tokens into 1D.rY   r0   rU   c                j    t          |          \  }}}}||z  }t          j        ||||f          }|S )Nr}   )r   r4   r   )r,   rY   r   r   r   rG   r   s          r$   r>   z'TFCvtSelfAttentionLinearProjection.call   sC    2<\2J2J/
FE<unz,z;P\6]^^^r#   NrY   r0   rU   r0   )r   r   r   r    r>   r"   r#   r$   r   r      s.        AA     r#   r   c                  :     e Zd ZdZ	 dd fdZdddZddZ xZS )TFCvtSelfAttentionProjectionz'Convolutional Projection for Attention.dw_bnrD   r   rH   rF   rl   rI   rJ   projection_methodstrc                     t                      j        di | |dk    rt          |||||d          | _        t	                      | _        d S )Nr   convolution_projectionrN   r"   )r*   r+   r   r   r   linear_projection)	r,   rD   rH   rl   rI   rJ   r   r-   r.   s	           r$   r+   z%TFCvtSelfAttentionProjection.__init__   si     	""6"""''*J	;F^+ + +D' "D!E!Er#   FrY   r0   r;   rT   rU   c                ^    |                      ||          }|                     |          }|S rW   )r   r   r   s      r$   r>   z!TFCvtSelfAttentionProjection.call  s3    22<(2SS--l;;r#   Nc                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr   )r[   r\   r4   r]   r   rN   r^   r_   s     r$   r^   z"TFCvtSelfAttentionProjection.build  ra   rb   )r   )rD   r   rH   rF   rl   rF   rI   rF   rJ   rF   r   r   rc   r   r?   rd   rA   s   @r$   r   r      s        11 ")F F F F F F F"    
8 8 8 8 8 8 8 8r#   r   c                  B     e Zd ZdZ	 d d! fdZd"dZd#d$dZd%dZ xZS )&TFCvtSelfAttentionz
    Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for
    query, key, and value embeddings.
    TrD   r   	num_headsrF   rH   rl   stride_q	stride_kv	padding_q
padding_kvqkv_projection_methodr   qkv_biasrT   attention_drop_rater(   with_cls_tokenc           	         t                      j        di | |dz  | _        || _        || _        || _        t          ||||||	dk    rdn|	d          | _        t          ||||||	d          | _        t          ||||||	d          | _	        t          j                            |t          |j                  |
dd	
          | _        t          j                            |t          |j                  |
dd
          | _        t          j                            |t          |j                  |
dd
          | _        t          j                            |          | _        d S )Ng      avglinearconvolution_projection_query)r   rN   convolution_projection_keyconvolution_projection_valuezerosprojection_queryunitsro   r   bias_initializerrN   projection_keyprojection_valuer"   )r*   r+   scaler   rH   r   r   r   r   r   r   rP   Denser
   rz   r   r   r   rQ   rR   )r,   rD   r   rH   rl   r   r   r   r   r   r   r   r   r-   r.   s                 r$   r+   zTFCvtSelfAttention.__init__  s     	""6"""_
,"",H*?5*H*HhhNc/-
 -
 -
) +G3-+
 +
 +
' -I3/-
 -
 -
) !& 2 2.v/GHH$# !3 !
 !
 $l00.v/GHH$! 1 
 
 !& 2 2.v/GHH$# !3 !
 !
 |++,?@@r#   rY   r0   rU   c                    t          |          \  }}}| j        | j        z  }t          j        |||| j        |f          }t          j        |d          }|S )Nr}   r      r   r   perm)r   rH   r   r4   r   	transpose)r,   rY   r   r   _head_dims         r$   "rearrange_for_multi_head_attentionz5TFCvtSelfAttention.rearrange_for_multi_head_attention_  s`    %/%=%="
K>T^3z,z;PTP^`h6ijjj|L|DDDr#   Fr   r   r;   c                ,   | j         rt          j        |d||z  gd          \  }}t          |          \  }}}t          j        |||||f          }|                     ||          }	|                     ||          }
|                     ||          }| j         rHt          j        ||
fd          }
t          j        ||	fd          }	t          j        ||fd          }| j	        | j
        z  }|                     |                     |
                    }
|                     |                     |	                    }	|                     |                     |                    }t          j        |
|	d          | j        z  }t#          |d          }|                     ||          }t          j        ||          }t          |          \  }}}}t          j        |d	
          }t          j        |||| j
        |z  f          }|S )Nr   r}   rX   axisT)transpose_b)logitsr   r   r   )r   r4   splitr   r   r   r   r   concatrH   r   r   r   r   r   matmulr   r   rR   r   )r,   rY   r   r   r;   	cls_tokenr   r   rG   keyqueryvaluer   attention_scoreattention_probscontextr   s                    r$   r>   zTFCvtSelfAttention.callf  s    	U&(h|a%=PRS&T&T#I| 1;<0H0H-
Kz,z65R^6_```--lX-NN11,1RR11,1RR 	:Iy%0q999E)Y,1555CIy%0q999E>T^3778M8Me8T8TUU55d6I6I#6N6NOO778M8Me8T8TUU)E3DAAADJN(bIII,,,JJ)OU33)'221k1,w\:::*Wz;QY@Y&Z[[r#   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Xt          j        | j	        j                  5  | j	                            d d | j
        g           d d d            n# 1 swxY w Y   t          | dd           Xt          j        | j        j                  5  | j                            d d | j
        g           d d d            n# 1 swxY w Y   t          | dd           [t          j        | j        j                  5  | j                            d d | j
        g           d d d            d S # 1 swxY w Y   d S d S )NTr   r   r   r   r   r   )r[   r\   r4   r]   r   rN   r^   r   r   r   rH   r   r   r_   s     r$   r^   zTFCvtSelfAttention.build  s   : 	F
47>>Jt@EFF > >177===> > > > > > > > > > > > > > >45t<<Ht>CDD < </55d;;;< < < < < < < < < < < < < < <47>>Jt@EFF > >177===> > > > > > > > > > > > > > >4+T22>t49:: J J%++T4,HIIIJ J J J J J J J J J J J J J J4)400<t2788 H H#))4t~*FGGGH H H H H H H H H H H H H H H4+T22>t49:: J J%++T4,HIIIJ J J J J J J J J J J J J J J J J J ?>sl    A''A+.A+!CCCD))D-0D-##FFF#G;;G?G?5#I%%I),I)T)rD   r   r   rF   rH   rF   rl   rF   r   rF   r   rF   r   rF   r   rF   r   r   r   rT   r   r(   r   rT   r   rc   
rY   r0   r   rF   r   rF   r;   rT   rU   r0   r?   )	r   r   r   r    r+   r   r>   r^   r@   rA   s   @r$   r   r     s         $  $GA GA GA GA GA GA GAR            DJ J J J J J J Jr#   r   c                  6     e Zd ZdZd fdZdddZddZ xZS )TFCvtSelfOutputzOutput of the Attention layer .rD   r   rH   rF   	drop_rater(   c                    t                      j        di | t          j                            |t          |j                  d          | _        t          j                            |          | _	        || _
        d S Ndense)r   ro   rN   r"   )r*   r+   r   rP   r   r
   rz   r   rQ   rR   rH   )r,   rD   rH   r   r-   r.   s        r$   r+   zTFCvtSelfOutput.__init__  su    ""6"""\''@X0Y0Y`g ( 
 

 |++I66"r#   FrY   r0   r;   rT   rU   c                `    |                      |          }|                     ||          }|S N)inputs)r   r;   r   rR   r   s      r$   r>   zTFCvtSelfOutput.call  s0    zzz66||<(|KKr#   Nc                    | j         rd S d| _         t          | dd           [t          j        | j        j                  5  | j                            d d | j        g           d d d            d S # 1 swxY w Y   d S d S NTr   r[   r\   r4   r]   r   rN   r^   rH   r_   s     r$   r^   zTFCvtSelfOutput.build      : 	F
4$''3tz// ? ?
  $dn!=>>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? 43    #A00A47A4)rD   r   rH   rF   r   r(   rc   r   r?   rd   rA   s   @r$   r   r     sp        ))# # # # # #    
? ? ? ? ? ? ? ?r#   r   c                  @     e Zd ZdZ	 d d! fdZd Zd"d#dZd$dZ xZS )%TFCvtAttentionzDAttention layer. First chunk of the convolutional transformer block.TrD   r   r   rF   rH   rl   r   r   r   r   r   r   r   rT   r   r(   r   r   c                     t                      j        di | t          |||||||||	|
||d          | _        t	          |||d          | _        d S )N	attentionr   outputr"   )r*   r+   r   r   r   dense_output)r,   rD   r   rH   rl   r   r   r   r   r   r   r   r   r   r-   r.   s                  r$   r+   zTFCvtAttention.__init__  s    " 	""6"""+!
 
 
 ,FIyxXXXr#   c                    t           r?   )NotImplementedError)r,   headss     r$   prune_headszTFCvtAttention.prune_heads  s    !!r#   FrY   r0   r   r   r;   c                f    |                      ||||          }|                     ||          }|S rW   )r   r   )r,   rY   r   r   r;   self_outputattention_outputs          r$   r>   zTFCvtAttention.call  s:    nn\658nTT,,[8,LLr#   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr   r   )r[   r\   r4   r]   r   rN   r^   r   r_   s     r$   r^   zTFCvtAttention.build  sU   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4..:t0566 . .!''---. . . . . . . . . . . . . . . . . . ;:s$    A''A+.A+!C		CCr   )rD   r   r   rF   rH   rF   rl   rF   r   rF   r   rF   r   rF   r   rF   r   r   r   rT   r   r(   r   r(   r   rT   rc   )rY   r0   r   rF   r   rF   r;   rT   r?   )	r   r   r   r    r+   r   r>   r^   r@   rA   s   @r$   r   r     s        NN   $!Y !Y !Y !Y !Y !Y !YF" " "         
	. 	. 	. 	. 	. 	. 	. 	.r#   r   c                  4     e Zd ZdZd fdZddZddZ xZS )TFCvtIntermediatezNIntermediate dense layer. Second chunk of the convolutional transformer block.rD   r   rH   rF   	mlp_ratioc                     t                      j        di | t          j                            t          ||z            t          |j                  dd          | _        || _	        d S )Ngelur   )r   ro   
activationrN   r"   )
r*   r+   r   rP   r   rF   r
   rz   r   rH   )r,   rD   rH   r   r-   r.   s        r$   r+   zTFCvtIntermediate.__init__  so    ""6"""\''i)+,,.v/GHH	 ( 
 

 #r#   rY   r0   rU   c                0    |                      |          }|S r?   )r   )r,   rY   s     r$   r>   zTFCvtIntermediate.call   s    zz,//r#   Nc                    | j         rd S d| _         t          | dd           [t          j        | j        j                  5  | j                            d d | j        g           d d d            d S # 1 swxY w Y   d S d S r   r   r_   s     r$   r^   zTFCvtIntermediate.build  r   r   )rD   r   rH   rF   r   rF   r   r?   rd   rA   s   @r$   r   r     sk        XX# # # # # #   ? ? ? ? ? ? ? ?r#   r   c                  6     e Zd ZdZd fdZdddZddZ xZS )TFCvtOutputzu
    Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
    rD   r   rH   rF   r   r   c                    t                      j        di | t          j                            |t          |j                  d          | _        t          j                            |          | _	        || _
        || _        d S r   )r*   r+   r   rP   r   r
   rz   r   rQ   rR   rH   r   )r,   rD   rH   r   r   r-   r.   s         r$   r+   zTFCvtOutput.__init__  s|    ""6"""\''@X0Y0Y`g ( 
 

 |++I66""r#   FrY   r0   input_tensorr;   rT   rU   c                j    |                      |          }|                     ||          }||z   }|S r   r   )r,   rY   r   r;   s       r$   r>   zTFCvtOutput.call  s:    zzz66||<(|KK#l2r#   Nc           	     (   | j         rd S d| _         t          | dd           pt          j        | j        j                  5  | j                            d d t          | j        | j	        z            g           d d d            d S # 1 swxY w Y   d S d S r   )
r[   r\   r4   r]   r   rN   r^   rF   rH   r   r_   s     r$   r^   zTFCvtOutput.build!  s    : 	F
4$''3tz// U U
  $c$.4>2Q.R.R!STTTU U U U U U U U U U U U U U U U U U 43s    8BB	B	)rD   r   rH   rF   r   rF   r   rF   rc   )rY   r0   r   r0   r;   rT   rU   r0   r?   rd   rA   s   @r$   r   r     s|         # # # # # #    U U U U U U U Ur#   r   c                  :     e Zd ZdZ	 d"d# fdZd$d%dZd&d!Z xZS )'
TFCvtLayera&  
    Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It
    consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the
    `Block` class in the original implementation.
    TrD   r   r   rF   rH   rl   r   r   r   r   r   r   r   rT   r   r(   r   r   drop_path_rater   c                     t                      j        di | t          |||||||||	|
|||d          | _        t	          |||d          | _        t          ||||d          | _        |dk    rt          |d          n t          j
                            dd          | _        t          j
                            dd	
          | _        t          j
                            dd
          | _        || _        d S )Nr   r   intermediater   r2   	drop_pathr   rp   layernorm_beforerr   layernorm_afterr"   )r*   r+   r   r   r   r  r   r   r&   r   rP   
Activationr  r{   r  r  rH   )r,   rD   r   rH   rl   r   r   r   r   r   r   r   r   r   r  r   r-   r.   s                    r$   r+   zTFCvtLayer.__init__1  s$   & 	""6"""'!
 
 
  .fiQ_```'	9iV^___ ## .{;;;;(((DD 	 !& ? ?Se ? f f$|>>tRc>dd"r#   FrY   r0   r   r   r;   rU   c                D   |                      |                     |          |||          }|                     ||          }||z   }|                     |          }|                     |          }|                     ||          }|                     ||          }|S rW   )r   r  r  r  r  r   )r,   rY   r   r   r;   r   layer_outputs          r$   r>   zTFCvtLayer.callb  s    >>$*?*?*M*MvW\go>pp>>*:X>NN (,6 ++L99((66 ((|DD~~lX~FFr#   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j	        j                  5  | j	                            d            d d d            n# 1 swxY w Y   t          | dd           Xt          j        | j
        j                  5  | j
                            d d | j        g           d d d            n# 1 swxY w Y   t          | dd           [t          j        | j        j                  5  | j                            d d | j        g           d d d            d S # 1 swxY w Y   d S d S )NTr   r  r   r  r  r  )r[   r\   r4   r]   r   rN   r^   r  r   r  r  rH   r  r_   s     r$   r^   zTFCvtLayer.builds  s   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4..:t0566 . .!''---. . . . . . . . . . . . . . .4..:t0566 . .!''---. . . . . . . . . . . . . . .4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4+T22>t49:: J J%++T4,HIIIJ J J J J J J J J J J J J J J4*D11=t3899 I I$**D$+GHHHI I I I I I I I I I I I I I I I I I >=sl    A''A+.A+!CCCD))D-0D-#F

FF#G33G7:G7-#II!$I!r   )rD   r   r   rF   rH   rF   rl   rF   r   rF   r   rF   r   rF   r   rF   r   r   r   rT   r   r(   r   r(   r   r(   r  r(   r   rT   rc   r   r?   rd   rA   s   @r$   r  r  *  s         ,  $!/# /# /# /# /# /# /#b    "I I I I I I I Ir#   r  c                  6     e Zd ZdZd fdZdddZddZ xZS )
TFCvtStageaK  
    Cvt stage (encoder block). Each stage has 2 parts :
    - (1) A Convolutional Token Embedding layer
    - (2) A Convolutional Transformer Block (layer).
    The classification token is added only in the last stage.

    Args:
        config ([`CvtConfig`]): Model configuration class.
        stage (`int`): Stage number.
    rD   r   stagerF   c           
     $     t                      j        di |  _        | _         j        j         j                 rH                     dd j        j        d         ft           j        j                  dd           _        t           j        j
         j                  j        dk    rj        nj         j        dz
           j         j                 j         j                 j         j                 j         j                 d           _        t!          j        d	j         j                 j        |                   d
 D              fdt)          j         j                           D              _        d S )Nr   r   Tzcvt.encoder.stages.2.cls_token)r5   initializer	trainablerN   r   	embedding)rE   rG   rI   rH   rJ   rK   rN   r2   c                Z    g | ](}|                                                                 )S r"   )numpyitem).0r/   s     r$   
<listcomp>z'TFCvtStage.__init__.<locals>.<listcomp>  s*    EEE17799>>++EEEr#   c                   g | ]}t          fj        j                 j        j                 j        j                 j        j                 j        j                 j        j                 j        j                 j	        j                 j
        j                 j        j                 j        j                 j        j                 j                 j        j                 d | dS )zlayers.)r   rH   rl   r   r   r   r   r   r   r   r   r   r  r   rN   )r  r   r  rH   
kernel_qkvr   r   r   r   r   r   r   r   r   r   )r  jrD   drop_path_ratesr,   s     r$   r  z'TFCvtStage.__init__.<locals>.<listcomp>  s     
 
 
& %  *4:6 *4:6"-dj94 *4:6 *4:6!,TZ8&,&B4:&N4$*$>tz$J *4:6 *4:6.tz:%/
;"q]]!  
 
 
r#   r"   )r*   r+   rD   r  r   
add_weightrH   r
   rz   rC   patch_sizesrG   patch_stridepatch_paddingr   r  r4   linspacer  depthrangerP   )r,   rD   r  r-   r  r.   s   ``  @r$   r+   zTFCvtStage.__init__  s   ""6"""
; , 	!__!T[2267+DK,IJJ5	 -  DN )K)$*504
a,,VEUVZV`cdVdEe&tz2&tz2(4)$*5	
 	
 	
 +c6+@+Lfl[`NabbEE_EEE
 
 
 
 
 
& 6<
344'
 
 
r#   FrY   r0   r;   rT   c                   d }|                      ||          }t          |          \  }}}}||z  }t          j        ||||f          }| j        j        | j                 r4t          j        | j        |d          }t          j        ||fd          }| j	        D ]}	 |	||||          }
|
}| j        j        | j                 rt          j
        |d||z  gd          \  }}t          j        |||||f          }||fS )Nr}   r   )repeatsr   r   r   rX   )r  r   r4   r   rD   r   r  repeatr   rP   r   )r,   rY   r;   r   r   r   r   rG   r   layerlayer_outputss              r$   r>   zTFCvtStage.call  s/   	~~lH== 3=\2J2J/
FE<unz,z;P\6]^^^; , 	H	$.*1MMMI9i%>QGGGL[ 	) 	)E!E,QQQM(LL; , 	U&(h|a%=PRS&T&T#I| z,z65R^6_```Y&&r#   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           P| j        D ]J}t          j        |j                  5  |                    d            d d d            n# 1 swxY w Y   Id S d S )NTr  rP   )r[   r\   r4   r]   r  rN   r^   rP   r,   r`   r'  s      r$   r^   zTFCvtStage.build  sY   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +44((4 & &]5:.. & &KK%%%& & & & & & & & & & & & & & & 54& &s$    A''A+.A+%CC	C	)rD   r   r  rF   rc   )rY   r0   r;   rT   r?   rd   rA   s   @r$   r  r    su        	 	-
 -
 -
 -
 -
 -
^' ' ' ' '0
& 
& 
& 
& 
& 
& 
& 
&r#   r  c                  @     e Zd ZdZeZd fdZ	 	 	 dddZddZ xZ	S )TFCvtEncoderz
    Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
    (depth) being 1, 2 and 10.

    Args:
        config ([`CvtConfig`]): Model configuration class.
    rD   r   c                     t                      j        di | | _        fdt          t	          j                            D             | _        d S )Nc                :    g | ]}t          |d |           S )zstages.r   )r  )r  	stage_idxrD   s     r$   r  z)TFCvtEncoder.__init__.<locals>.<listcomp>  s>     
 
 
JSJvy/D/D/DEEE
 
 
r#   r"   )r*   r+   rD   r#  r6   r"  stagesr,   rD   r-   r.   s    ` r$   r+   zTFCvtEncoder.__init__  sk    ""6"""
 
 
 
W\]`agam]n]nWoWo
 
 
r#   FTrS   r   output_hidden_statesbool | Nonereturn_dictr;   rU   0TFBaseModelOutputWithCLSToken | tuple[tf.Tensor]c                h   |rdnd }|}t          j        |d          }d }t          | j                  D ]\  }}	 |	||          \  }}|r||fz   }t          j        |d          }|rt	          d |D                       }|st	          d |||fD                       S t          |||          S )	Nr"   )r   r   r   r   r   rX   r   r   r   r   c              3  B   K   | ]}t          j        |d           V  dS )r7  r   N)r4   r   )r  hss     r$   	<genexpr>z$TFCvtEncoder.call.<locals>.<genexpr>  s1      %f%fbbl2L&I&I&I%f%f%f%f%f%fr#   c              3     K   | ]}||V  	d S r?   r"   )r  vs     r$   r:  z$TFCvtEncoder.call.<locals>.<genexpr>  s(      bbqTUTaTaTaTaTabbr#   r   r   r   )r4   r   	enumerater0  tupler   )
r,   rS   r2  r4  r;   all_hidden_statesrY   r   r   stage_modules
             r$   r>   zTFCvtEncoder.call  s    #7@BBD# |L|DDD	!*4;!7!7 	H 	HA&2l<(&S&S&S#L)# H$5$G! |L|DDD 	g %%f%fTe%f%f%f f f 	cbb\9>O$Pbbbbbb,*%+
 
 
 	
r#   Nc                    | j         rd S d| _         t          | dd           P| j        D ]J}t          j        |j                  5  |                    d            d d d            n# 1 swxY w Y   Id S d S )NTr0  )r[   r\   r0  r4   r]   rN   r^   r*  s      r$   r^   zTFCvtEncoder.build  s    : 	F
44((4 & &]5:.. & &KK%%%& & & & & & & & & & & & & & & 54& &s   A&&A*	-A*	rD   r   )FTF)
rS   r   r2  r3  r4  r3  r;   r3  rU   r5  r?   )
r   r   r   r    r   config_classr+   r>   r^   r@   rA   s   @r$   r,  r,    s          L
 
 
 
 
 
 -2#' %
 
 
 
 
B& & & & & & & &r#   r,  c                  R     e Zd ZdZeZd fdZe	 	 	 	 ddd            ZddZ	 xZ
S )TFCvtMainLayerzConstruct the Cvt model.rD   r   c                t     t                      j        di | || _        t          |d          | _        d S )Nencoderr   r"   )r*   r+   rD   r,  rH  r1  s      r$   r+   zTFCvtMainLayer.__init__.  s?    ""6"""#F;;;r#   NFrS   TFModelInputType | Noner2  r3  r4  r;   rU   r5  c                    |t          d          |                     ||||          }|d         }|s|f|dd          z   S t          ||j        |j                  S )N You have to specify pixel_valuesr2  r4  r;   r   r   r=  )
ValueErrorrH  r   r   r   )r,   rS   r2  r4  r;   encoder_outputssequence_outputs          r$   r>   zTFCvtMainLayer.call3  s     ?@@@,,!5#	 ' 
 
 *!, 	<#%(;;;,-+;)7
 
 
 	
r#   c                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTrH  )r[   r\   r4   r]   rH  rN   r^   r_   s     r$   r^   zTFCvtMainLayer.buildP  s    : 	F
4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) 65rb   rC  NNNF)
rS   rI  r2  r3  r4  r3  r;   r3  rU   r5  r?   )r   r   r   r    r   rD  r+   r   r>   r^   r@   rA   s   @r$   rF  rF  (  s        ""L< < < < < <
  15,0#' %
 
 
 
 ]
8) ) ) ) ) ) ) )r#   rF  c                      e Zd ZdZeZdZdZdS )TFCvtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    cvtrS   N)r   r   r   r    r   rD  base_model_prefixmain_input_namer"   r#   r$   rS  rS  Y  s*         
 L$OOOr#   rS  a  

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TF 2.0 models accepts two formats as inputs:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional arguments.

    This second option is useful when using [`keras.Model.fit`] method which currently requires having all the
    tensors in the first argument of the model call function: `model(inputs)`.

    </Tip>

    Args:
        config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
al  
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False``):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
z]The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zd fdZe ee           eee	          	 	 	 	 ddd                                    Z
ddZ xZS )
TFCvtModelrD   r   c                n     t                      j        |g|R i | t          |d          | _        d S )NrT  r   )r*   r+   rF  rT  r,   rD   r   r-   r.   s       r$   r+   zTFCvtModel.__init__  sB    3&333F333!&u555r#   output_typerD  NFrS   r   r2  r3  r4  r;   rU   r5  c                    |t          d          |                     ||||          }|s|d         f|dd         z   S t          |j        |j        |j                  S )a  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFCvtModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
        >>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NrK  )rS   r2  r4  r;   r   r   r=  )rM  rT  r   r   r   r   )r,   rS   r2  r4  r;   outputss         r$   r>   zTFCvtModel.call  s    > ?@@@((%!5#	  
 
  	/AJ=7122;..,%7#3!/
 
 
 	
r#   c                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTrT  )r[   r\   r4   r]   rT  rN   r^   r_   s     r$   r^   zTFCvtModel.build  s    : 	F
4%%1tx}-- % %t$$$% % % % % % % % % % % % % % % % % % 21rb   rC  rQ  )
rS   r   r2  r3  r4  r3  r;   r3  rU   r5  r?   )r   r   r   r+   r   r   TFCVT_INPUTS_DOCSTRINGr   r   _CONFIG_FOR_DOCr>   r^   r@   rA   s   @r$   rX  rX    s        
6 6 6 6 6 6
 **+ABB+HWfggg *.,0#' %-
 -
 -
 -
 hg CB ]-
^% % % % % % % %r#   rX  z
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                       e Zd Zd fdZe ee           eee	          	 	 	 	 	 ddd                                    Z
ddZ xZS )TFCvtForImageClassificationrD   r   c                `    t                      j        |g|R i | |j        | _        t          |d          | _        t
          j                            dd          | _        t
          j        	                    |j        t          |j                  ddd	          | _        || _        d S )
NrT  r   rp   	layernormrr   Tr   
classifierr   )r*   r+   
num_labelsrF  rT  r   rP   r{   re  r   r
   rz   rf  rD   rZ  s       r$   r+   z$TFCvtForImageClassification.__init__  s    3&333F333 +!&u55588K8XX  ,,,#.v/GHH$ - 
 
 r#   r[  NFrS   r   labelsr2  r3  r4  r;   rU   9TFImageClassifierOutputWithNoAttention | tuple[tf.Tensor]c                H   |                      ||||          }|d         }|d         }| j        j        d         r|                     |          }n[t	          |          \  }	}
}}t          j        ||	|
||z  f          }t          j        |d          }|                     |          }t          j        |d          }| 	                    |          }|d	n| 
                    ||
          }|s|f|dd	         z   }||f|z   n|S t          |||j                  S )a+  
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFCvtForImageClassification
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
        >>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
        ```rL  r   r   r   r}   )r   r   r   r   r   N)rh  r   r   )lossr   r   )rT  rD   r   re  r   r4   r   r   reduce_meanrf  hf_compute_lossr   r   )r,   rS   rh  r2  r4  r;   r^  rO  r   r   rG   r   r   sequence_output_meanr   rk  r   s                    r$   r>   z TFCvtForImageClassification.call  sW   R ((!5#	  
 
 "!*AJ	; $ 	>"nnY77OO 7A6Q6Q3Jfe j\[adi[i@jkkkO l?KKKO"nn_==O!~oAFFF!566~tt4+?+?vV\+?+]+] 	FY,F)-)9TGf$$vE54^e^sttttr#   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           ct          j        | j        j                  5  | j                            d d | j        j	        d         g           d d d            n# 1 swxY w Y   t          | dd           {t          | j        d          rht          j        | j        j                  5  | j                            d d | j        j	        d         g           d d d            d S # 1 swxY w Y   d S d S d S )NTrT  re  r   rf  rN   )r[   r\   r4   r]   rT  rN   r^   re  rD   rH   hasattrrf  r_   s     r$   r^   z!TFCvtForImageClassification.build7  sJ   : 	F
4%%1tx}-- % %t$$$% % % % % % % % % % % % % % %4d++7t~233 N N$$dD$+2G2K%LMMMN N N N N N N N N N N N N N N4t,,8t// S]4?#788 S SO))4t{7LR7P*QRRRS S S S S S S S S S S S S S S S S S 98S Ss6    A''A+.A+!.CC"C*.E%%E),E)rC  )NNNNF)rS   r   rh  r   r2  r3  r4  r3  r;   r3  rU   ri  r?   )r   r   r   r+   r   r   r`  r   r   ra  r>   r^   r@   rA   s   @r$   rc  rc    s             $ **+ABB+Q`oppp *.#',0#' %@u @u @u @u qp CB ]@uDS S S S S S S Sr#   rc  )rc  rX  rS  ):r    
__future__r   collections.abcrv   dataclassesr   
tensorflowr4   modeling_tf_outputsr   modeling_tf_utilsr   r   r	   r
   r   r   r   tf_utilsr   r   utilsr   r   r   r   r   configuration_cvtr   
get_loggerr   loggerra  r   rP   Layerr&   rC   rO   r   r   r   r   r   r   r   r   r  r  r,  rF  rS  TFCVT_START_DOCSTRINGr`  rX  rc  __all__r"   r#   r$   <module>r     s     " " " " " "     ! ! ! ! ! !     I I I I I I                  3 2 2 2 2 2 2 2              ) ( ( ( ( ( 
	H	%	%  7 7 7 7 7K 7 7 7(/ / / / /EL& / / /(%8 %8 %8 %8 %8el( %8 %8 %8P7G 7G 7G 7G 7G%,, 7G 7G 7Gt"M "M "M "M "Mu|'9 "M "M "MJ    );   8 8 8 8 85<#5 8 8 8DMJ MJ MJ MJ MJ+ MJ MJ MJ`? ? ? ? ?el( ? ? ?27. 7. 7. 7. 7.U\' 7. 7. 7.t? ? ? ? ?* ? ? ?4U U U U U%,$ U U U:^I ^I ^I ^I ^I# ^I ^I ^IB]& ]& ]& ]& ]&# ]& ]& ]&@:& :& :& :& :&5<% :& :& :&z -) -) -) -) -)U\' -) -) -)`% % % % %, % % % 8 & c >% >% >% >% >%% >% >%	 >%B   eS eS eS eS eS"68T eS eS eSP P
O
Or#   