
     `i                       d Z ddlmZ ddlZddlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZ ddlmZmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZ  ej        e          Z dZ!dZ"g dZ#dZ$dZ%dGdHdZ& G d dej'        j(                  Z) G d dej'        j(                  Z* G d dej'        j(                  Z+ G d d ej'        j(                  Z, G d! d"ej'        j(                  Z- G d# d$ej'        j(                  Z. G d% d&ej'        j(                  Z/ G d' d(ej'        j(                  Z0 G d) d*ej'        j(                  Z1 G d+ d,ej'        j(                  Z2 G d- d.ej'        j(                  Z3 G d/ d0ej'        j(                  Z4e G d1 d2ej'        j(                              Z5 G d3 d4e          Z6d5Z7d6Z8 e	d7e7           G d8 d9e6                      Z9 e	d:e7           G d; d<e6e                      Z: G d= d>ej'        j(                  Z; G d? d@ej'        j(                  Z< G dA dBej'        j(                  Z= e	dCe7           G dD dEe6                      Z>g dFZ?dS )IzTensorFlow 2.0 MobileViT model.    )annotationsN   )get_tf_activation)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardreplace_return_docstrings)TFBaseModelOutputTFBaseModelOutputWithPooling&TFImageClassifierOutputWithNoAttention(TFSemanticSegmenterOutputWithNoAttention)TFPreTrainedModelTFSequenceClassificationLosskeraskeras_serializableunpack_inputs)
shape_liststable_softmax)logging   )MobileViTConfigr   zapple/mobilevit-small)r   i     r   ztabby, tabby catr   valueintdivisor	min_value
int | Nonereturnc                    ||}t          |t          | |dz  z             |z  |z            }|d| z  k     r||z  }t          |          S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    N   g?)maxr   )r   r   r   	new_values       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mobilevit/modeling_tf_mobilevit.pymake_divisibler$   >   s^     	Is57Q;#6777BWLMMI3;W	y>>    c                  @     e Zd Z	 	 	 	 	 	 dd fdZdddZddZ xZS ) TFMobileViTConvLayerr   FTconfigr   in_channelsr   out_channelskernel_sizestridegroupsbiasbooldilationuse_normalizationuse_activation
bool | strr   Nonec           
         t                      j        di | t                              d| j        j         d           t          |dz
  dz            |z  }t          j        	                    |          | _
        ||z  dk    rt          d| d| d          t          j                            |||d	|||d
          | _        |	r(t          j                            ddd          | _        nd | _        |
rkt!          |
t"                    rt%          |
          | _        nHt!          |j        t"                    rt%          |j                  | _        n|j        | _        nd | _        || _        || _        d S )N
z has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tune this model, you need a GPU or a TPUr   r    r   zOutput channels (z) are not divisible by z groups.VALIDconvolution)filtersr+   stridespaddingdilation_rater-   use_biasnamegh㈵>g?normalization)epsilonmomentumr>    )super__init__loggerwarning	__class____name__r   r   layersZeroPadding2Dr;   
ValueErrorConv2Dr8   BatchNormalizationr?   
isinstancestrr   
activation
hidden_actr)   r*   )selfr(   r)   r*   r+   r,   r-   r.   r0   r1   r2   kwargsr;   rG   s                r#   rD   zTFMobileViTConvLayer.__init__N   s    	""6"""E( E E E	
 	
 	

 {Q!+,,x7|11'::& A%%fffV\fffggg <.. #" / 	
 	
  	&!&!@!@X[bq!@!r!rD!%D 	#.#.. 4"3N"C"CF-s33 4"3F4E"F"F"("3"DO&(r%   features	tf.Tensortrainingc                    |                      |          }|                     |          }| j        |                     ||          }| j        |                     |          }|S NrV   )r;   r8   r?   rP   )rR   rT   rV   padded_featuress       r#   callzTFMobileViTConvLayer.call   se    ,,x00##O44)))(X)FFH?&x00Hr%   Nc                   | j         rd S d| _         t          | dd           Yt          j        | j        j                  5  | j                            d d d | j        g           d d d            n# 1 swxY w Y   t          | dd           qt          | j	        d          r^t          j        | j	        j                  5  | j	                            d d d | j
        g           d d d            d S # 1 swxY w Y   d S d S d S )NTr8   r?   r>   )builtgetattrtf
name_scoper8   r>   buildr)   hasattrr?   r*   rR   input_shapes     r#   ra   zTFMobileViTConvLayer.build   s   : 	F
4--9t/455 M M &&dD$:J'KLLLM M M M M M M M M M M M M M M4$//;t)622 T]4#5#:;; T T&,,dD$@Q-RSSST T T T T T T T T T T T T T T T T T <;T Ts$    $A00A47A4?$C00C47C4)r   r   Fr   TT)r(   r   r)   r   r*   r   r+   r   r,   r   r-   r   r.   r/   r0   r   r1   r/   r2   r3   r   r4   FrT   rU   rV   r/   r   rU   NrH   
__module____qualname__rD   r[   ra   __classcell__rG   s   @r#   r'   r'   M   s         "&%)4) 4) 4) 4) 4) 4) 4)l    
T 
T 
T 
T 
T 
T 
T 
Tr%   r'   c                  :     e Zd ZdZ	 dd fdZdddZddZ xZS )TFMobileViTInvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r   r(   r   r)   r   r*   r,   r0   r   r4   c           
         t                      j        di | t          t          t	          ||j        z                      d          }|dvrt          d| d          |dk    o||k    | _        t          |||dd          | _	        t          |||d|||d	
          | _
        t          |||ddd          | _        d S )Nr   )r   r    zInvalid stride .r   
expand_1x1r)   r*   r+   r>   r   conv_3x3)r)   r*   r+   r,   r-   r0   r>   F
reduce_1x1r)   r*   r+   r2   r>   rB   )rC   rD   r$   r   roundexpand_ratiorK   use_residualr'   rq   rs   rt   )	rR   r(   r)   r*   r,   r0   rS   expanded_channelsrG   s	           r#   rD   z$TFMobileViTInvertedResidual.__init__   s    	""6"""*3u[6CV5V/W/W+X+XZ[\\8v888999#q[K{l/J.:KYZam
 
 
 -)*$	
 	
 	
 /)% 
 
 
r%   FrT   rU   rV   r/   c                    |}|                      ||          }|                     ||          }|                     ||          }| j        r||z   n|S rX   )rq   rs   rt   rx   )rR   rT   rV   residuals       r#   r[   z TFMobileViTInvertedResidual.call   s^    ??8h???==H===??8h???&*&7Ex(""XEr%   Nc                r   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTrq   rs   rt   )	r]   r^   r_   r`   rq   r>   ra   rs   rt   rc   s     r#   ra   z!TFMobileViTInvertedResidual.build   s   : 	F
4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , ,4T**6t}122 * *##D)))* * * * * * * * * * * * * * *4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , , , , , 98s6    A''A+.A+!CCCD**D.1D.r   )r(   r   r)   r   r*   r   r,   r   r0   r   r   r4   re   rf   rg   rH   ri   rj   __doc__rD   r[   ra   rk   rl   s   @r#   rn   rn      s         
 jk!
 !
 !
 !
 !
 !
 !
FF F F F F, , , , , , , ,r%   rn   c                  8     e Zd Z	 	 dd fdZdddZddZ xZS )TFMobileViTMobileNetLayerr   r(   r   r)   r   r*   r,   
num_stagesr   r4   c           	          t                      j        di | g | _        t          |          D ]=}t	          ||||dk    r|ndd|           }| j                            |           |}>d S )Nr   r   layer.)r)   r*   r,   r>   rB   )rC   rD   rI   rangern   append)
rR   r(   r)   r*   r,   r   rS   ilayerrG   s
            r#   rD   z"TFMobileViTMobileNetLayer.__init__   s     	""6"""z"" 		' 		'A/')!"avvQ!a\\  E Ku%%%&KK		' 		'r%   FrT   rU   rV   r/   c                4    | j         D ]} |||          }|S rX   rI   )rR   rT   rV   layer_modules       r#   r[   zTFMobileViTMobileNetLayer.call   s0     K 	A 	AL#|Hx@@@HHr%   Nc                    | j         rd S d| _         t          | dd           P| j        D ]J}t          j        |j                  5  |                    d            d d d            n# 1 swxY w Y   Id S d S NTrI   r]   r^   rI   r_   r`   r>   ra   rR   rd   r   s      r#   ra   zTFMobileViTMobileNetLayer.build       : 	F
44((4 $ - -]<#455 - - &&t,,,- - - - - - - - - - - - - - - 54- -   A&&A*	-A*	)r   r   )r(   r   r)   r   r*   r   r,   r   r   r   r   r4   re   rf   rg   rh   rl   s   @r#   r   r      sw         ' ' ' ' ' ' '.    
- - - - - - - -r%   r   c                  :     e Zd Zd fdZdd
ZdddZddZ xZS )TFMobileViTSelfAttentionr(   r   hidden_sizer   r   r4   c                    t                      j        d
i | ||j        z  dk    rt          d| d|j         d          |j        | _        t	          ||j        z            | _        | j        | j        z  | _        t          j        | j        t          j	                  }t          j
                            |          | _        t          j                            | j        |j        d          | _        t          j                            | j        |j        d          | _        t          j                            | j        |j        d	          | _        t          j                            |j                  | _        || _        d S )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rp   dtypequery)r=   r>   keyr   rB   )rC   rD   num_attention_headsrK   r   attention_head_sizeall_head_sizer_   castfloat32mathsqrtscaler   rI   Denseqkv_biasr   r   r   Dropoutattention_probs_dropout_probdropoutr   )rR   r(   r   rS   r   rG   s        r#   rD   z!TFMobileViTSelfAttention.__init__  se   ""6"""33q887; 7 737 7 7  
 $*#= #&{V5O'O#P#P !58PP0
CCCW\\%((
\''(:V_[b'cc
<%%d&86?Y^%__\''(:V_[b'cc
|++F,OPP&r%   xrU   c                    t          j        |          d         }t          j        ||d| j        | j        f          }t          j        |g d          S )Nr   shaper   r    r   r   perm)r_   r   reshaper   r   	transpose)rR   r   
batch_sizes      r#   transpose_for_scoresz-TFMobileViTSelfAttention.transpose_for_scores  sO    Xa[[^
JqR1I4Kc deee|ALLL1111r%   Fhidden_statesrV   r/   c                R   t          j        |          d         }|                     |                     |                    }|                     |                     |                    }|                     |                     |                    }t          j        ||d          }|| j        z  }t          |d          }| 	                    ||          }t          j        ||          }	t          j
        |	g d          }	t          j        |	|d| j        f	          }	|	S )
Nr   T)transpose_br   axisrY   r   r   r   )r_   r   r   r   r   r   matmulr   r   r   r   r   r   )
rR   r   rV   r   	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layers
             r#   r[   zTFMobileViTSelfAttention.call  s   Xm,,Q/
--dhh}.E.EFF	//

=0I0IJJ//

=0I0IJJ 9[)NNN+dj8 ))9CCC ,,,JJ	/;??]FFF
=RI[8\]]]r%   Nc                   | j         rd S d| _         t          | dd           Xt          j        | j        j                  5  | j                            d d | j        g           d d d            n# 1 swxY w Y   t          | dd           Xt          j        | j        j                  5  | j                            d d | j        g           d d d            n# 1 swxY w Y   t          | dd           [t          j        | j	        j                  5  | j	                            d d | j        g           d d d            d S # 1 swxY w Y   d S d S )NTr   r   r   )
r]   r^   r_   r`   r   r>   ra   r   r   r   rc   s     r#   ra   zTFMobileViTSelfAttention.build5  s#   : 	F
4$''3tz// A A
  $d.>!?@@@A A A A A A A A A A A A A A A4%%1tx}-- ? ?dD,<=>>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ?4$''3tz// A A
  $d.>!?@@@A A A A A A A A A A A A A A A A A A 43s6    #A//A36A3)#CCC#EE	Er(   r   r   r   r   r4   )r   rU   r   rU   re   r   rU   rV   r/   r   rU   rg   )rH   ri   rj   rD   r   r[   ra   rk   rl   s   @r#   r   r     s        ' ' ' ' ' ',2 2 2 2
    0A A A A A A A Ar%   r   c                  2     e Zd Zd fdZdddZddZ xZS )TFMobileViTSelfOutputr(   r   r   r   r   r4   c                     t                      j        di | t          j                            |d          | _        t          j                            |j                  | _        || _	        d S Ndenser>   rB   )
rC   rD   r   rI   r   r   r   hidden_dropout_probr   r   rR   r(   r   rS   rG   s       r#   rD   zTFMobileViTSelfOutput.__init__E  sd    ""6"""\''''BB
|++F,FGG&r%   Fr   rU   rV   r/   c                ^    |                      |          }|                     ||          }|S rX   r   r   )rR   r   rV   s      r#   r[   zTFMobileViTSelfOutput.callK  s.    

=11]XFFr%   Nc                    | j         rd S d| _         t          | dd           [t          j        | j        j                  5  | j                            d d | j        g           d d d            d S # 1 swxY w Y   d S d S NTr   r]   r^   r_   r`   r   r>   ra   r   rc   s     r#   ra   zTFMobileViTSelfOutput.buildP      : 	F
4$''3tz// A A
  $d.>!?@@@A A A A A A A A A A A A A A A A A A 43    #A00A47A4r   re   r   rg   rh   rl   s   @r#   r   r   D  sr        ' ' ' ' ' '    
A A A A A A A Ar%   r   c                  8     e Zd Zd fdZd ZdddZddZ xZS )TFMobileViTAttentionr(   r   r   r   r   r4   c                     t                      j        di | t          ||d          | _        t	          ||d          | _        d S )N	attentionr   outputrB   )rC   rD   r   r   r   dense_outputr   s       r#   rD   zTFMobileViTAttention.__init__Z  sS    ""6"""1&+KXXX1&+HUUUr%   c                    t           rg   NotImplementedError)rR   headss     r#   prune_headsz TFMobileViTAttention.prune_heads_  s    !!r%   Fr   rU   rV   r/   c                b    |                      ||          }|                     ||          }|S rX   )r   r   )rR   r   rV   self_outputsattention_outputs        r#   r[   zTFMobileViTAttention.callb  s6    ~~mh~GG,,\H,MMr%   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr   r   )r]   r^   r_   r`   r   r>   ra   r   rc   s     r#   ra   zTFMobileViTAttention.buildg  sU   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4..:t0566 . .!''---. . . . . . . . . . . . . . . . . . ;:$    A''A+.A+!C		CCr   re   r   rg   )rH   ri   rj   rD   r   r[   ra   rk   rl   s   @r#   r   r   Y  s        V V V V V V
" " "         
	. 	. 	. 	. 	. 	. 	. 	.r%   r   c                  0     e Zd Zd fdZddZddZ xZS )TFMobileViTIntermediater(   r   r   r   intermediate_sizer   r4   c                    t                      j        di | t          j                            |d          | _        t          |j        t                    rt          |j                  | _
        n|j        | _
        || _        d S r   )rC   rD   r   rI   r   r   rN   rQ   rO   r   intermediate_act_fnr   rR   r(   r   r   rS   rG   s        r#   rD   z TFMobileViTIntermediate.__init__t  s    ""6"""\''(9'HH
f'-- 	9'89J'K'KD$$'-'8D$&r%   r   rU   c                Z    |                      |          }|                     |          }|S rg   )r   r   )rR   r   s     r#   r[   zTFMobileViTIntermediate.call}  s,    

=1100??r%   Nc                    | j         rd S d| _         t          | dd           [t          j        | j        j                  5  | j                            d d | j        g           d d d            d S # 1 swxY w Y   d S d S r   r   rc   s     r#   ra   zTFMobileViTIntermediate.build  r   r   r(   r   r   r   r   r   r   r4   )r   rU   r   rU   rg   rh   rl   s   @r#   r   r   s  sm        ' ' ' ' ' '   
A A A A A A A Ar%   r   c                  2     e Zd Zd fdZdddZddZ xZS )TFMobileViTOutputr(   r   r   r   r   r   r4   c                     t                      j        di | t          j                            |d          | _        t          j                            |j                  | _        || _	        d S r   )
rC   rD   r   rI   r   r   r   r   r   r   r   s        r#   rD   zTFMobileViTOutput.__init__  sd    ""6"""\''''BB
|++F,FGG!2r%   Fr   rU   input_tensorrV   r/   c                h    |                      |          }|                     ||          }||z   }|S rX   r   )rR   r   r   rV   s       r#   r[   zTFMobileViTOutput.call  s8    

=11]XFF%4r%   Nc                    | j         rd S d| _         t          | dd           [t          j        | j        j                  5  | j                            d d | j        g           d d d            d S # 1 swxY w Y   d S d S r   )r]   r^   r_   r`   r   r>   ra   r   rc   s     r#   ra   zTFMobileViTOutput.build  s    : 	F
4$''3tz// G G
  $d.D!EFFFG G G G G G G G G G G G G G G G G G 43r   r   re   )r   rU   r   rU   rV   r/   r   rU   rg   rh   rl   s   @r#   r   r     sr        3 3 3 3 3 3    G G G G G G G Gr%   r   c                  2     e Zd Zd fdZdddZddZ xZS )TFMobileViTTransformerLayerr(   r   r   r   r   r   r4   c                    t                      j        di | t          ||d          | _        t	          |||d          | _        t          |||d          | _        t          j	        
                    |j        d          | _        t          j	        
                    |j        d          | _        || _        d S )	Nr   r   intermediater   layernorm_beforer@   r>   layernorm_afterrB   )rC   rD   r   r   r   r   r   mobilevit_outputr   rI   LayerNormalizationlayer_norm_epsr   r   r   r   s        r#   rD   z$TFMobileViTTransformerLayer.__init__  s    ""6"""-fkTTT3FKIZaoppp 1&+GX_g h h h % ? ?H]dv ? w w$|>>vG\ct>uu&r%   Fr   rU   rV   r/   c                    |                      |                     |          |          }||z   }|                     |          }|                     |          }|                     |||          }|S rX   )r   r   r   r   r   )rR   r   rV   r   layer_outputs        r#   r[   z TFMobileViTTransformerLayer.call  sx    >>$*?*?*N*NYa>bb(=8++M::((66,,\=S[,\\r%   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Xt          j        | j	        j                  5  | j	                            d d | j
        g           d d d            n# 1 swxY w Y   t          | dd           [t          j        | j        j                  5  | j                            d d | j
        g           d d d            d S # 1 swxY w Y   d S d S )NTr   r   r   r   r   )r]   r^   r_   r`   r   r>   ra   r   r   r   r   r   rc   s     r#   ra   z!TFMobileViTTransformerLayer.build  sM   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4..:t0566 . .!''---. . . . . . . . . . . . . . .4+T22>t49:: 2 2%++D1112 2 2 2 2 2 2 2 2 2 2 2 2 2 24+T22>t49:: L L%++T49I,JKKKL L L L L L L L L L L L L L L4*D11=t3899 K K$**D$8H+IJJJK K K K K K K K K K K K K K K K K K >=sZ    A''A+.A+!CCCD))D-0D-##FFF#G<<H H r   re   r   rg   rh   rl   s   @r#   r   r     sr        ' ' ' ' ' '    K K K K K K K Kr%   r   c                  2     e Zd Zd fdZdddZddZ xZS )TFMobileViTTransformerr(   r   r   r   r   r   r4   c           	          t                      j        di | g | _        t          |          D ]G}t	          ||t          ||j        z            d|           }| j                            |           Hd S )Nr   )r   r   r>   rB   )rC   rD   rI   r   r   r   	mlp_ratior   )rR   r(   r   r   rS   r   transformer_layerrG   s          r#   rD   zTFMobileViTTransformer.__init__  s    ""6"""z"" 	2 	2A ;'"%kF4D&D"E"E!a\\	! ! ! K01111	2 	2r%   Fr   rU   rV   r/   c                4    | j         D ]} |||          }|S rX   r   )rR   r   rV   r   s       r#   r[   zTFMobileViTTransformer.call  s1     K 	K 	KL(LJJJMMr%   Nc                    | j         rd S d| _         t          | dd           P| j        D ]J}t          j        |j                  5  |                    d            d d d            n# 1 swxY w Y   Id S d S r   r   r   s      r#   ra   zTFMobileViTTransformer.build  r   r   )r(   r   r   r   r   r   r   r4   re   r   rg   rh   rl   s   @r#   r   r     sj        2 2 2 2 2 2    
- - - - - - - -r%   r   c                  J     e Zd ZdZ	 dd fdZddZd dZd!d"dZd#dZ xZ	S )$TFMobileViTLayerzC
    MobileViT block: https://huggingface.co/papers/2110.02178
    r   r(   r   r)   r   r*   r,   r   r   r0   r   r4   c           	     ^    t                      j        di | |j        | _        |j        | _        |dk    r0t          ||||dk    r|nd|dk    r|dz  ndd          | _        |}nd | _        t          ||||j        d          | _	        t          |||dddd	          | _
        t          |||d
          | _        t          j                            |j        d          | _        t          |||dd          | _        t          |d|z  ||j        d          | _        || _        d S )Nr    r   downsampling_layer)r)   r*   r,   r0   r>   conv_kxkrr   Fconv_1x1)r)   r*   r+   r1   r2   r>   transformer)r   r   r>   	layernormr   conv_projectionfusionrB   )rC   rD   
patch_sizepatch_widthpatch_heightrn   r  r'   conv_kernel_sizer  r	  r   r
  r   rI   r   r   r  r  r  r   )
rR   r(   r)   r*   r,   r   r   r0   rS   rG   s
            r#   rD   zTFMobileViTLayer.__init__  s    	""6"""!,"-Q;;&A')!)QvvA*2Q,,QA)' ' 'D# 'KK&*D#,#$/
 
 
 -#$# 
 
 
 2

 
 
 88AV]h8ii3+ST[l 
  
  
 +K$/
 
 
 'r%   rT   rU   tuple[tf.Tensor, dict]c                   | j         | j        }}t          j        ||z  d          }t          j        |          d         }t          j        |          d         }t          j        |          d         }t          j        |          d         }t          j        t          j                            ||z            |z  d          }	t          j        t          j                            ||z            |z  d          }
|
|k    p|	|k    }|r$t          j                            ||	|
fd          }|
|z  }|	|z  }||z  }t          j	        |g d          }t          j
        |||z  |z  |||f          }t          j	        |g d	          }t          j
        |||||f          }t          j	        |g d
          }t          j
        |||z  ||f          }||f||||||d}||fS )Nint32r   r   r    r   bilinearsizemethodr   r   r   r    r   r   r   r    r   )	orig_sizer   channelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r  r  r_   r   r   r   ceilimageresizer   r   )rR   rT   r  r  
patch_arear   orig_height
orig_widthr  
new_height	new_widthr  num_patch_widthnum_patch_heightr  patches	info_dicts                    r#   	unfoldingzTFMobileViTLayer.unfolding,  s   $($4d6G\W[<7AA
Xh''*
hx((+Xh''*
8H%%a(WRW\\+*DEETV]^^
GBGLLk)ABB[PRYZZ	:-J{1J 	bxxz96MV`aaH ${2%5&8 <,,,77*zH,/??`kl
 
 ,w55*Wz8[*&UVV,w55*WzJ'>X&VWW &z2$ &&!0"2
 
	 	!!r%   r+  r,  dictc                (   | j         | j        }}t          ||z            }|d         }|d         }|d         }|d         }	|d         }
t          j        ||||df          }t          j        |d          }t          j        |||z  |	z  |
||f          }t          j        |d	          }t          j        ||||	|z  |
|z  f          }t          j        |d
          }|d         r(t          j                            ||d         d          }|S )Nr   r  r  r   r  r   r  r   r   r   r    r   r   r  r  r  r  )r  r  r   r_   r   r   r"  r#  )rR   r+  r,  r  r  r$  r   r  r  r*  r)  rT   s               r#   foldingzTFMobileViTLayer.foldingX  s=   $($4d6G\|344
|,
Z(.$%9:#$78 :g
JR'PQQ<|<<<:zH,/??R^`kl
 
 <|<<<:z8-=-Lo`kNkl
 
 <|<<<]# 	axxi6LU_``Hr%   FrV   r/   c                   | j         r|                      ||          }|}|                     ||          }|                     ||          }|                     |          \  }}|                     ||          }|                     |          }|                     ||          }|                     ||          }|                     t          j
        ||gd          |          }|S )NrY   r   r   )r  r  r	  r-  r
  r  r1  r  r  r_   concat)rR   rT   rV   r{   r+  r,  s         r#   r[   zTFMobileViTLayer.callt  s    " 	L..x(.KKH ==H=====H=== "^^H55 ""7X">>..)) <<33''8'DD;;ry(H)=BGGGRZ;[[r%   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Xt          j        | j	        j                  5  | j	                            d d | j
        g           d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )	NTr  r	  r
  r  r  r  r  )r]   r^   r_   r`   r  r>   ra   r	  r
  r  r   r  r  r  rc   s     r#   ra   zTFMobileViTLayer.build  sQ   : 	F
4T**6t}122 * *##D)))* * * * * * * * * * * * * * *4T**6t}122 * *##D)))* * * * * * * * * * * * * * *4--9t/455 - - &&t,,,- - - - - - - - - - - - - - -4d++7t~233 E E$$dD$2B%CDDDE E E E E E E E E E E E E E E4*D11=t3899 1 1$**40001 1 1 1 1 1 1 1 1 1 1 1 1 1 144((4t{/00 ( (!!$'''( ( ( ( ( ( ( ( ( ( ( ( ( ( (4-t44@t6;<< 4 4'--d3334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 A@s~    A''A+.A+!CCCD))D-0D-##FFFG33G7:G7-IIIJ66J:=J:r}   )r(   r   r)   r   r*   r   r,   r   r   r   r   r   r0   r   r   r4   )rT   rU   r   r  )r+  rU   r,  r.  r   rU   re   rf   rg   )
rH   ri   rj   r   rD   r-  r1  r[   ra   rk   rl   s   @r#   r  r    s          ?' ?' ?' ?' ?' ?' ?'B*" *" *" *"X   8    24 4 4 4 4 4 4 4r%   r  c                  8     e Zd Zd fdZ	 	 	 dddZddZ xZS )TFMobileViTEncoderr(   r   r   r4   c           
         t                      j        di | || _        g | _        dx}}|j        dk    rd}d}n|j        dk    rd}d}t          ||j        d         |j        d         ddd          }| j                            |           t          ||j        d         |j        d	         d	d
d          }| j                            |           t          ||j        d	         |j        d
         d	|j	        d         d	d          }| j                            |           |r|d	z  }t          ||j        d
         |j        d         d	|j	        d         d|d          }	| j                            |	           |r|d	z  }t          ||j        d         |j        d         d	|j	        d	         d
|d          }
| j                            |
           d S )NFr   T   r   r   zlayer.0)r)   r*   r,   r   r>   r    r   zlayer.1zlayer.2)r)   r*   r,   r   r   r>      zlayer.3)r)   r*   r,   r   r   r0   r>      zlayer.4rB   )
rC   rD   r(   rI   output_strider   neck_hidden_sizesr   r  hidden_sizes)rR   r(   rS   dilate_layer_4dilate_layer_5r0   layer_1layer_2layer_3layer_4layer_5rG   s              r#   rD   zTFMobileViTEncoder.__init__  sA   ""6""" +0/1$$!N!NN!R''!N+031!4
 
 
 	7###+031!4
 
 
 	7###"031!4+A.
 
 
 	7### 	MH"031!4+A.	
 	
 	
 	7### 	MH"031!4+A.	
 	
 	
 	7#####r%   FTr   rU   output_hidden_statesr/   return_dictrV   tuple | TFBaseModelOutputc                    |rdnd }t          | j                  D ]\  }} |||          }|r||fz   }|st          d ||fD                       S t          ||          S )NrB   rY   c              3     K   | ]}||V  	d S rg   rB   ).0vs     r#   	<genexpr>z*TFMobileViTEncoder.call.<locals>.<genexpr>  s"      XXq!-----XXr%   )last_hidden_stater   )	enumeraterI   tupler
   )rR   r   rE  rF  rV   all_hidden_statesr   r   s           r#   r[   zTFMobileViTEncoder.call  s     #7@BBD(55 	I 	IOA|(LJJJM# I$58H$H! 	YXX]4E$FXXXXXX =Pabbbbr%   Nc                    | j         rd S d| _         t          | dd           P| j        D ]J}t          j        |j                  5  |                    d            d d d            n# 1 swxY w Y   Id S d S r   r   r   s      r#   ra   zTFMobileViTEncoder.build  r   r   r(   r   r   r4   )FTF)
r   rU   rE  r/   rF  r/   rV   r/   r   rG  rg   rh   rl   s   @r#   r6  r6    s        L$ L$ L$ L$ L$ L$b &+ c c c c c(- - - - - - - -r%   r6  c                  V     e Zd ZeZdd fdZd Ze	 	 	 	 ddd            ZddZ	 xZ
S )TFMobileViTMainLayerTr(   r   expand_outputr/   c                    t                      j        di | || _        || _        t	          ||j        |j        d         ddd          | _        t          |d          | _	        | j        r/t	          ||j        d         |j        d	         d
d          | _
        t          j                            dd          | _        d S )Nr   r   r    	conv_stem)r)   r*   r+   r,   r>   encoderr   r:     r   conv_1x1_exprr   channels_firstpooler)data_formatr>   rB   )rC   rD   r(   rU  r'   num_channelsr<  rW  r6  rX  rZ  r   rI   GlobalAveragePooling2Dr\  )rR   r(   rU  rS   rG   s       r#   rD   zTFMobileViTMainLayer.__init__  s    ""6"""*-+1!4
 
 
 *&yAAA 	 4"4Q7#5a8#! ! !D l99FV]e9ffr%   c                    t           )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        r   )rR   heads_to_prunes     r#   _prune_headsz!TFMobileViTMainLayer._prune_heads4  s
    
 "!r%   NFpixel_valuestf.Tensor | NonerE  bool | NonerF  rV   r   /tuple[tf.Tensor] | TFBaseModelOutputWithPoolingc                   ||n| j         j        }||n| j         j        }t          j        |d          }|                     ||          }|                     ||||          }| j        rI|                     |d                   }t          j        |g d          }| 	                    |          }n"|d         }t          j        |g d          }d }|sN|||fn|f}	| j        s1|dd          }
t          d |
d         D                       }
|
f}
|	|
z   S |	|dd          z   S |rt          d	 |d         D                       }t          |||r|n|j        
          S )Nr0  r   rY   rE  rF  rV   r   r  r   c              3  B   K   | ]}t          j        |d           V  dS r  r   Nr_   r   rJ  hs     r#   rL  z,TFMobileViTMainLayer.call.<locals>.<genexpr>g  sB       2 2;<BL6662 2 2 2 2 2r%   c              3  B   K   | ]}t          j        |d           V  dS rj  rk  rl  s     r#   rL  z,TFMobileViTMainLayer.call.<locals>.<genexpr>q  s1      !a!a",q|"D"D"D!a!a!a!a!a!ar%   )rM  pooler_outputr   )r(   rE  use_return_dictr_   r   rW  rX  rU  rZ  r\  rO  r   r   )rR   rc  rE  rF  rV   embedding_outputencoder_outputsrM  pooled_outputr   remaining_encoder_outputsr   s               r#   r[   zTFMobileViTMainLayer.call;  s    %9$D  $+Jj 	 &1%<kk$+B]
 |L|DDD>>,>JJ,,3GU`ks ' 
 
  	! $ 1 1/!2D E E !#->\\\ R R R !KK(9::MM / 2 "->\\\ R R R M 	4;H;T'77[lZnF % 4,;ABB,?),1 2 2@YZ[@\2 2 2 - -) .G,H) 999 333   	b!!a!ao^_N`!a!a!aaaM+/'+?b--_Eb
 
 
 	
r%   c                8   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           Rt          j        | j        j                  5  | j                            g d           d d d            n# 1 swxY w Y   t          | dd           St          j        | j	        j                  5  | j	                            d            d d d            d S # 1 swxY w Y   d S d S )NTrW  rX  r\  NNNNrZ  )
r]   r^   r_   r`   rW  r>   ra   rX  r\  rZ  rc   s     r#   ra   zTFMobileViTMainLayer.buildy  s   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )44((4t{/00 < <!!":":":;;;< < < < < < < < < < < < < < <4..:t0566 . .!''---. . . . . . . . . . . . . . . . . . ;:sH    A''A+.A+!CCCD++D/2D/%FFFTr(   r   rU  r/   NNNF
rc  rd  rE  re  rF  re  rV   r/   r   rf  rg   )rH   ri   rj   r   config_classrD   rb  r   r[   ra   rk   rl   s   @r#   rT  rT    s        "Lg g g g g g g6" " "  *.,0#';
 ;
 ;
 ;
 ];
z. . . . . . . .r%   rT  c                      e Zd ZdZeZdZdZdS )TFMobileViTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    	mobilevitrc  N)rH   ri   rj   r   r   r{  base_model_prefixmain_input_namerB   r%   r#   r}  r}    s*         
 #L#$OOOr%   r}  a	  
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`MobileViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]`, `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileViTImageProcessor.__call__`] for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
zWThe bare MobileViT model outputting raw hidden-states without any specific head on top.c            	           e Zd Zdd fdZe ee           eee	e
de          	 	 	 	 ddd                                    ZddZ xZS )TFMobileViTModelTr(   r   rU  r/   c                     t                      j        |g|R i | || _        || _        t	          ||d          | _        d S )Nr~  rU  r>   )rC   rD   r(   rU  rT  r~  )rR   r(   rU  inputsrS   rG   s        r#   rD   zTFMobileViTModel.__init__  sT    3&333F333*-fMXcdddr%   vision)
checkpointoutput_typer{  modalityexpected_outputNFrc  rd  rE  re  rF  rV   r   rf  c                8    |                      ||||          }|S rX   )r~  )rR   rc  rE  rF  rV   r   s         r#   r[   zTFMobileViTModel.call  s$      .BKZbccr%   c                    | j         rd S d| _         t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr~  )r]   r^   r_   r`   r~  r>   ra   rc   s     r#   ra   zTFMobileViTModel.build  s    : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + + + + + 87s    A((A,/A,rw  rx  ry  rz  rg   )rH   ri   rj   rD   r   r   MOBILEVIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr[   ra   rk   rl   s   @r#   r  r    s        
e e e e e e e **+EFF&0$.   *.,0#'     GF ]+ + + + + + + +r%   r  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                       e Zd Zd fdZe ee           eee	e
e          	 	 	 	 	 ddd                                    ZddZ xZS )!TFMobileViTForImageClassificationr(   r   r   r4   c                j    t                      j        |g|R i | |j        | _        t          |d          | _        t
          j                            |j                  | _	        |j        dk    r&t
          j        
                    |j        d          nt          j        | _        || _        d S )Nr~  r   r   
classifier)rC   rD   
num_labelsrT  r~  r   rI   r   classifier_dropout_probr   r   r_   identityr  r(   )rR   r(   r  rS   rG   s       r#   rD   z*TFMobileViTForImageClassification.__init__  s    3&333F333 +-f;GGG |++F,JKKHNHY\]H]H]ELv0|DDDcecn 	 r%   )r  r  r{  r  NFrc  rd  rE  re  labelsrF  rV   .tuple | TFImageClassifierOutputWithNoAttentionc                b   ||n| j         j        }|                     ||||          }|r|j        n|d         }|                     |                     ||                    }|dn|                     ||          }	|s|f|dd         z   }
|	|	f|
z   n|
S t          |	||j                  S )a  
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nrh  r   rY   )r  logitsr    lossr  r   )	r(   rp  r~  ro  r  r   hf_compute_lossr   r   )rR   rc  rE  r  rF  rV   outputsrs  r  r  r   s              r#   r[   z&TFMobileViTForImageClassification.call  s    , &1%<kk$+B]../CQ\go ! 
 
 2=L--'!*mh!O!OPP~tt4+?+?vV\+?+]+] 	FY,F)-)9TGf$$vE54^e^sttttr%   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           {t          | j        d          rht          j        | j        j                  5  | j                            d d | j	        j
        d         g           d d d            d S # 1 swxY w Y   d S d S d S )NTr~  r  r>   r   )r]   r^   r_   r`   r~  r>   ra   rb   r  r(   r<  rc   s     r#   ra   z'TFMobileViTForImageClassification.build3  s   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4t,,8t// []4?#788 [ [O))4t{7TUW7X*YZZZ[ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ 98[ [s$    A''A+.A+6.C11C58C5rR  NNNNF)rc  rd  rE  re  r  rd  rF  re  rV   re  r   r  rg   )rH   ri   rj   rD   r   r   r  r   _IMAGE_CLASS_CHECKPOINTr   r  _IMAGE_CLASS_EXPECTED_OUTPUTr[   ra   rk   rl   s   @r#   r  r    s              **+EFF*:$4	   *.,0#'#' %u u u u  GF ]u>
[ 
[ 
[ 
[ 
[ 
[ 
[ 
[r%   r  c                  2     e Zd Zd fdZdddZddZ xZS )TFMobileViTASPPPoolingr(   r   r)   r   r*   r   r4   c           
          t                      j        di | t          j                            dd          | _        t          |||ddddd          | _        d S )	NTglobal_pool)keepdimsr>   r   relur	  )r)   r*   r+   r,   r1   r2   r>   rB   )rC   rD   r   rI   r_  r  r'   r	  )rR   r(   r)   r*   rS   rG   s        r#   rD   zTFMobileViTASPPPooling.__init__A  sp    ""6""" <>>S`>aa,#%"!	
 	
 	
r%   FrT   rU   rV   r/   c                    t          |          dd         }|                     |          }|                     ||          }t          j                            ||d          }|S )Nr   r   rY   r  r  )r   r  r	  r_   r"  r#  )rR   rT   rV   spatial_sizes       r#   r[   zTFMobileViTASPPPooling.callQ  s^    !(++AbD1##H--==H===8??8,z?RRr%   Nc                   | j         rd S d| _         t          | dd           Rt          j        | j        j                  5  | j                            g d           d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr  rv  r	  )r]   r^   r_   r`   r  r>   ra   r	  rc   s     r#   ra   zTFMobileViTASPPPooling.buildX  sm   : 	F
4--9t/455 A A &&'?'?'?@@@A A A A A A A A A A A A A A A4T**6t}122 * *##D)))* * * * * * * * * * * * * * * * * * 76s$    A))A-0A-#CCC)r(   r   r)   r   r*   r   r   r4   re   rf   rg   rh   rl   s   @r#   r  r  @  sj        
 
 
 
 
 
     	* 	* 	* 	* 	* 	* 	* 	*r%   r  c                  6     e Zd ZdZd fdZdddZddZ xZS )TFMobileViTASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r(   r   r   r4   c           	         t                      j        di | j        d         j        t	          j                  dk    rt          d          g | _        t          ddd          }| j        	                    |           | j        
                    fdt          j                  D                        t          d	t	          j                  dz    
          }| j        	                    |           t          dz  ddd          | _        t          j                            j                  | _        d S )Nr   z"Expected 3 values for atrous_ratesr   r  zconvs.0ru   c                N    g | ]!\  }}t          d |dd|dz              "S )r   r  convs.r   )r)   r*   r+   r0   r2   r>   )r'   )rJ  r   rater(   r)   r*   s      r#   
<listcomp>z,TFMobileViTASPP.__init__.<locals>.<listcomp>  s_        At % +!- !!#))!a%))    r%   r  r   r:  projectrB   )rC   rD   r<  aspp_out_channelslenatrous_ratesrK   convsr'   r   extendrN  r  r  r   rI   r   aspp_dropout_probr   )rR   r(   rS   in_projection
pool_layerr)   r*   rG   s    `   @@r#   rD   zTFMobileViTASPP.__init__i  s   ""6""".r2/v"##q((ABBB
,#%!
 
 
 	
-(((
       ))<==  	
 	
 	
 ,K4[SAT=U=UXY=Y4[4[
 
 

 	
*%%%+L(%!
 
 
 |++F,DEEr%   FrT   rU   rV   r/   c                   t          j        |g d          }g }| j        D ]"}|                     |||                     #t          j        |d          }|                     ||          }|                     ||          }|S )Nr0  r   rY   r   r   )r_   r   r  r   r3  r  r   )rR   rT   rV   pyramidconvpooled_featuress         r#   r[   zTFMobileViTASPP.call  s     <|||<<<J 	> 	>DNN448<<<====)G"---,,w,BB,,,JJr%   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           P| j        D ]J}t          j        |j                  5  |                    d            d d d            n# 1 swxY w Y   Id S d S )NTr  r  )r]   r^   r_   r`   r  r>   ra   r  )rR   rd   r  s      r#   ra   zTFMobileViTASPP.build  sY   : 	F
4D))5t|011 ) )""4((() ) ) ) ) ) ) ) ) ) ) ) ) ) )4$''3
 % %]49-- % %JJt$$$% % % % % % % % % % % % % % % 43% %s$    A''A+.A+%CC	C	rR  re   rf   rg   r~   rl   s   @r#   r  r  d  s{         2F 2F 2F 2F 2F 2Fh    
% 
% 
% 
% 
% 
% 
% 
%r%   r  c                  6     e Zd ZdZd fdZdddZddZ xZS )TFMobileViTDeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r(   r   r   r4   c           
         t                      j        di | t          |d          | _        t          j                            |j                  | _        t          ||j
        |j        ddddd          | _        d S )	Nasppr   r   FTr  )r)   r*   r+   r1   r2   r.   r>   rB   )rC   rD   r  r  r   rI   r   r  r   r'   r  r  r  rR   r(   rS   rG   s      r#   rD   zTFMobileViTDeepLabV3.__init__  s    ""6"""#F888	|++F,JKK.0*# 	
 	
 	
r%   Fr   rU   rV   r/   c                    |                      |d         |          }|                     ||          }|                     ||          }|S )Nr   rY   )r  r   r  )rR   r   rV   rT   s       r#   r[   zTFMobileViTDeepLabV3.call  sK    99]2.9BB<<8<<<??8h???r%   Nc                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr  r  )r]   r^   r_   r`   r  r>   ra   r  rc   s     r#   ra   zTFMobileViTDeepLabV3.build  sP   : 	F
4&&2ty~.. & &	%%%& & & & & & & & & & & & & & &4t,,8t344 , ,%%d+++, , , , , , , , , , , , , , , , , , 98r   rR  re   r   rg   r~   rl   s   @r#   r  r    st         
 
 
 
 
 
"    	, 	, 	, 	, 	, 	, 	, 	,r%   r  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                       e Zd Zd fdZd Ze ee           ee	e
          	 	 	 	 	 ddd                                    ZddZ xZS )"TFMobileViTForSemanticSegmentationr(   r   r   r4   c                     t                      j        |fi | |j        | _        t          |dd          | _        t          |d          | _        d S )NFr~  r  segmentation_headr   )rC   rD   r  rT  r~  r  r  r  s      r#   rD   z+TFMobileViTForSemanticSegmentation.__init__  s_    **6*** +-fEP[\\\!5fCV!W!W!Wr%   c                     t          |          dd          }t          j                            ||d          }t          j                            dd           fd} |||          S )Nr   r  r  Tnone)from_logits	reductionc                     | |          }t          j        | j        j        k    |j                  }||z  }t          j        |          t          j        |          z  }t          j        |d          S )Nr   r}   )r_   r   r(   semantic_loss_ignore_indexr   
reduce_sumr   )realpredunmasked_lossmaskmasked_lossreduced_masked_lossloss_fctrR   s         r#   r  zGTFMobileViTForSemanticSegmentation.hf_compute_loss.<locals>.masked_loss  st    $HT400M744;#IIQ^QdeeeD'$.K #%-"<"<r}T?R?R"R:14888r%   )r   r_   r"  r#  r   lossesSparseCategoricalCrossentropy)rR   r  r  label_interp_shapeupsampled_logitsr  r  s   `     @r#   r  z2TFMobileViTForSemanticSegmentation.hf_compute_loss  s     (//38??68JS]?^^<==$Z`=aa	9 	9 	9 	9 	9 	9 {6#3444r%   )r  r{  NFrc  rd  r  rE  re  rF  rV   r/   0tuple | TFSemanticSegmenterOutputWithNoAttentionc                   ||n| j         j        }||n| j         j        }|| j         j        dk    st	          d          |                     |d||          }|r|j        n|d         }|                     ||          }d}	||                     ||          }	t          j
        |g d	          }|s)|r|f|dd         z   }
n|f|d
d         z   }
|	|	f|
z   n|
S t          |	||r|j        nd          S )aK  
        labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFMobileViTForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = TFMobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="tf")

        >>> outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTrh  rY   )r  r  r  r   r    r  )r(   rE  rp  r  rK   r~  r   r  r  r_   r   r   )rR   rc  r  rE  rF  rV   r  encoder_hidden_statesr  r  r   s              r#   r[   z'TFMobileViTForSemanticSegmentation.call  sj   N %9$D  $+Jj 	 &1%<kk$+B]dk&<q&@&@NOOO..!%#	 ! 
 
 :E T 5 5'RS*''(='QQ''vf'EED f<<<888 	F# 1 WQRR[0 WQRR[0)-)9TGf$$vE73GQ'//T
 
 
 	
r%   c                   | j         rd S d| _         t          | dd           Pt          j        | j        j                  5  | j                            d            d d d            n# 1 swxY w Y   t          | dd           St          j        | j        j                  5  | j                            d            d d d            d S # 1 swxY w Y   d S d S )NTr~  r  )r]   r^   r_   r`   r~  r>   ra   r  rc   s     r#   ra   z(TFMobileViTForSemanticSegmentation.buildO  sW   : 	F
4d++7t~233 + +$$T***+ + + + + + + + + + + + + + +4,d33?t5:;; 3 3&,,T2223 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 @?r   rR  r  )rc  rd  r  rd  rE  re  rF  re  rV   r/   r   r  rg   )rH   ri   rj   rD   r  r   r   r  r	   r   r  r[   ra   rk   rl   s   @r#   r  r    s        X X X X X X5 5 5( **+EFF+Sbqrrr *.#',0#'I
 I
 I
 I
 sr GF ]I
V	3 	3 	3 	3 	3 	3 	3 	3r%   r  )r  r  r  r}  )r   N)r   r   r   r   r   r   r   r   )@r   
__future__r   
tensorflowr_   activations_tfr   
file_utilsr   r   r   r	   modeling_tf_outputsr
   r   r   r   modeling_tf_utilsr   r   r   r   r   tf_utilsr   r   utilsr   configuration_mobilevitr   
get_loggerrH   rE   r  r  r  r  r  r$   rI   Layerr'   rn   r   r   r   r   r   r   r   r   r  r6  rT  r}  MOBILEVIT_START_DOCSTRINGr  r  r  r  r  r  r  __all__rB   r%   r#   <module>r     sK  " & % " " " " " "     / / / / / /                                    3 2 2 2 2 2 2 2       4 4 4 4 4 4 
	H	%	% $ . '  2 1     JT JT JT JT JT5<- JT JT JTZ=, =, =, =, =,%,"4 =, =, =,@$- $- $- $- $- 2 $- $- $-N@A @A @A @A @Au|1 @A @A @AFA A A A AEL. A A A*. . . . .5<- . . .4A A A A Ael0 A A A0G G G G G* G G G,%K %K %K %K %K%,"4 %K %K %KP- - - - -U\/ - - -:4 4 4 4 4u|) 4 4 4Dj- j- j- j- j-+ j- j- j-Z r. r. r. r. r.5<- r. r. r.j% % % % %!2 % % %' R   ] !+ !+ !+ !+ !+1 !+ !+	 !+H   ?[ ?[ ?[ ?[ ?[(BD` ?[ ?[ ?[D!* !* !* !* !*U\/ !* !* !*HP% P% P% P% P%el( P% P% P%f%, %, %, %, %,5<- %, %, %,P  	 s3 s3 s3 s3 s3)C s3 s3 s3l  r%   