
     `i              	          d Z ddlmZmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZ  ej        e          Zd=dededee         defdZ ed           ed          fdedededefdZ G d dej                  Z  G d dej                  Z! G d dej                  Z" G d d ej                  Z# G d! d"ej                  Z$ G d# d$ej                  Z% G d% d&ej                  Z& G d' d(e          Z' G d) d*ej                  Z(e G d+ d,e                      Z)e G d- d.e)                      Z* ed/0           G d1 d2e)                      Z+ G d3 d4ej                  Z, G d5 d6ej                  Z- G d7 d8ej                  Z. ed90           G d: d;e)                      Z/g d<Z0dS )>zPyTorch MobileViTV2 model.    )OptionalUnionN)nn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)auto_docstringlogging   )MobileViTV2Config   valuedivisor	min_valuereturnc                     ||}t          |t          | |dz  z             |z  |z            }|d| z  k     r||z  }t          |          S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    N   g?)maxint)r   r   r   	new_values       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.pymake_divisibler   *   s^     	Is57Q;#6777BWLMMI3;W	y>>    z-infinfmin_valmax_valc                 >    t          |t          ||                     S N)r   minr   r!   r"   s      r   clipr'   9   s    wGU++,,,r   c                        e Zd Z	 	 	 	 	 	 ddededededed	ed
edededeeef         ddf fdZde	j
        de	j
        fdZ xZS )MobileViTV2ConvLayerr   FTconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 n   t                                                       t          |dz
  dz            |z  }||z  dk    rt          d| d| d          ||z  dk    rt          d| d| d          t	          j        ||||||||d		  	        | _        |	rt	          j        |d
ddd          | _        nd | _        |
rjt          |
t                    rt          |
         | _        d S t          |j        t                    rt          |j                 | _        d S |j        | _        d S d | _        d S )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r+   r,   r-   r.   paddingr1   r/   r0   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r6   	__class__s               r   r>   zMobileViTV2ConvLayer.__init__?   sz    	{Q!+,,x71$$dddTZdddeee& A%%fffV\fffggg9#%# 

 

 

  		&!#)$(" " "D "&D 	#.#.. 4"("8F-s33 4"():";"("3"DOOOr   featuresc                     |                      |          }| j        |                     |          }| j        |                     |          }|S r$   )rA   rC   rF   )rH   rJ   s     r   forwardzMobileViTV2ConvLayer.forwardu   sO    ##H--)))(33H?&x00Hr   )r   r   Fr   TT)__name__
__module____qualname__r   r   boolr   rE   r>   torchTensorrL   __classcell__rI   s   @r   r)   r)   >   s         "&+/4# 4#!4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4# 4# 4# 4# 4# 4#l         r   r)   c                   d     e Zd ZdZ	 ddedededededd	f fd
Zdej        dej        fdZ	 xZ
S )MobileViTV2InvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r   r*   r+   r,   r.   r1   r   Nc           	         t                                                       t          t          t	          ||j        z                      d          }|dvrt          d| d          |dk    o||k    | _        t          |||d          | _	        t          |||d|||          | _
        t          |||dd	
          | _        d S )Nr   )r   r   zInvalid stride .r   )r+   r,   r-   r   )r+   r,   r-   r.   r/   r1   Fr+   r,   r-   r3   )r=   r>   r   r   roundexpand_ratior?   use_residualr)   
expand_1x1conv_3x3
reduce_1x1)rH   r*   r+   r,   r.   r1   expanded_channelsrI   s          r   r>   z$MobileViTV2InvertedResidual.__init__   s     	*3u[6CV5V/W/W+X+XZ[\\8v888999#q[K{l/J.:KYZ
 
 
 -)*$
 
 
 /)% 
 
 
r   rJ   c                     |}|                      |          }|                     |          }|                     |          }| j        r||z   n|S r$   )r]   r^   r_   r\   )rH   rJ   residuals      r   rL   z#MobileViTV2InvertedResidual.forward   sR    ??8,,==**??8,,&*&7Ex(""XEr   )r   rM   rN   rO   __doc__r   r   r>   rQ   rR   rL   rS   rT   s   @r   rV   rV      s         
 lm
 
'
69
IL
VY
eh
	
 
 
 
 
 
BF F F F F F F F F Fr   rV   c                   `     e Zd Z	 ddedededededdf fd	Zd
ej        dej        fdZ xZ	S )MobileViTV2MobileNetLayerr   r*   r+   r,   r.   
num_stagesr   Nc                 
   t                                                       t          j                    | _        t          |          D ]9}t          ||||dk    r|nd          }| j                            |           |}:d S )Nr   r   )r+   r,   r.   )r=   r>   r   
ModuleListlayerrangerV   append)	rH   r*   r+   r,   r.   rg   irj   rI   s	           r   r>   z"MobileViTV2MobileNetLayer.__init__   s     	]__
z"" 	' 	'A/')!"avvQ	  E Je$$$&KK	' 	'r   rJ   c                 0    | j         D ]} ||          }|S r$   rj   )rH   rJ   layer_modules      r   rL   z!MobileViTV2MobileNetLayer.forward   s)     J 	. 	.L#|H--HHr   )r   r   
rM   rN   rO   r   r   r>   rQ   rR   rL   rS   rT   s   @r   rf   rf      s        qr' '''69'IL'VY'kn'	' ' ' ' ' '          r   rf   c                   T     e Zd ZdZdededdf fdZdej        dej        fdZ	 xZ
S )	MobileViTV2LinearSelfAttentionay  
    This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper:
    https://huggingface.co/papers/2206.02680

    Args:
        config (`MobileVitv2Config`):
             Model configuration object
        embed_dim (`int`):
            `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)`
    r*   	embed_dimr   Nc           	         t                                                       t          ||dd|z  z   dddd          | _        t	          j        |j                  | _        t          |||dddd          | _        || _        d S )Nr   r   TF)r*   r+   r,   r0   r-   r2   r3   p)	r=   r>   r)   qkv_projr   Dropoutattn_dropoutout_projrt   )rH   r*   rt   rI   s      r   r>   z'MobileViTV2LinearSelfAttention.__init__   s    ,!a)m,# 
 
 
 J)<===,!"# 
 
 
 #r   hidden_statesc                    |                      |          }t          j        |d| j        | j        gd          \  }}}t          j        j                            |d          }|                     |          }||z  }t          j        |dd          }t          j        j        	                    |          |
                    |          z  }|                     |          }|S )Nr   )split_size_or_sectionsdimr   Tr   keepdim)rx   rQ   splitrt   r   
functionalsoftmaxrz   sumrelu	expand_asr{   )	rH   r|   qkvquerykeyr   context_scorescontext_vectorouts	            r   rL   z&MobileViTV2LinearSelfAttention.forward   s    mmM**
 "KQX\XfDgmnooosE ,44U4CC**>:: ~->r4HHH h!&&u--0H0H0O0OOmmC  
r   rc   rT   s   @r   rs   rs      s        	 	#0 #S #T # # # # # #2U\ el        r   rs   c                   \     e Zd Z	 ddededededdf
 fdZd	ej        dej        fd
Z	 xZ
S )MobileViTV2FFN        r*   rt   ffn_latent_dimffn_dropoutr   Nc           
         t                                                       t          |||ddddd          | _        t	          j        |          | _        t          |||ddddd          | _        t	          j        |          | _        d S )Nr   TF)r*   r+   r,   r-   r.   r0   r2   r3   )	r=   r>   r)   conv1r   ry   dropout1conv2dropout2)rH   r*   rt   r   r   rI   s        r   r>   zMobileViTV2FFN.__init__  s     	)!'#	
 	
 	

 
;//)&"# 	
 	
 	

 
;//r   r|   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S r$   )r   r   r   r   )rH   r|   s     r   rL   zMobileViTV2FFN.forward'  sL    

=11m44

=11m44r   r   rM   rN   rO   r   r   floatr>   rQ   rR   rL   rS   rT   s   @r   r   r     s         !0 0!0 0 	0
 0 
0 0 0 0 0 0@U\ el        r   r   c                   \     e Zd Z	 ddededededdf
 fdZd	ej        dej        fd
Z	 xZ
S )MobileViTV2TransformerLayerr   r*   rt   r   dropoutr   Nc                 b   t                                                       t          j        d||j                  | _        t          ||          | _        t          j        |          | _	        t          j        d||j                  | _
        t          ||||j                  | _        d S )Nr   
num_groupsnum_channelsr9   rv   )r=   r>   r   	GroupNormlayer_norm_epslayernorm_beforers   	attentionry   r   layernorm_afterr   r   ffn)rH   r*   rt   r   r   rI   s        r   r>   z$MobileViTV2TransformerLayer.__init__0  s     	 "	W]Wl m m m7	JJ
W---!|qyV\Vklll!&)^VEWXXr   r|   c                     |                      |          }|                     |          }||z   }|                     |          }|                     |          }||z   }|S r$   )r   r   r   r   )rH   r|   layernorm_1_outattention_outputlayer_outputs        r   rL   z#MobileViTV2TransformerLayer.forward>  se    //>>>>/::(=8++M::xx--#m3r   r   r   rT   s   @r   r   r   /  s         Y Y!Y Y 	Y
 Y 
Y Y Y Y Y Y	U\ 	el 	 	 	 	 	 	 	 	r   r   c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTV2Transformerr*   n_layersd_modelr   Nc                 8   t                                                       |j        }||z  g|z  }d |D             }t          j                    | _        t          |          D ]4}t          ||||                   }| j                            |           5d S )Nc                 8    g | ]}t          |d z  d z            S )   )r   ).0ds     r   
<listcomp>z3MobileViTV2Transformer.__init__.<locals>.<listcomp>S  s(    :::ACbB'':::r   )rt   r   )	r=   r>   ffn_multiplierr   ri   rj   rk   r   rl   )	rH   r*   r   r   r   ffn_dims	block_idxtransformer_layerrI   s	           r   r>   zMobileViTV2Transformer.__init__K  s    ."W,-8 ;::::]__
x 	1 	1I ;'(9:M! ! ! J/0000		1 	1r   r|   c                 0    | j         D ]} ||          }|S r$   ro   )rH   r|   rp   s      r   rL   zMobileViTV2Transformer.forward\  s*     J 	8 	8L(L77MMr   rq   rT   s   @r   r   r   J  s        10 1C 1# 1RV 1 1 1 1 1 1"U\ el        r   r   c                        e Zd ZdZ	 	 	 ddededededed	ed
eddf fdZdej        de	ej        e	eef         f         fdZ
dej        de	eef         dej        fdZdej        dej        fdZ xZS )MobileViTV2LayerzE
    MobileViTV2 layer: https://huggingface.co/papers/2206.02680
    r   r   r*   r+   r,   attn_unit_dimn_attn_blocksr1   r.   r   Nc                    t                                                       |j        | _        |j        | _        |}|dk    r/t          ||||dk    r|nd|dk    r|dz  nd          | _        |}nd | _        t          ||||j        |          | _	        t          |||ddd          | _
        t          |||          | _        t          j        d||j                  | _        t          |||dd	d          | _        d S )
Nr   r   )r+   r,   r.   r1   )r+   r,   r-   r/   F)r+   r,   r-   r2   r3   )r   r   r   T)r=   r>   
patch_sizepatch_widthpatch_heightrV   downsampling_layerr)   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projection)
rH   r*   r+   r,   r   r   r1   r.   cnn_out_dimrI   s
            r   r>   zMobileViTV2Layer.__init__g  sO    	!,"-#Q;;&A')!)QvvA*2Q,,QA' ' 'D# 'KK&*D# -#$/
 
 
 -#$# 
 
 
 2&-Zghhh TZTijjj  4#$"  
  
  
r   feature_mapc                     |j         \  }}}}t          j                            || j        | j        f| j        | j        f          }|                    ||| j        | j        z  d          }|||ffS )N)r-   r.   r   )shaper   r   unfoldr   r   reshape)rH   r   
batch_sizer+   
img_height	img_widthpatchess          r   	unfoldingzMobileViTV2Layer.unfolding  s    9D9J6
KY-&&*D,<=%t'78 ' 
 

 //*k4;LtO_;_acddY///r   r   output_sizec                     |j         \  }}}}|                    |||z  |          }t          j                            ||| j        | j        f| j        | j        f          }|S )N)r   r-   r.   )r   r   r   r   foldr   r   )rH   r   r   r   in_dimr   	n_patchesr   s           r   foldingzMobileViTV2Layer.folding  sr    4;M1
FJ	//*fz.A9MMm((#*D,<=%t'78	 ) 
 
 r   rJ   c                 l   | j         r|                      |          }|                     |          }|                     |          }|                     |          \  }}|                     |          }|                     |          }|                     ||          }|                     |          }|S r$   )r   r   r   r   r   r   r   r   )rH   rJ   r   r   s       r   rL   zMobileViTV2Layer.forward  s    " 	9..x88H ==**==**  $~~h77 ""7++..)) <<55''11r   )r   r   r   )rM   rN   rO   rd   r   r   r>   rQ   rR   tupler   r   rL   rS   rT   s   @r   r   r   b  s<         ;
 ;
!;
 ;
 	;

 ;
 ;
 ;
 ;
 
;
 ;
 ;
 ;
 ;
 ;
z	0U\ 	0eEL%PSUXPX/<Y6Z 	0 	0 	0 	0u| %S/ el             r   r   c                   `     e Zd Zdeddf fdZ	 	 ddej        ded	edee	e
f         fd
Z xZS )MobileViTV2Encoderr*   r   Nc           	      "   t                                                       || _        t          j                    | _        d| _        dx}}|j        dk    rd}d}n|j        dk    rd}d}t          t          d|j
        z  dd          dd	          }t          d|j
        z  d
          }t          d|j
        z  d
          }t          d|j
        z  d
          }t          d|j
        z  d
          }	t          d|j
        z  d
          }
t          |||dd          }| j                            |           t          |||dd          }| j                            |           t          |||t          |j        d         |j
        z  d
          |j        d                   }| j                            |           |r|dz  }t          |||	t          |j        d         |j
        z  d
          |j        d         |          }| j                            |           |r|dz  }t          ||	|
t          |j        d         |j
        z  d
          |j        d         |          }| j                            |           d S )NFr   Tr   r       @   r&   r   r   r         i     )r+   r,   r.   rg   r   r   )r+   r,   r   r   )r+   r,   r   r   r1   )r=   r>   r*   r   ri   rj   gradient_checkpointingoutput_strider   r'   width_multiplierrf   rl   r   base_attn_unit_dimsr   )rH   r*   dilate_layer_4dilate_layer_5r1   layer_0_dimlayer_1_dimlayer_2_dimlayer_3_dimlayer_4_dimlayer_5_dimlayer_1layer_2layer_3layer_4layer_5rI   s                   r   r>   zMobileViTV2Encoder.__init__  s   ]__
&+# +0/1$$!N!NN!R''!N$rF33RLLLVWce
 
 
 %R&*A%A2NNN$S6+B%BANNN$S6+B%BANNN$S6+B%BANNN$S6+B%BANNN+#$
 
 
 	
'"""+#$
 
 
 	
'""""#$()CA)FI`)`jklll .q1
 
 
 	
'""" 	MH"#$()CA)FI`)`jklll .q1
 
 
 	
'""" 	MH"#$()CA)FI`)`jklll .q1
 
 
 	
'"""""r   FTr|   output_hidden_statesreturn_dictc                     |rdnd }t          | j                  D ]\  }} ||          }|r||fz   }|st          d ||fD                       S t          ||          S )N c              3      K   | ]}||V  	d S r$   r   )r   vs     r   	<genexpr>z-MobileViTV2Encoder.forward.<locals>.<genexpr>5  s"      XXq!-----XXr   )last_hidden_stater|   )	enumeraterj   r   r
   )rH   r|   r   r   all_hidden_statesrm   rp   s          r   rL   zMobileViTV2Encoder.forward&  s     #7@BBD(44 	I 	IOA|(L77M# I$58H$H! 	YXX]4E$FXXXXXX-]noooor   )FT)rM   rN   rO   r   r>   rQ   rR   rP   r   r   r
   rL   rS   rT   s   @r   r   r     s        O#0 O#T O# O# O# O# O# O#h &+ 	p p|p #p 	p
 
u44	5p p p p p p p pr   r   c                   F    e Zd ZU eed<   dZdZdZdgZde	j
        ddfd	ZdS )
MobileViTV2PreTrainedModelr*   mobilevitv2pixel_valuesTr   moduler   Nc                    t          |t          j        t          j        t          j        f          rT|j        j                            d| j        j	                   |j
         |j
        j                                         dS dS t          |t          j                  r?|j
        j                                         |j        j                            d           dS dS )zInitialize the weightsr   )meanstdNg      ?)rD   r   Linearr@   rB   weightdatanormal_r*   initializer_ranger0   zero_r   fill_)rH   r	  s     r   _init_weightsz(MobileViTV2PreTrainedModel._init_weightsB  s    fry")R^DEE 	* M&&CT[5R&SSS{& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r   )rM   rN   rO   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   Moduler  r   r   r   r  r  :  s`         %$O&*#+,
*BI 
*$ 
* 
* 
* 
* 
* 
*r   r  c                        e Zd Zddedef fdZd Ze	 	 	 ddee	j
                 dee         d	ee         d
eeef         fd            Z xZS )MobileViTV2ModelTr*   expand_outputc           	      J   t                                          |           || _        || _        t	          t          d|j        z  dd          dd          }t          ||j        |ddd	d	
          | _	        t          |          | _        |                                  dS )a  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model. If `True`, the model will output pooled features in addition to
            hidden states. If `False`, only the hidden states will be returned.
        r   r   r   r&   r   r   r   r   Tr+   r,   r-   r.   r2   r3   N)r=   r>   r*   r  r   r'   r   r)   r   	conv_stemr   encoder	post_init)rH   r*   r  r   rI   s       r   r>   zMobileViTV2Model.__init__Q  s     	   *$rF33RLLLVWce
 
 
 .+$"
 
 
 *&11 	r   c                     |                                 D ]U\  }}| j        j        |         }t          |t                    r)|j        j        D ]}|j                            |           VdS )zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr!  rj   rD   r   r   r   prune_heads)rH   heads_to_prunelayer_indexheadsmobilevitv2_layerr   s         r   _prune_headszMobileViTV2Model._prune_headsm  s     #1"6"6"8"8 	C 	CK $ 2; ?+-=>> C):)F)L C C%%/;;EBBBB		C 	Cr   Nr  r   r   r   c                    ||n| j         j        }||n| j         j        }|t          d          |                     |          }|                     |||          }| j        r"|d         }t          j        |ddgd          }n
|d         }d }|s|||fn|f}||dd          z   S t          |||j
        	          S )
Nz You have to specify pixel_valuesr   r   r   r   Fr   r   )r  pooler_outputr|   )r*   r   use_return_dictr?   r   r!  r  rQ   r  r   r|   )	rH   r  r   r   embedding_outputencoder_outputsr  pooled_outputoutputs	            r   rL   zMobileViTV2Model.forwardw  s    %9$D  $+Jj 	 &1%<kk$+B]?@@@>>,77,,!5# ' 
 
  	! / 2 "J'8r2hPUVVVMM / 2 M 	0;H;T'77[lZnFOABB///7/')7
 
 
 	
r   )T)NNN)rM   rN   rO   r   rP   r>   r*  r   r   rQ   rR   r   r   r   rL   rS   rT   s   @r   r  r  O  s         0       8C C C  04/3&*	'
 '
u|,'
 'tn'
 d^	'

 
u>>	?'
 '
 '
 ^'
 '
 '
 '
 '
r   r  z
    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                        e Zd Zdeddf fdZe	 	 	 	 d
deej                 dee	         deej                 dee	         de
eef         f
d	            Z xZS )!MobileViTV2ForImageClassificationr*   r   Nc                 `   t                                          |           |j        | _        t          |          | _        t          d|j        z  d          }|j        dk    rt          j        ||j                  nt          j	                    | _
        |                                  d S )Nr   r   r   r   )in_featuresout_features)r=   r>   
num_labelsr  r  r   r   r   r  Identity
classifierr"  )rH   r*   r,   rI   s      r   r>   z*MobileViTV2ForImageClassification.__init__  s        ++F33%cF,C&CQOOO  1$$ I,V=NOOOO 	 	r   r  r   labelsr   c                 @   ||n| j         j        }|                     |||          }|r|j        n|d         }|                     |          }d}||                     ||| j                   }|s|f|dd         z   }	||f|	z   n|	S t          |||j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr,  r   r   )losslogitsr|   )r*   r/  r  r.  r<  loss_functionr   r|   )
rH   r  r   r=  r   outputsr2  r@  r?  r3  s
             r   rL   z)MobileViTV2ForImageClassification.forward  s     &1%<kk$+B]""<FZhs"tt1<L--'!*//%%ffdkBBD 	FY,F)-)9TGf$$vE3!/
 
 
 	
r   NNNN)rM   rN   rO   r   r>   r   r   rQ   rR   rP   r   r   r   rL   rS   rT   s   @r   r6  r6    s        0 T      "  04/3)-&*!
 !
u|,!
 'tn!
 &	!

 d^!
 
u::	;!
 !
 !
 ^!
 !
 !
 !
 !
r   r6  c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTV2ASPPPoolingr*   r+   r,   r   Nc           	          t                                                       t          j        d          | _        t          |||dddd          | _        d S )Nr   )r   Tr   r  )r=   r>   r   AdaptiveAvgPool2dglobal_poolr)   r   )rH   r*   r+   r,   rI   s       r   r>   zMobileViTV2ASPPPooling.__init__  s^    /A>>>,#%"!
 
 
r   rJ   c                     |j         dd          }|                     |          }|                     |          }t          j                            ||dd          }|S )Nr-  bilinearFsizemodealign_corners)r   rH  r   r   r   interpolate)rH   rJ   spatial_sizes      r   rL   zMobileViTV2ASPPPooling.forward  sZ    ~bcc*##H--==**=,,XLzin,oor   rq   rT   s   @r   rE  rE    s        
0 
s 
RU 
Z^ 
 
 
 
 
 
         r   rE  c                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )MobileViTV2ASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r*   r   Nc                    t                                                       t          dj        z  d          }|j        t          j                  dk    rt          d          t          j	                    | _
        t          dd          }| j
                            |           | j
                            fd	j        D                        t                    }| j
                            |           t          d
z  dd          | _        t          j        j                  | _        d S )Nr   r   r   r   z"Expected 3 values for atrous_ratesr   r   rY   c           
      :    g | ]}t          d |d          S )r   r   )r+   r,   r-   r1   r3   )r)   )r   rater*   r+   r,   s     r   r   z,MobileViTV2ASPP.__init__.<locals>.<listcomp>  sL     
 
 
  % +!- !!#)  
 
 
r      rv   )r=   r>   r   r   aspp_out_channelslenatrous_ratesr?   r   ri   convsr)   rl   extendrE  projectry   aspp_dropout_probr   )rH   r*   encoder_out_channelsin_projection
pool_layerr+   r,   rI   s    `   @@r   r>   zMobileViTV2ASPP.__init__  so   -cF4K.KUVWWW*/v"##q((ABBB]__
,#%!
 
 
 	
-(((

 
 
 
 
 
 #/
 
 
	
 	
 	
 ,FKNN

*%%%+L 0|YZkq
 
 
 zF$<===r   rJ   c                     g }| j         D ] }|                     ||                     !t          j        |d          }|                     |          }|                     |          }|S )Nr   r   )rZ  rl   rQ   catr\  r   )rH   rJ   pyramidconvpooled_featuress        r   rL   zMobileViTV2ASPP.forward)  sq    J 	+ 	+DNN44>>****)G+++,,w//,,77r   
rM   rN   rO   rd   r   r>   rQ   rR   rL   rS   rT   s   @r   rR  rR    s}         *>0 *>T *> *> *> *> *> *>X         r   rR  c                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )MobileViTV2DeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r*   r   Nc           	          t                                                       t          |          | _        t	          j        |j                  | _        t          ||j	        |j
        dddd          | _        d S )Nr   FT)r+   r,   r-   r2   r3   r0   )r=   r>   rR  asppr   	Dropout2dclassifier_dropout_probr   r)   rW  r:  r<  rH   r*   rI   s     r   r>   zMobileViTV2DeepLabV3.__init__:  sq    #F++	|F$BCC.0*# 
 
 
r   r|   c                     |                      |d                   }|                     |          }|                     |          }|S )Nr   )rj  r   r<  )rH   r|   rJ   s      r   rL   zMobileViTV2DeepLabV3.forwardJ  s?    99]2.//<<))??8,,r   rf  rT   s   @r   rh  rh  5  s|         
0 
T 
 
 
 
 
 
 U\ el        r   rh  zZ
    MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZe	 	 	 	 d
deej                 deej                 dee	         dee	         de
eef         f
d	            Z xZS )"MobileViTV2ForSemanticSegmentationr*   r   Nc                     t                                          |           |j        | _        t          |d          | _        t          |          | _        |                                  d S )NF)r  )r=   r>   r:  r  r  rh  segmentation_headr"  rm  s     r   r>   z+MobileViTV2ForSemanticSegmentation.__init__W  sb        ++F%HHH!5f!=!= 	r   r  r=  r   r   c                 B   ||n| j         j        }||n| j         j        }|| j         j        dk    rt	          d          |                     |d|          }|r|j        n|d         }|                     |          }d}|Vt          j	        
                    ||j        dd         dd	          }	t          | j         j        
          }
 |
|	|          }|s)|r|f|dd         z   }n|f|dd         z   }||f|z   n|S t          |||r|j        ndd          S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
        >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr,  r-  rJ  FrK  )ignore_indexr   )r?  r@  r|   
attentions)r*   r   r/  r:  r?   r  r|   rr  r   r   rO  r   r   semantic_loss_ignore_indexr   )rH   r  r=  r   r   rB  encoder_hidden_statesr@  r?  upsampled_logitsloss_fctr3  s               r   rL   z*MobileViTV2ForSemanticSegmentation.forwarda  s   H %9$D  $+Jj 	 &1%<kk$+B]$+"8A"="=NOOO""!%# # 
 
 :E T 5 5'RS*''(=>>!}88V\"##.Zu  9     (T[5[\\\H8,f55D 	F# 1 WQRR[0 WQRR[0)-)9TGf$$vE&3GQ'//T	
 
 
 	
r   rC  )rM   rN   rO   r   r>   r   r   rQ   rR   rP   r   r   r   rL   rS   rT   s   @r   rp  rp  Q  s        0 T        04)-/3&*I
 I
u|,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
 I
 ^I
 I
 I
 I
 I
r   rp  )r6  rp  r  r  )r   N)1rd   typingr   r   rQ   r   torch.nnr   activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   utilsr   r   configuration_mobilevitv2r   
get_loggerrM   loggerr   r   r   r'   r  r)   rV   rf   rs   r   r   r   r   r   r  r  r6  rE  rR  rh  rp  __all__r   r   r   <module>r     s  " !   " " " " " " " "        % % % % % % ! ! ! ! ! ! 9 9 9 9 9 9            . - - - - - , , , , , , , , 8 8 8 8 8 8 
	H	%	% #  HSM UX     ).fe - - - - -Y^ - - - -
= = = = =29 = = =B-F -F -F -F -F") -F -F -Fb    	   .< < < < <RY < < <~& & & & &RY & & &R    ")   6    RY   0o o o o o1 o o odcp cp cp cp cp cp cp cpL * * * * * * * *( O
 O
 O
 O
 O
1 O
 O
 O
d   4
 4
 4
 4
 4
(B 4
 4
 4
p    RY   09 9 9 9 9bi 9 9 9z    29   8   
U
 U
 U
 U
 U
)C U
 U
 
U
p  r   