
     `i              	          d Z ddlZddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddlmZ  ej        e          Zd?de de dee          de fdZ! G d dej"                  Z# G d dej"                  Z$ G d dej"                  Z% G d dej"                  Z& G d dej"                  Z' G d d ej"                  Z( G d! d"ej"                  Z) G d# d$ej"                  Z* G d% d&ej"                  Z+ G d' d(ej"                  Z, G d) d*e          Z- G d+ d,ej"                  Z.e G d- d.e                      Z/e G d/ d0e/                      Z0 ed12           G d3 d4e/                      Z1 G d5 d6ej"                  Z2 G d7 d8ej"                  Z3 G d9 d:ej"                  Z4 ed;2           G d< d=e/                      Z5g d>Z6dS )@zPyTorch MobileViT model.    N)OptionalUnion)nn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )MobileViTConfig   valuedivisor	min_valuereturnc                     ||}t          |t          | |dz  z             |z  |z            }|d| z  k     r||z  }t          |          S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    N   g?)maxint)r   r   r   	new_values       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler!   +   s^     	Is57Q;#6777BWLMMI3;W	y>>    c                        e Zd Z	 	 	 	 	 	 ddededededed	ed
edededeeef         ddf fdZde	j
        de	j
        fdZ xZS )MobileViTConvLayerr   FTconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 n   t                                                       t          |dz
  dz            |z  }||z  dk    rt          d| d| d          ||z  dk    rt          d| d| d          t	          j        ||||||||d		  	        | _        |	rt	          j        |d
ddd          | _        nd | _        |
rjt          |
t                    rt          |
         | _        d S t          |j        t                    rt          |j                 | _        d S |j        | _        d S d | _        d S )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r&   r'   r(   r)   paddingr,   r*   r+   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r1   	__class__s               r    r9   zMobileViTConvLayer.__init__;   sz    	{Q!+,,x71$$dddTZdddeee& A%%fffV\fffggg9#%# 

 

 

  		&!#)$(" " "D "&D 	#.#.. 4"("8F-s33 4"():";"("3"DOOOr"   featuresc                     |                      |          }| j        |                     |          }| j        |                     |          }|S N)r<   r>   rA   )rC   rE   s     r    forwardzMobileViTConvLayer.forwardq   sO    ##H--)))(33H?&x00Hr"   )r   r   Fr   TT)__name__
__module____qualname__r   r   boolr   r@   r9   torchTensorrH   __classcell__rD   s   @r    r$   r$   :   s         "&+/4# 4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4# 4# 4# 4# 4# 4#l         r"   r$   c                   d     e Zd ZdZ	 ddedededededd	f fd
Zdej        dej        fdZ	 xZ
S )MobileViTInvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r   r%   r&   r'   r)   r,   r   Nc           	         t                                                       t          t          t	          ||j        z                      d          }|dvrt          d| d          |dk    o||k    | _        t          |||d          | _	        t          |||d|||          | _
        t          |||dd	
          | _        d S )Nr   )r   r   zInvalid stride .r   r&   r'   r(   r   )r&   r'   r(   r)   r*   r,   Fr&   r'   r(   r.   )r8   r9   r!   r   roundexpand_ratior:   use_residualr$   
expand_1x1conv_3x3
reduce_1x1)rC   r%   r&   r'   r)   r,   expanded_channelsrD   s          r    r9   z"MobileViTInvertedResidual.__init__   s     	*3u[6CV5V/W/W+X+XZ[\\8v888999#q[K{l/J,:KYZ
 
 
 +)*$
 
 
 -)% 
 
 
r"   rE   c                     |}|                      |          }|                     |          }|                     |          }| j        r||z   n|S rG   )rZ   r[   r\   rY   )rC   rE   residuals      r    rH   z!MobileViTInvertedResidual.forward   sR    ??8,,==**??8,,&*&7Ex(""XEr"   r   )rI   rJ   rK   __doc__r   r   r9   rM   rN   rH   rO   rP   s   @r    rR   rR   z   s         
 jk
 
%
47
GJ
TW
cf
	
 
 
 
 
 
BF F F F F F F F F Fr"   rR   c                   `     e Zd Z	 ddedededededdf fd	Zd
ej        dej        fdZ xZ	S )MobileViTMobileNetLayerr   r%   r&   r'   r)   
num_stagesr   Nc                 
   t                                                       t          j                    | _        t          |          D ]9}t          ||||dk    r|nd          }| j                            |           |}:d S )Nr   r   )r&   r'   r)   )r8   r9   r   
ModuleListlayerrangerR   append)	rC   r%   r&   r'   r)   rd   irg   rD   s	           r    r9   z MobileViTMobileNetLayer.__init__   s     	]__
z"" 	' 	'A-')!"avvQ	  E Je$$$&KK	' 	'r"   rE   c                 0    | j         D ]} ||          }|S rG   rg   )rC   rE   layer_modules      r    rH   zMobileViTMobileNetLayer.forward   s)     J 	. 	.L#|H--HHr"   )r   r   
rI   rJ   rK   r   r   r9   rM   rN   rH   rO   rP   s   @r    rc   rc      s        op' '%'47'GJ'TW'il'	' ' ' ' ' '          r"   rc   c                   P     e Zd Zdededdf fdZdej        dej        fdZ xZ	S )MobileViTSelfAttentionr%   hidden_sizer   Nc                 2   t                                                       ||j        z  dk    rt          d| d|j         d          |j        | _        t	          ||j        z            | _        | j        | j        z  | _        t          j        || j        |j	                  | _
        t          j        || j        |j	                  | _        t          j        || j        |j	                  | _        t          j        |j                  | _        d S )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rT   )r+   )r8   r9   num_attention_headsr:   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrC   r%   rq   rD   s      r    r9   zMobileViTSelfAttention.__init__   s   33q887; 7 737 7 7  
 $*#= #&{V5O'O#P#P !58PPY{D,>V_UUU
9[$*<6?SSSY{D,>V_UUU
z&"EFFr"   hidden_statesc                    |j         \  }}}|                     |                              |d| j        | j                                      dd          }|                     |                              |d| j        | j                                      dd          }|                     |                              |d| j        | j                                      dd          }t          j	        ||                    dd                    }|t          j        | j                  z  }t          j                            |d          }	|                     |	          }	t          j	        |	|          }
|
                    dddd                                          }
|
                                d d         | j        fz   } |
j        | }
|
S )Nr   r   dimr   r   )shaperx   viewrs   rt   	transposery   r   rM   matmulmathsqrtr   
functionalsoftmaxr|   permute
contiguoussizeru   )rC   r~   
batch_size
seq_length_query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes               r    rH   zMobileViTSelfAttention.forward   s   $1$7!
JJJ}%%T*b$":D<TUUYq!__ 	 HH]##T*b$":D<TUUYq!__ 	 JJ}%%T*b$":D<TUUYq!__ 	 !<Y5H5HR5P5PQQ+di8P.Q.QQ -//0@b/II ,,77_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CDr"   rn   rP   s   @r    rp   rp      s        G GS GT G G G G G G&"U\ "el " " " " " " " "r"   rp   c                   P     e Zd Zdededdf fdZdej        dej        fdZ xZ	S )MobileViTSelfOutputr%   rq   r   Nc                     t                                                       t          j        ||          | _        t          j        |j                  | _        d S rG   r8   r9   r   rv   denserz   hidden_dropout_probr|   r}   s      r    r9   zMobileViTSelfOutput.__init__   sD    Y{K88
z&"<==r"   r~   c                 Z    |                      |          }|                     |          }|S rG   r   r|   rC   r~   s     r    rH   zMobileViTSelfOutput.forward   s*    

=11]33r"   rn   rP   s   @r    r   r      sx        > >S >T > > > > > >
U\ el        r"   r   c                   l     e Zd Zdededdf fdZdee         ddfdZdej	        dej	        fd	Z
 xZS )
MobileViTAttentionr%   rq   r   Nc                     t                                                       t          ||          | _        t	          ||          | _        t                      | _        d S rG   )r8   r9   rp   	attentionr   outputsetpruned_headsr}   s      r    r9   zMobileViTAttention.__init__  sM    /DD)&+>>EEr"   headsc                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )lenr   r   rs   rt   r   r   rx   ry   r   r   r   ru   union)rC   r   indexs      r    prune_headszMobileViTAttention.prune_heads  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r"   r~   c                 Z    |                      |          }|                     |          }|S rG   )r   r   )rC   r~   self_outputsattention_outputs       r    rH   zMobileViTAttention.forward  s+    ~~m44;;|44r"   )rI   rJ   rK   r   r   r9   r   r   rM   rN   rH   rO   rP   s   @r    r   r     s        " "S "T " " " " " ";S ;d ; ; ; ;$ U\  el                r"   r   c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTIntermediater%   rq   intermediate_sizer   Nc                     t                                                       t          j        ||          | _        t          |j        t                    rt          |j                 | _	        d S |j        | _	        d S rG   )
r8   r9   r   rv   r   r?   rB   r@   r   intermediate_act_fnrC   r%   rq   r   rD   s       r    r9   zMobileViTIntermediate.__init__&  si    Y{,=>>
f'-- 	9'-f.?'@D$$$'-'8D$$$r"   r~   c                 Z    |                      |          }|                     |          }|S rG   )r   r   r   s     r    rH   zMobileViTIntermediate.forward.  s,    

=1100??r"   rn   rP   s   @r    r   r   %  s        9 9S 9UX 9]a 9 9 9 9 9 9U\ el        r"   r   c                   b     e Zd Zdedededdf fdZdej        dej        dej        fd	Z xZ	S )
MobileViTOutputr%   rq   r   r   Nc                     t                                                       t          j        ||          | _        t          j        |j                  | _        d S rG   r   r   s       r    r9   zMobileViTOutput.__init__5  sE    Y0+>>
z&"<==r"   r~   input_tensorc                 d    |                      |          }|                     |          }||z   }|S rG   r   )rC   r~   r   s      r    rH   zMobileViTOutput.forward:  s4    

=11]33%4r"   rn   rP   s   @r    r   r   4  s        > >S >UX >]a > > > > > >
U\  RWR^        r"   r   c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTTransformerLayerr%   rq   r   r   Nc                 J   t                                                       t          ||          | _        t	          |||          | _        t          |||          | _        t          j	        ||j
                  | _        t          j	        ||j
                  | _        d S )Nr4   )r8   r9   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r    r9   z"MobileViTTransformerLayer.__init__B  s    +FK@@1&+GXYY%fk;LMM "[f>S T T T!|KV=RSSSr"   r~   c                     |                      |                     |                    }||z   }|                     |          }|                     |          }|                     ||          }|S rG   )r   r   r   r   r   )rC   r~   r   layer_outputs       r    rH   z!MobileViTTransformerLayer.forwardJ  sk    >>$*?*?*N*NOO(=8++M::((66{{<??r"   rn   rP   s   @r    r   r   A  s        T TS TUX T]a T T T T T TU\ el        r"   r   c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTTransformerr%   rq   rd   r   Nc           	         t                                                       t          j                    | _        t          |          D ]C}t          ||t          ||j        z                      }| j        	                    |           Dd S )N)rq   r   )
r8   r9   r   rf   rg   rh   r   r   	mlp_ratiori   )rC   r%   rq   rd   r   transformer_layerrD   s         r    r9   zMobileViTTransformer.__init__U  s    ]__
z"" 	1 	1A 9'"%kF4D&D"E"E! ! !
 J/0000	1 	1r"   r~   c                 0    | j         D ]} ||          }|S rG   rl   )rC   r~   rm   s      r    rH   zMobileViTTransformer.forwarda  s*     J 	8 	8L(L77MMr"   rn   rP   s   @r    r   r   T  s        
1 
1S 
1c 
1VZ 
1 
1 
1 
1 
1 
1U\ el        r"   r   c                        e Zd ZdZ	 ddedededededed	ed
df fdZdej        d
e	ej        e
f         fdZdej        de
d
ej        fdZdej        d
ej        fdZ xZS )MobileViTLayerzC
    MobileViT block: https://huggingface.co/papers/2110.02178
    r   r%   r&   r'   r)   rq   rd   r,   r   Nc                 <   t                                                       |j        | _        |j        | _        |dk    r/t          ||||dk    r|nd|dk    r|dz  nd          | _        |}nd | _        t          ||||j                  | _	        t          |||ddd          | _
        t          |||          | _        t          j        ||j                  | _        t          |||d          | _        t          |d|z  ||j                  | _        d S )	Nr   r   )r&   r'   r)   r,   rU   F)r&   r'   r(   r-   r.   )rq   rd   r   )r8   r9   
patch_sizepatch_widthpatch_heightrR   downsampling_layerr$   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	rC   r%   r&   r'   r)   rq   rd   r,   rD   s	           r    r9   zMobileViTLayer.__init__l  sg    	!,"-Q;;&?')!)QvvA*2Q,,QA' ' 'D# 'KK&*D#*#$/	
 
 
 +#$# 
 
 
 0#!
 
 
 kv7LMMM1+ST 
  
  
 )KkW]Wn
 
 
r"   rE   c                    | j         | j        }}t          ||z            }|j        \  }}}}t          j                                        r't          t	          j        ||z            |z            n&t          t          j        ||z            |z            }	t          j                                        r't          t	          j        ||z            |z            n&t          t          j        ||z            |z            }
d}|
|k    s|	|k    r't          j                            ||	|
fdd          }d}|
|z  }|	|z  }||z  }|                    ||z  |z  |||          }|                    dd          }|                    ||||          }|                    dd          }|                    ||z  |d          }||f||||||d	}||fS )
NFbilinearr   modealign_cornersTr   r   r   r   )	orig_sizer   channelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r   r   rM   jit
is_tracingr   ceilr   r   r   r   reshaper   )rC   rE   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dicts                    r    	unfoldingzMobileViTLayer.unfolding  s   $($4d6G\|344
8@5
Hk: y##%%KIej|!;<<|KLLLTY{\9::\IJJ 	 y##%%HIejk!9::[HIIITYzK788;FGG 	 
""jK&?&?}00
I6ZW\ 1  H K ${2%5&8 ""!$44lOU`
 
 ##Aq))//*hZPP##Aq))//*z"9;KK &z2$ &&!0"2
 
	 	!!r"   r   r   c                    | j         | j        }}t          ||z            }|d         }|d         }|d         }|d         }	|d         }
|                                                    |||d          }|                    dd          }|                    ||z  |	z  |
||          }|                    dd	          }|                    |||	|z  |
|z            }|d
         r)t          j        	                    ||d         dd          }|S )Nr   r   r   r   r   r   r   r   r   r   r   r   Fr   )
r   r   r   r   r   r   r   r   r   r   )rC   r   r   r   r   r   r   r   r   r   r   rE   s               r    foldingzMobileViTLayer.folding  sA   $($4d6G\|344
|,
Z(.$%9:#$78 %%'',,Z[RTUU%%a++##!$44o|U`
 
 %%a++##"2\"A?U`C`
 
 ]# 	}00y5JV[ 1  H r"   c                    | j         r|                      |          }|}|                     |          }|                     |          }|                     |          \  }}|                     |          }|                     |          }|                     ||          }|                     |          }|                     t          j
        ||fd                    }|S Nr   r   )r   r   r   r   r   r   r   r   r   rM   cat)rC   rE   r_   r   r   s        r    rH   zMobileViTLayer.forward  s    " 	9..x88H ==**==** "^^H55 ""7++..)) <<33''11;;uy(H)=1EEEFFr"   r`   )rI   rJ   rK   ra   r   r   r9   rM   rN   tupledictr   r   rH   rO   rP   s   @r    r   r   g  s$         8
 8
8
 8
 	8

 8
 8
 8
 8
 
8
 8
 8
 8
 8
 8
t1"%, 1"5t9K3L 1" 1" 1" 1"fu|      :         r"   r   c                   `     e Zd Zdeddf fdZ	 	 ddej        ded	edee	e
f         fd
Z xZS )MobileViTEncoderr%   r   Nc           	         t                                                       || _        t          j                    | _        d| _        dx}}|j        dk    rd}d}n|j        dk    rd}d}t          ||j	        d         |j	        d         dd          }| j        
                    |           t          ||j	        d         |j	        d         dd	          }| j        
                    |           t          ||j	        d         |j	        d	         d|j        d         d
          }| j        
                    |           |r|dz  }t          ||j	        d	         |j	        d         d|j        d         d|          }| j        
                    |           |r|dz  }t          ||j	        d         |j	        d         d|j        d         d	|          }	| j        
                    |	           d S )NFr   T   r   r   )r&   r'   r)   rd   r   r   )r&   r'   r)   rq   rd      )r&   r'   r)   rq   rd   r,      )r8   r9   r%   r   rf   rg   gradient_checkpointingoutput_striderc   neck_hidden_sizesri   r   hidden_sizes)rC   r%   dilate_layer_4dilate_layer_5r,   layer_1layer_2layer_3layer_4layer_5rD   s             r    r9   zMobileViTEncoder.__init__  s9   ]__
&+# +0/1$$!N!NN!R''!N)031!4
 
 
 	
'""")031!4
 
 
 	
'""" 031!4+A.
 
 
 	
'""" 	MH 031!4+A.
 
 
 	
'""" 	MH 031!4+A.
 
 
 	
'"""""r"   FTr~   output_hidden_statesreturn_dictc                     |rdnd }t          | j                  D ]\  }} ||          }|r||fz   }|st          d ||fD                       S t          ||          S )N c              3      K   | ]}||V  	d S rG   r  ).0vs     r    	<genexpr>z+MobileViTEncoder.forward.<locals>.<genexpr>j  s"      XXq!-----XXr"   )last_hidden_stater~   )	enumeraterg   r   r
   )rC   r~   r  r  all_hidden_statesrj   rm   s          r    rH   zMobileViTEncoder.forward[  s     #7@BBD(44 	I 	IOA|(L77M# I$58H$H! 	YXX]4E$FXXXXXX-]noooor"   )FT)rI   rJ   rK   r   r9   rM   rN   rL   r   r   r
   rH   rO   rP   s   @r    r   r     s        H# H#4 H# H# H# H# H# H#Z &+ 	p p|p #p 	p
 
u44	5p p p p p p p pr"   r   c                   F    e Zd ZU eed<   dZdZdZdgZde	j
        ddfd	ZdS )
MobileViTPreTrainedModelr%   	mobilevitpixel_valuesTr   moduler   Nc                    t          |t          j        t          j        t          j        f          rT|j        j                            d| j        j	                   |j
         |j
        j                                         dS dS t          |t          j                  r?|j
        j                                         |j        j                            d           dS dS )zInitialize the weightsg        )meanstdNg      ?)r?   r   rv   r;   r=   weightdatanormal_r%   initializer_ranger+   zero_r   fill_)rC   r  s     r    _init_weightsz&MobileViTPreTrainedModel._init_weightsw  s    fry")R^DEE 	* M&&CT[5R&SSS{& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r"   )rI   rJ   rK   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   Moduler'  r  r"   r    r  r  o  s`         #$O&*#)*
*BI 
*$ 
* 
* 
* 
* 
* 
*r"   r  c                        e Zd Zddedef fdZd Ze	 	 	 ddee	j
                 dee         d	ee         d
eeef         fd            Z xZS )MobileViTModelTr%   expand_outputc                 r   t                                          |           || _        || _        t	          ||j        |j        d         dd          | _        t          |          | _	        | j        r.t	          ||j        d         |j        d         d          | _
        |                                  d	S )
aE  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
            1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
        r   r   r   )r&   r'   r(   r)   r     r   rU   N)r8   r9   r%   r0  r$   num_channelsr  	conv_stemr   encoderconv_1x1_exp	post_init)rC   r%   r0  rD   s      r    r9   zMobileViTModel.__init__  s     	   *++1!4
 
 
 (// 	 2"4Q7#5a8	! ! !D 	r"   c                     |                                 D ]U\  }}| j        j        |         }t          |t                    r)|j        j        D ]}|j                            |           VdS )zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr5  rg   r?   r   r   r   r   )rC   heads_to_prunelayer_indexr   mobilevit_layerr   s         r    _prune_headszMobileViTModel._prune_heads  s     #1"6"6"8"8 	C 	CK"l0=O/>:: C)8)D)J C C%%/;;EBBBB		C 	Cr"   Nr  r  r  r   c                    ||n| j         j        }||n| j         j        }|t          d          |                     |          }|                     |||          }| j        r5|                     |d                   }t          j	        |ddgd          }n
|d         }d }|s|||fn|f}||dd          z   S t          |||j        	          S )
Nz You have to specify pixel_valuesr  r  r   r   r   F)r   keepdimr   )r  pooler_outputr~   )r%   r  use_return_dictr:   r4  r5  r0  r6  rM   r  r   r~   )	rC   r  r  r  embedding_outputencoder_outputsr  pooled_outputr   s	            r    rH   zMobileViTModel.forward  s'    %9$D  $+Jj 	 &1%<kk$+B]?@@@>>,77,,!5# ' 
 
  	! $ 1 1/!2D E E "J'8r2hPUVVVMM / 2 M 	0;H;T'77[lZnFOABB///7/')7
 
 
 	
r"   )T)NNN)rI   rJ   rK   r   rL   r9   r=  r   r   rM   rN   r   r   r   rH   rO   rP   s   @r    r/  r/    s          t      >C C C  04/3&*	'
 '
u|,'
 'tn'
 d^	'

 
u>>	?'
 '
 '
 ^'
 '
 '
 '
 '
r"   r/  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                        e Zd Zdeddf fdZe	 	 	 	 d
deej                 dee	         deej                 dee	         de
eef         f
d	            Z xZS )MobileViTForImageClassificationr%   r   Nc                    t                                          |           |j        | _        t          |          | _        t          j        |j        d          | _        |j        dk    r%t          j	        |j
        d         |j                  nt          j                    | _        |                                  d S )NT)inplacer   r   )r8   r9   
num_labelsr/  r  r   rz   classifier_dropout_probr|   rv   r  Identity
classifierr7  rC   r%   rD   s     r    r9   z(MobileViTForImageClassification.__init__  s        +'// z&"@$OOOJPJ[^_J_J_BIf.r2F4EFFFegeperer 	
 	r"   r  r  labelsr  c                 f   ||n| j         j        }|                     |||          }|r|j        n|d         }|                     |                     |                    }d}||                     ||| j                   }|s|f|dd         z   }	||f|	z   n|	S t          |||j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr?  r   r   )losslogitsr~   )	r%   rB  r  rA  rN  r|   loss_functionr   r~   )
rC   r  r  rP  r  outputsrE  rS  rR  r   s
             r    rH   z'MobileViTForImageClassification.forward  s     &1%<kk$+B]..DXfq.rr1<L--'!*m!<!<==%%ffdkBBD 	FY,F)-)9TGf$$vE3!/
 
 
 	
r"   NNNN)rI   rJ   rK   r   r9   r   r   rM   rN   rL   r   r   r   rH   rO   rP   s   @r    rH  rH    s         4        04/3)-&*!
 !
u|,!
 'tn!
 &	!

 d^!
 
u::	;!
 !
 !
 ^!
 !
 !
 !
 !
r"   rH  c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTASPPPoolingr%   r&   r'   r   Nc           	          t                                                       t          j        d          | _        t          |||dddd          | _        d S )Nr   )output_sizeTrelu)r&   r'   r(   r)   r-   r.   )r8   r9   r   AdaptiveAvgPool2dglobal_poolr$   r   )rC   r%   r&   r'   rD   s       r    r9   zMobileViTASPPPooling.__init__  s^    /A>>>*#%"!
 
 
r"   rE   c                     |j         dd          }|                     |          }|                     |          }t          j                            ||dd          }|S )Nr   r   Fr   )r   r]  r   r   r   r   )rC   rE   spatial_sizes      r    rH   zMobileViTASPPPooling.forward%  sZ    ~bcc*##H--==**=,,XLzin,oor"   rn   rP   s   @r    rX  rX    s        
 
S 
PS 
X\ 
 
 
 
 
 
         r"   rX  c                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )MobileViTASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r%   r   Nc                 v   t                                                       j        d         j        t	          j                  dk    rt          d          t          j                    | _	        t          dd          }| j	                            |           | j	                            fdj        D                        t                    }| j	                            |           t          dz  dd          | _        t          j        j        	          | _        d S )
Nr   r   z"Expected 3 values for atrous_ratesr   r[  rV   c           
      :    g | ]}t          d |d          S )r   r[  )r&   r'   r(   r,   r.   )r$   )r  rater%   r&   r'   s     r    
<listcomp>z*MobileViTASPP.__init__.<locals>.<listcomp>G  sL     
 
 
  # +!- !!#)  
 
 
r"   r  )p)r8   r9   r  aspp_out_channelsr   atrous_ratesr:   r   rf   convsr$   ri   extendrX  projectrz   aspp_dropout_probr|   )rC   r%   in_projection
pool_layerr&   r'   rD   s    `  @@r    r9   zMobileViTASPP.__init__2  s[   .r2/v"##q((ABBB]__
*#%!
 
 
 	
-(((

 
 
 
 
 
 #/
 
 
	
 	
 	
 *&+|LL

*%%%)L 0|YZkq
 
 
 zF$<===r"   rE   c                     g }| j         D ] }|                     ||                     !t          j        |d          }|                     |          }|                     |          }|S r   )ri  ri   rM   r   rk  r|   )rC   rE   pyramidconvpooled_featuress        r    rH   zMobileViTASPP.forward]  sq    J 	+ 	+DNN44>>****)G+++,,w//,,77r"   
rI   rJ   rK   ra   r   r9   rM   rN   rH   rO   rP   s   @r    ra  ra  -  s|         )> )>4 )> )> )> )> )> )>V         r"   ra  c                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )MobileViTDeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r%   r   Nc           	          t                                                       t          |          | _        t	          j        |j                  | _        t          ||j	        |j
        dddd          | _        d S )Nr   FT)r&   r'   r(   r-   r.   r+   )r8   r9   ra  asppr   	Dropout2drL  r|   r$   rg  rK  rN  rO  s     r    r9   zMobileViTDeepLabV3.__init__m  sq    !&))	|F$BCC,0*# 
 
 
r"   r~   c                     |                      |d                   }|                     |          }|                     |          }|S )Nr   )rw  r|   rN  )rC   r~   rE   s      r    rH   zMobileViTDeepLabV3.forward}  s?    99]2.//<<))??8,,r"   rs  rP   s   @r    ru  ru  h  s{         
 
4 
 
 
 
 
 
 U\ el        r"   ru  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZe	 	 	 	 d
deej                 deej                 dee	         dee	         de
eef         f
d	            Z xZS ) MobileViTForSemanticSegmentationr%   r   Nc                     t                                          |           |j        | _        t          |d          | _        t          |          | _        |                                  d S )NF)r0  )r8   r9   rK  r/  r  ru  segmentation_headr7  rO  s     r    r9   z)MobileViTForSemanticSegmentation.__init__  sa        +'eDDD!3F!;!; 	r"   r  rP  r  r  c                 B   ||n| j         j        }||n| j         j        }|| j         j        dk    rt	          d          |                     |d|          }|r|j        n|d         }|                     |          }d}|Vt          j	        
                    ||j        dd         dd	          }	t          | j         j        
          }
 |
|	|          }|s)|r|f|dd         z   }n|f|dd         z   }||f|z   n|S t          |||r|j        ndd          S )a{  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr?  r   r   Fr   )ignore_indexr   )rR  rS  r~   
attentions)r%   r  rB  rK  r:   r  r~   r}  r   r   r   r   r   semantic_loss_ignore_indexr   )rC   r  rP  r  r  rU  encoder_hidden_statesrS  rR  upsampled_logitsloss_fctr   s               r    rH   z(MobileViTForSemanticSegmentation.forward  s   H %9$D  $+Jj 	 &1%<kk$+B]$+"8A"="=NOOO..!%# ! 
 
 :E T 5 5'RS*''(=>>!}88V\"##.Zu  9     (T[5[\\\H8,f55D 	F# 1 WQRR[0 WQRR[0)-)9TGf$$vE&3GQ'//T	
 
 
 	
r"   rV  )rI   rJ   rK   r   r9   r   r   rM   rN   rL   r   r   r   rH   rO   rP   s   @r    r{  r{    s         4        04)-/3&*I
 I
u|,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
 I
 ^I
 I
 I
 I
 I
r"   r{  )rH  r{  r/  r  )r   N)7ra   r   typingr   r   rM   r   torch.nnr   activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   configuration_mobilevitr   
get_loggerrI   loggerr   r!   r-  r$   rR   rc   rp   r   r   r   r   r   r   r   r   r  r/  rH  rX  ra  ru  r{  __all__r  r"   r    <module>r     sS  "    " " " " " " " "        % % % % % % ! ! ! ! ! ! 9 9 9 9 9 9            . - - - - - Q Q Q Q Q Q Q Q 7 7 7 7 7 7 7 7 7 7 4 4 4 4 4 4 
	H	%	% #  HSM UX    = = = = = = = =@-F -F -F -F -F	 -F -F -F`    bi   .6 6 6 6 6RY 6 6 6r	 	 	 	 	") 	 	 	               >    BI   
 
 
 
 
bi 
 
 
    	   &    29   &f f f f f/ f f fR\p \p \p \p \pry \p \p \p~ * * * * * * * *( R
 R
 R
 R
 R
- R
 R
 R
j   2
 2
 2
 2
 2
&> 2
 2
 2
j    29   08 8 8 8 8BI 8 8 8v       8   
U
 U
 U
 U
 U
'? U
 U
 
U
p  r"   