
     `i{N              	           d Z ddlZddlmZ ddlmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZ dd
lmZmZmZmZ ddlmZ  ej        e          ZdZdZg dZdZdZd7dej         de!de"dej         fdZ# G d dej$                  Z% G d dej$                  Z& G d dej$                  Z' G d dej$                  Z( G d  d!ej$                  Z) G d" d#ej$                  Z* G d$ d%ej$                  Z+ G d& d'ej$                  Z, G d( d)ej$                  Z- G d* d+ej$                  Z. G d, d-e          Z/d.Z0d/Z1 ed0e0           G d1 d2e/                      Z2 ed3e0           G d4 d5e/                      Z3g d6Z4dS )8z-PyTorch Visual Attention Network (VAN) model.    N)OrderedDict)OptionalUnion)nn   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )	VanConfigr   z!Visual-Attention-Network/van-base)r   i      r   ztabby, tabby cat        Finput	drop_probtrainingreturnc                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutputs          /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/deprecated/van/modeling_van.py	drop_pathr&   1   s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FM    c                   j     e Zd ZdZd	dee         ddf fdZdej        dej        fdZ	de
fdZ xZS )
VanDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                 V    t                                                       || _        d S N)super__init__r   )selfr   	__class__s     r%   r-   zVanDropPath.__init__H   s$    "r'   hidden_statesc                 8    t          || j        | j                  S r+   )r&   r   r   )r.   r0   s     r%   forwardzVanDropPath.forwardL   s    FFFr'   c                     d| j          S )Nzp=)r   )r.   s    r%   
extra_reprzVanDropPath.extra_reprO   s    $DN$$$r'   r+   )__name__
__module____qualname____doc__r   floatr-   r   Tensorr2   strr4   __classcell__r/   s   @r%   r)   r)   E   s        bb# #(5/ #T # # # # # #GU\ Gel G G G G%C % % % % % % % %r'   r)   c            	       Z     e Zd ZdZddedededef fdZd	ej        d
ej        fdZ xZ	S )VanOverlappingPatchEmbeddera  
    Downsamples the input using a patchify operation with a `stride` of 4 by default making adjacent windows overlap by
    half of the area. From [PVTv2: Improved Baselines with Pyramid Vision
    Transformer](https://huggingface.co/papers/2106.13797).
    r   r   in_channelshidden_size
patch_sizestridec                     t                                                       t          j        |||||dz            | _        t          j        |          | _        d S )N   )kernel_sizerC   padding)r,   r-   r   Conv2dconvolutionBatchNorm2dnormalization)r.   r@   rA   rB   rC   r/   s        r%   r-   z$VanOverlappingPatchEmbedder.__init__Z   s^    9*VU_cdUd
 
 
  ^K88r'   r   r   c                 Z    |                      |          }|                     |          }|S r+   )rI   rK   )r.   r   hidden_states      r%   r2   z#VanOverlappingPatchEmbedder.forwarda   s.    ''..)),77r'   )r   r   
r5   r6   r7   r8   intr-   r   r:   r2   r<   r=   s   @r%   r?   r?   S   s         9 9C 9c 9s 9X[ 9 9 9 9 9 9U\ el        r'   r?   c                   b     e Zd ZdZ	 	 ddededededef
 fd	Zd
ej	        dej	        fdZ
 xZS )VanMlpLayerz
    MLP with depth-wise convolution, from [PVTv2: Improved Baselines with Pyramid Vision
    Transformer](https://huggingface.co/papers/2106.13797).
    gelu      ?r@   rA   out_channels
hidden_actdropout_ratec                 |   t                                                       t          j        ||d          | _        t          j        ||dd|          | _        t          |         | _        t          j        |          | _	        t          j        ||d          | _
        t          j        |          | _        d S )Nr   rF      rF   rG   groups)r,   r-   r   rH   in_dense
depth_wiser   
activationDropoutdropout1	out_densedropout2)r.   r@   rA   rT   rU   rV   r/   s         r%   r-   zVanMlpLayer.__init__m   s     		+{JJJ)K!UV_jkkk ,
<00;!LLL
<00r'   rM   r   c                    |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r+   )r\   r]   r^   r`   ra   rb   r.   rM   s     r%   r2   zVanMlpLayer.forward}   sn    }}\22|44|44}}\22~~l33}}\22r'   )rR   rS   )r5   r6   r7   r8   rO   r;   r9   r-   r   r:   r2   r<   r=   s   @r%   rQ   rQ   g   s          !!1 11 1 	1
 1 1 1 1 1 1 1 EL U\        r'   rQ   c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )VanLargeKernelAttentionz-
    Basic Large Kernel Attention (LKA).
    rA   c                     t                                                       t          j        ||dd|          | _        t          j        ||ddd|          | _        t          j        ||d	          | _        d S )
N   rE   rZ   r   rY   	   )rF   dilationrG   r[   r   rX   )r,   r-   r   rH   r]   depth_wise_dilated
point_wiser.   rA   r/   s     r%   r-   z VanLargeKernelAttention.__init__   s|    )K!UV_jkkk"$)!aS^#
 #
 #
 )K!LLLr'   rM   r   c                     |                      |          }|                     |          }|                     |          }|S r+   )r]   rk   rl   rd   s     r%   r2   zVanLargeKernelAttention.forward   s=    |44..|<<|44r'   rN   r=   s   @r%   rf   rf      s{         MC M M M M M MEL U\        r'   rf   c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )VanLargeKernelAttentionLayerzV
    Computes attention using Large Kernel Attention (LKA) and attends the input.
    rA   c                 p    t                                                       t          |          | _        d S r+   )r,   r-   rf   	attentionrm   s     r%   r-   z%VanLargeKernelAttentionLayer.__init__   s,    0==r'   rM   r   c                 :    |                      |          }||z  }|S r+   )rr   )r.   rM   rr   attendeds       r%   r2   z$VanLargeKernelAttentionLayer.forward   s"    NN<00	)+r'   rN   r=   s   @r%   rp   rp      st         >C > > > > > >EL U\        r'   rp   c                   R     e Zd ZdZd	dedef fdZdej        dej        fdZ	 xZ
S )
VanSpatialAttentionLayerz
    Van spatial attention layer composed by projection (via conv) -> act -> Large Kernel Attention (LKA) attention ->
    projection (via conv) + residual connection.
    rR   rA   rU   c           
      @   t                                                       t          j        t	          dt          j        ||d          fdt          |         fg                    | _        t          |          | _	        t          j        ||d          | _
        d S )Nconvr   rX   act)r,   r-   r   
Sequentialr   rH   r   pre_projectionrp   attention_layerpost_projection)r.   rA   rU   r/   s      r%   r-   z!VanSpatialAttentionLayer.__init__   s     mRY{KQOOOPF:./ 
 
  <KHH!ykqQQQr'   rM   r   c                     |}|                      |          }|                     |          }|                     |          }||z   }|S r+   )r{   r|   r}   r.   rM   residuals      r%   r2   z VanSpatialAttentionLayer.forward   sP    **<88++L99++L99#h.r'   )rR   )r5   r6   r7   r8   rO   r;   r-   r   r:   r2   r<   r=   s   @r%   rv   rv      s         
R RC RS R R R R R REL U\        r'   rv   c                   R     e Zd ZdZd	dedef fdZdej        dej        fdZ	 xZ
S )
VanLayerScalingzT
    Scales the inputs by a learnable parameter initialized by `initial_value`.
    {Gz?rA   initial_valuec                     t                                                       t          j        |t	          j        |          z  d          | _        d S )NT)requires_grad)r,   r-   r   	Parameterr   onesweight)r.   rA   r   r/   s      r%   r-   zVanLayerScaling.__init__   sC    l=5:k3J3J#JZ^___r'   rM   r   c                 f    | j                             d                              d          |z  }|S )N)r   	unsqueezerd   s     r%   r2   zVanLayerScaling.forward   s0    {,,R00::2>>Mr'   )r   )r5   r6   r7   r8   rO   r9   r-   r   r:   r2   r<   r=   s   @r%   r   r      s         ` `C ` ` ` ` ` ` `EL U\        r'   r   c            	       ^     e Zd ZdZ	 	 ddedededef fdZd	ej	        d
ej	        fdZ
 xZS )VanLayerzv
    Van layer composed by normalization layers, large kernel attention (LKA) and a multi layer perceptron (MLP).
    r   rS   configrA   	mlp_ratiodrop_path_ratec                    t                                                       |dk    rt          |          nt          j                    | _        t          j        |          | _        t          ||j	                  | _
        t          ||j                  | _        t          j        |          | _        t          |||z  ||j	        |j                  | _        t          ||j                  | _        d S )Nr   )r,   r-   r)   r   Identityr&   rJ   pre_normomalizationrv   rU   rr   r   layer_scale_init_valueattention_scalingpost_normalizationrQ   rV   mlpmlp_scaling)r.   r   rA   r   r   r/   s        r%   r-   zVanLayer.__init__   s     	8F8L8L^444RTR]R_R_#%>+#>#> 1+v?PQQ!0f>[!\!\"$."="=y0+v?PRXRe
 
 +;8UVVr'   rM   r   c                 r   |}|                      |          }|                     |          }|                     |          }|                     |          }||z   }|}|                     |          }|                     |          }|                     |          }|                     |          }||z   }|S r+   )r   rr   r   r&   r   r   r   r   s      r%   r2   zVanLayer.forward   s    //==~~l33--l;;~~l33,...|<<xx--''55~~l33,.r'   )r   rS   r5   r6   r7   r8   r   rO   r9   r-   r   r:   r2   r<   r=   s   @r%   r   r      s           #W WW W 	W
 W W W W W W$EL U\        r'   r   c                   n     e Zd ZdZ	 	 ddededededed	ed
edef fdZdej	        dej	        fdZ
 xZS )VanStagez2
    VanStage, consisting of multiple layers.
    r   r   r   r@   rA   rB   rC   depthr   r   c	                    t                                                       t          |||          | _        t	          j        fdt          |          D              | _        t	          j        j	                  | _
        d S )Nc                 6    g | ]}t                     S ))r   r   )r   ).0_r   r   rA   r   s     r%   
<listcomp>z%VanStage.__init__.<locals>.<listcomp>  sF         '#1	    r'   eps)r,   r-   r?   
embeddingsr   rz   rangelayers	LayerNormlayer_norm_epsrK   )
r.   r   r@   rA   rB   rC   r   r   r   r/   s
    ` `   ``r%   r-   zVanStage.__init__  s     	5k;PZ\bccm       u  

  \+6;PQQQr'   rM   r   c                 J   |                      |          }|                     |          }|j        \  }}}}|                    d                              dd          }|                     |          }|                    ||||                              dddd          }|S )NrE   r   r   rY   )r   r   r   flatten	transposerK   viewpermute)r.   rM   
batch_sizerA   heightwidths         r%   r2   zVanStage.forward  s    |44{{<001=1C.
K#++A..88A>>)),77#((VUKPPXXYZ\]_`bcddr'   )r   r   r   r=   s   @r%   r   r      s           #R RR R 	R
 R R R R R R R R R R4	EL 	U\ 	 	 	 	 	 	 	 	r'   r   c                   x     e Zd ZdZdef fdZ	 	 ddej        dee	         dee	         d	e
eef         fd
Z xZS )
VanEncoderz4
    VanEncoder, consisting of multiple stages.
    r   c                     t                                                       t          j        g           | _        |j        }|j        }|j        }|j        }|j	        }d t          j        d|j        t          |j                  d          D             }t          t          ||||||                    D ]U\  }\  }	}
}}}}|dk    }||dz
           }|r|j        }| j                            t%          ||||	|
|||                     Vd S )Nc                 6    g | ]}|                                 S  )item)r   xs     r%   r   z'VanEncoder.__init__.<locals>.<listcomp>8  s-     
 
 
AFFHH
 
 
r'   r   cpu)r   r   )rB   rC   r   r   r   )r,   r-   r   
ModuleListstagespatch_sizesstrideshidden_sizesdepths
mlp_ratiosr   linspacer   sum	enumeratezipnum_channelsappendr   )r.   r   r   r   r   r   r   drop_path_rates	num_stagerB   rC   rA   r   mlp_expansionr   is_first_stager@   r/   s                    r%   r-   zVanEncoder.__init__0  sQ   mB''(.*&

 
#nQ0Es6=GYGYbghhh
 
 
 clWlFJXXc
 c
 	 	^I^
FK~ '!^N&y1}5K 2$1K)!+#1	 	 	   	 	r'   FTrM   output_hidden_statesreturn_dictr   c                     |rdnd }t          | j                  D ]\  }} ||          }|r||fz   }|st          d ||fD                       S t          ||          S )Nr   c              3      K   | ]}||V  	d S r+   r   )r   vs     r%   	<genexpr>z%VanEncoder.forward.<locals>.<genexpr>_  s"      WWqWWr'   )last_hidden_stater0   )r   r   tupler	   )r.   rM   r   r   all_hidden_statesr   stage_modules          r%   r2   zVanEncoder.forwardP  s     #7@BBD(55 	H 	HOA|'<55L# H$5$G! 	XWW\3D$EWWWWWW-\mnnnnr'   )FT)r5   r6   r7   r8   r   r-   r   r:   r   boolr   r   r	   r2   r<   r=   s   @r%   r   r   +  s         y      F 05&*	o olo 'tno d^	o
 
u44	5o o o o o o o or'   r   c                   0    e Zd ZU dZeed<   dZdZdZd Z	dS )VanPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r   vanpixel_valuesTc                 H   t          |t          j                  r|t          j                            |j        | j        j                   t          |t          j                  r.|j        )t          j        	                    |j        d           dS dS dS t          |t          j
                  rLt          j        	                    |j        d           t          j        	                    |j        d           dS t          |t          j                  r|j        d         |j        d         z  |j        z  }||j        z  }|j        j                            dt#          j        d|z                       |j        "|j        j                                         dS dS dS )zInitialize the weights)stdNr   g      ?r   g       @)
isinstancer   Linearinittrunc_normal_r   r   initializer_rangebias	constant_r   rH   rF   rT   r[   datanormal_mathsqrtzero_)r.   modulefan_outs      r%   _init_weightsz VanPreTrainedModel._init_weightso  ss   fbi(( 	)G!!&-T[5R!SSS&"),, 21H!!&+q111112 21H1H-- 	)Gfk1---GfmS11111	** 	)(+f.@.CCfFYYG%GM&&q$)C'M*B*BCCC{& &&(((((	) 	) '&r'   N)
r5   r6   r7   r8   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingr   r   r'   r%   r   r   d  sN          
 $O&*#) ) ) ) )r'   r   aE  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VanConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aF  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`ConvNextImageProcessor.__call__`] for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all stages. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zxThe bare VAN model outputting raw features without any specific head on top. Note, VAN does not have an embedding layer.c                        e Zd Z fdZ ee           eeee	de
          	 	 d
deej                 dee         dee         deeef         fd	                        Z xZS )VanModelc                     t                                          |           || _        t          |          | _        t          j        |j        d         |j                  | _	        | 
                                 d S )Nr   r   )r,   r-   r   r   encoderr   r   r   r   	layernorm	post_initr.   r   r/   s     r%   r-   zVanModel.__init__  sh       !&))f&9"&=6CXYYYr'   vision)
checkpointoutput_typeconfig_classmodalityexpected_outputNr   r   r   r   c                    ||n| j         j        }||n| j         j        }|                     |||          }|d         }|                    ddg          }|s||f|dd          z   S t          |||j                  S )Nr   r   r   r   )dimr   )r   pooler_outputr0   )r   r   use_return_dictr   meanr
   r0   )r.   r   r   r   encoder_outputsr   pooled_outputs          r%   r2   zVanModel.forward  s     %9$D  $+Jj 	 &1%<kk$+B],,!5# ' 
 

 ,A.)..B8.<< 	L%}58KKK7/')7
 
 
 	
r'   )NN)r5   r6   r7   r-   r   VAN_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr
   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r   FloatTensorr   r   r   r2   r<   r=   s   @r%   r   r     s             +*+?@@&<$.   04&*	
 
u01
 'tn
 d^	

 
u>>	?
 
 
  A@
 
 
 
 
r'   r   z
    VAN Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                        e Zd Z fdZ ee           eeee	e
          	 	 	 	 d
deej                 deej                 dee         dee         deeef         f
d	                        Z xZS )VanForImageClassificationc                 *   t                                          |           t          |          | _        |j        dk    r%t          j        |j        d         |j                  nt          j                    | _	        | 
                                 d S )Nr   r   )r,   r-   r   r   
num_labelsr   r   r   r   
classifierr   r   s     r%   r-   z"VanForImageClassification.__init__  s       F## FLEVYZEZEZBIf)"-v/@AAA`b`k`m`m 	
 	r'   )r   r   r   r   Nr   labelsr   r   r   c                 @   ||n| j         j        }|                     |||          }|r|j        n|d         }|                     |          }d}||                     ||| j                   }|s|f|dd         z   }	||f|	z   n|	S t          |||j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   rE   )losslogitsr0   )r   r  r   r  r  loss_functionr   r0   )
r.   r   r  r   r   outputsr  r  r  r$   s
             r%   r2   z!VanForImageClassification.forward  s    ( &1%<kk$+B]((<>R`k(ll1<L--'!*//%%ffdkBBD 	FY,F)-)9TGf$$vE3f\c\qrrrrr'   )NNNN)r5   r6   r7   r-   r   r  r   _IMAGE_CLASS_CHECKPOINTr   r  _IMAGE_CLASS_EXPECTED_OUTPUTr   r   r
  
LongTensorr   r   r   r2   r<   r=   s   @r%   r  r    s       	 	 	 	 	 +*+?@@*8$4	   59-1/3&*s su01s )*s 'tn	s
 d^s 
u::	;s s s  A@s s s s sr'   r  )r  r   r   )r   F)5r8   r   collectionsr   typingr   r   r   r   activationsr   modeling_outputsr	   r
   r   modeling_utilsr   utilsr   r   r   r   configuration_vanr   
get_loggerr5   loggerr  r  r	  r  r  r:   r9   r   r&   Moduler)   r?   rQ   rf   rp   rv   r   r   r   r   r   VAN_START_DOCSTRINGr  r   r  __all__r   r'   r%   <module>r%     s   4 3  # # # # # # " " " " " " " "        " " " " " "         
 / . . . . . v v v v v v v v v v v v ( ( ( ( ( ( 
	H	%	%  : '  > 1  U\ e T V[Vb    (% % % % %") % % %    ")   (    ")   @    bi   (    29       ry   8    bi   ( ( ( ( (ry ( ( (V( ( ( ( (ry ( ( (V6o 6o 6o 6o 6o 6o 6o 6or) ) ) ) ) ) ) )8	    
-
 -
 -
 -
 -
! -
 -
 
-
`   0s 0s 0s 0s 0s 2 0s 0s 0sf J
I
Ir'   