
     `itH              	          d Z ddlmZ ddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ  ej        e          Zd/dej        dededej        fdZ G d dej                  Z G d dej                  Z  G d dej!                  Z" G d dej                  Z# G d dej                  Z$ G d d ej                  Z% G d! d"ej                  Z&e G d# d$e                      Z'e G d% d&e'                      Z( ed'(           G d) d*e'                      Z) ed+(           G d, d-e'e                      Z*g d.Z+dS )0zPyTorch ConvNextV2 model.    )OptionalN)nn   )ACT2FN)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging)BackboneMixin)can_return_tuple   )ConvNextV2Config        Finput	drop_probtrainingreturnc                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutputs          /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/convnextv2/modeling_convnextv2.py	drop_pathr$   (   s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FM    c                   j     e Zd ZdZd	dee         ddf fdZdej        dej        fdZ	de
fdZ xZS )
ConvNextV2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                 V    t                                                       || _        d S N)super__init__r   )selfr   	__class__s     r#   r+   zConvNextV2DropPath.__init__@   s$    "r%   hidden_statesc                 8    t          || j        | j                  S r)   )r$   r   r   )r,   r.   s     r#   forwardzConvNextV2DropPath.forwardD   s    FFFr%   c                     d| j          S )Nzp=)r   )r,   s    r#   
extra_reprzConvNextV2DropPath.extra_reprG   s    $DN$$$r%   r)   )__name__
__module____qualname____doc__r   floatr+   r   Tensorr0   strr2   __classcell__r-   s   @r#   r'   r'   =   s        bb# #(5/ #T # # # # # #GU\ Gel G G G G%C % % % % % % % %r%   r'   c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )ConvNextV2GRNz)GRN (Global Response Normalization) layerdimc                     t                                                       t          j        t	          j        ddd|                    | _        t          j        t	          j        ddd|                    | _        d S )Nr   )r*   r+   r   	Parameterr   zerosweightbias)r,   r>   r-   s     r#   r+   zConvNextV2GRN.__init__N   s_    l5;q!Q#<#<==LQ1c!:!:;;			r%   r.   r   c                     t           j                            |ddd          }||                    dd          dz   z  }| j        ||z  z  | j        z   |z   }|S )N   )r   rE   T)ordr>   keepdim)r>   rG   ư>)r   linalgvector_normmeanrB   rC   )r,   r.   global_featuresnorm_featuress       r#   r0   zConvNextV2GRN.forwardS   si    ,22=aV]a2bb'?+?+?BPT+?+U+UX\+\]}}'DE	QTaar%   )
r3   r4   r5   r6   intr+   r   FloatTensorr0   r:   r;   s   @r#   r=   r=   K   sr        33<C < < < < < <
U%6 5;L        r%   r=   c                   R     e Zd ZdZddd fd
Zdej        dej        f fdZ xZS )	ConvNextV2LayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    rI   channels_lastepsdata_formatc                z     t                      j        |fd|i| |dvrt          d|           || _        d S )NrU   )rS   channels_firstzUnsupported data format: )r*   r+   NotImplementedErrorrV   )r,   normalized_shaperU   rV   kwargsr-   s        r#   r+   zConvNextV2LayerNorm.__init__c   sY    )==s=f===AAA%&O+&O&OPPP&r%   featuresr   c                    | j         dk    rR|                    dddd          }t                                          |          }|                    dddd          }n!t                                          |          }|S )z
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        rX   r   rE   r   r   )rV   permuter*   r0   )r,   r\   r-   s     r#   r0   zConvNextV2LayerNorm.forwardi   sw    
 ///''1a33Hwwx00H''1a33HHwwx00Hr%   	r3   r4   r5   r6   r+   r   r8   r0   r:   r;   s   @r#   rR   rR   ]   s         
 15/ ' ' ' ' ' ' '           r%   rR   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZ	S )ConvNextV2EmbeddingszThis class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    c                    t                                                       t          j        |j        |j        d         |j        |j                  | _        t          |j        d         dd          | _	        |j        | _        d S )Nr   kernel_sizestriderI   rX   rT   )
r*   r+   r   Conv2dnum_channelshidden_sizes
patch_sizepatch_embeddingsrR   	layernormr,   configr-   s     r#   r+   zConvNextV2Embeddings.__init__}   s     "	!4Q!7VEV_e_p!
 !
 !
 -V-@-C[klll"/r%   pixel_valuesr   c                     |j         d         }|| j        k    rt          d          |                     |          }|                     |          }|S )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)r   rg   
ValueErrorrj   rk   )r,   rn   rg   
embeddingss       r#   r0   zConvNextV2Embeddings.forward   s^    #)!,4,,,w   **<88
^^J//
r%   )
r3   r4   r5   r6   r+   r   rP   r8   r0   r:   r;   s   @r#   ra   ra   x   si         0 0 0 0 0E$5 %,        r%   ra   c                   H     e Zd ZdZd fd	Zdej        dej        fdZ xZS )ConvNextV2Layera5  This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
    H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    r   c                    t                                                       t          j        ||dd|          | _        t          |d          | _        t          j        |d|z            | _        t          |j
                 | _        t          d|z            | _        t          j        d|z  |          | _        |dk    rt          |          nt          j                    | _        d S )N   r   )rd   paddinggroupsrI   rU      r   )r*   r+   r   rf   dwconvrR   rk   Linearpwconv1r   
hidden_actactr=   grnpwconv2r'   Identityr$   )r,   rm   r>   r$   r-   s       r#   r+   zConvNextV2Layer.__init__   s    iSa3OOO,Sd;;;ya#g..&+, S))yS#..:Cc//+I666r{}}r%   r\   r   c                    |}|                      |          }|                    dddd          }|                     |          }|                     |          }|                     |          }|                     |          }|                     |          }|                    dddd          }||                     |          z   }|S )Nr   rE   r   r   )rz   r^   rk   r|   r~   r   r   r$   )r,   r\   residuals      r#   r0   zConvNextV2Layer.forward   s    ;;x((##Aq!Q//>>(++<<))88H%%88H%%<<))##Aq!Q//dnnX666r%   )r   r_   r;   s   @r#   rs   rs      ss         
] 
] 
] 
] 
] 
]         r%   rs   c                   H     e Zd ZdZd fd	Zdej        dej        fdZ xZS )	ConvNextV2Stagea  ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`list[float]`): Stochastic depth rates for each layer.
    rE   Nc           	         t                                                       |k    s|dk    rBt          j        t	          |dd          t          j        |||          g          | _        nt          j                    | _        pdg|z  t          j        fdt          |          D                       | _        d S )Nr   rI   rX   rT   rc   r   c                 @    g | ]}t          |                    S ))r>   r$   )rs   ).0jrm   drop_path_ratesout_channelss     r#   
<listcomp>z,ConvNextV2Stage.__init__.<locals>.<listcomp>   s/    kkkYZ_VQRASTTTkkkr%   )	r*   r+   r   
ModuleListrR   rf   downsampling_layerrangelayers)	r,   rm   in_channelsr   rd   re   depthr   r-   s	    ` `   `r#   r+   zConvNextV2Stage.__init__   s    ,&&&1**&(m'K[\\\Ik<[Y_```' 'D## ')mooD#):cUU]mkkkkkk^cdi^j^jkkk
 
r%   r\   r   c                 Z    | j         D ]} ||          }| j        D ]} ||          }|S r)   )r   r   )r,   r\   layers      r#   r0   zConvNextV2Stage.forward   sH    , 	' 	'EuXHH[ 	' 	'EuXHHr%   )rE   rE   rE   Nr_   r;   s   @r#   r   r      sm         
 
 
 
 
 
"         r%   r   c                   L     e Zd Z fdZ	 ddej        dee         defdZ	 xZ
S )ConvNextV2Encoderc           
         t                                                       t          j                    | _        d t          j        d|j        t          |j	                  d          
                    |j	                  D             }|j        d         }t          |j                  D ]Y}|j        |         }t          ||||dk    rdnd|j	        |         ||                   }| j                            |           |}Zd S )Nc                 6    g | ]}|                                 S  )tolist)r   xs     r#   r   z.ConvNextV2Encoder.__init__.<locals>.<listcomp>   s0     
 
 
 HHJJ
 
 
r%   r   cpu)r   rE   r   )r   r   re   r   r   )r*   r+   r   r   stagesr   linspacedrop_path_ratesumdepthssplitrh   r   
num_stagesr   append)r,   rm   r   prev_chsiout_chsstager-   s          r#   r+   zConvNextV2Encoder.__init__   s   moo
 
^Av'<c&->P>PY^___eeflfstt
 
 
 &q)v()) 	 	A)!,G#$$EEqqqmA& / 2  E Ku%%%HH	 	r%   Fr.   output_hidden_statesr   c                     |r|gnd }| j         D ]$} ||          }||                    |           %t          ||          S )N)last_hidden_stater.   )r   r   r   )r,   r.   r   all_hidden_stateslayer_modules        r#   r0   zConvNextV2Encoder.forward   se     0DM]OO K 	8 	8L(L77M ,!((777-]noooor%   )F)r3   r4   r5   r+   r   r8   r   boolr   r0   r:   r;   s   @r#   r   r      s            , SX
p 
p"\
pAI$
p	'
p 
p 
p 
p 
p 
p 
p 
pr%   r   c                   .    e Zd ZU eed<   dZdZdgZd ZdS )ConvNextV2PreTrainedModelrm   
convnextv2rn   rs   c                 `   t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j        t          f          r?|j	        j        
                                 |j        j                            d           dS t          |t                    r>|j        j        
                                 |j	        j        
                                 dS dS )zInitialize the weightsr   )rL   stdNg      ?)
isinstancer   r{   rf   rB   datanormal_rm   initializer_rangerC   zero_	LayerNormrR   fill_r=   )r,   modules     r#   _init_weightsz'ConvNextV2PreTrainedModel._init_weights  s   fry")455 	% M&&CT[5R&SSS{& &&((((( '&/B CDD 	%K""$$$M$$S))))).. 	%M$$&&&K""$$$$$	% 	%r%   N)	r3   r4   r5   r   __annotations__base_model_prefixmain_input_name_no_split_modulesr   r   r%   r#   r   r     sE         $$O*+% % % % %r%   r   c            	       x     e Zd Z fdZee	 ddeej                 dee	         de
fd                        Z xZS )ConvNextV2Modelc                 &   t                                          |           || _        t          |          | _        t          |          | _        t          j        |j	        d         |j
                  | _        |                                  d S )NrH   rx   )r*   r+   rm   ra   rq   r   encoderr   r   rh   layer_norm_epsrk   	post_initrl   s     r#   r+   zConvNextV2Model.__init__  s{       .v66(00 f&9"&=6CXYYY 	r%   Nrn   r   r   c                 (   || j         j        }|t          d          |                     |          }|                     ||          }|j        }|                     |                    ddg                    }t          |||j	                  S )Nz You have to specify pixel_valuesr   rH   )r   pooler_outputr.   )
rm   r   rp   rq   r   r   rk   rL   r	   r.   )r,   rn   r   embedding_outputencoder_outputsr   pooled_outputs          r#   r0   zConvNextV2Model.forward+  s    
  '#';#C ?@@@??<88:>,,3G ;G ;
 ;
 ,= '8'='=r2h'G'GHH7/')7
 
 
 	
r%   NN)r3   r4   r5   r+   r   r   r   r   rP   r   r	   r0   r:   r;   s   @r#   r   r     s             gk
 
$U%67
V^_cVd
	1
 
 
 ^ 
 
 
 
 
r%   r   z
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc            	            e Zd ZdZ fdZee	 ddeej	                 deej
                 defd                        Z xZS )	 ConvNextV2ForImageClassificationFc                 N   t                                          |           |j        | _        t          |          | _        |j        dk    r+t          j        |j        d         |j                  | _        nt          j	                    | _        | 
                                 d S )Nr   rH   )r*   r+   
num_labelsr   r   r   r{   rh   
classifierr   r   rl   s     r#   r+   z)ConvNextV2ForImageClassification.__init__P  s        +)&11 q   i(;B(?ARSSDOO kmmDO 	r%   Nrn   labelsr   c                      | j         |fi |}|j        }|                     |          }d}||                     ||| j                  }t          |||j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   pooled_logitsrm   )losslogitsr.   )r   r   r   loss_functionrm   r
   r.   )r,   rn   r   r[   outputsr   r   r   s           r#   r0   z(ConvNextV2ForImageClassification.forward_  s     =LDOL<c<c\b<c<c-//%%V6RVR]%^^D3!/
 
 
 	
r%   r   )r3   r4   r5   accepts_loss_kwargsr+   r   r   r   r   rP   
LongTensorr
   r0   r:   r;   s   @r#   r   r   F  s               ei
 
$U%67
HPQVQaHb
	-
 
 
 ^ 
 
 
 
 
r%   r   zT
    ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
    c            	       p     e Zd ZdZ fdZee	 ddej        de	e
         defd                        Z xZS )	ConvNextV2BackboneFc                    t                                          |           t                                          |           t          |          | _        t          |          | _        |j        d         g|j        z   | _        i }t          | j
        | j                  D ]\  }}t          |d          ||<   t          j        |          | _        |                                  d S )Nr   rX   )rV   )r*   r+   _init_backbonera   rq   r   r   rh   num_featureszip_out_featureschannelsrR   r   
ModuleDicthidden_states_normsr   )r,   rm   r   r   rg   r-   s        r#   r+   zConvNextV2Backbone.__init__  s       v&&&.v66(00#034v7JJ !#&t'94=#I#I 	i 	iE<)<\Wg)h)h)h&&#%=1D#E#E  	r%   Nrn   r   r   c                 j   || j         j        }|                     |          }|                     |d          }|j        }g }t          | j        |          D ]9\  }}|| j        v r+ | j        |         |          }|	                    |           :t          t          |          |r|nd          S )ar  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr   )feature_mapsr.   )rm   r   rq   r   r.   r   stage_namesout_featuresr   r   r   tuple)	r,   rn   r   r   r   r.   r   r   hidden_states	            r#   r0   zConvNextV2Backbone.forward  s    2  '#';#C ??<88<@LLIYptL<u<u-#&t'7#G#G 	2 	2E<)))>t7>|LL##L111|,,+?I--T
 
 
 	
r%   r)   )r3   r4   r5   has_attentionsr+   r   r   r   r8   r   r   r   r0   r:   r;   s   @r#   r   r   y  s         N    "  04'
 '
l'
 'tn'
 
	'
 '
 '
 ^ '
 '
 '
 '
 '
r%   r   )r   r   r   r   )r   F),r6   typingr   r   r   activationsr   modeling_outputsr   r   r	   r
   modeling_utilsr   utilsr   r   utils.backbone_utilsr   utils.genericr   configuration_convnextv2r   
get_loggerr3   loggerr8   r7   r   r$   Moduler'   r=   r   rR   ra   rs   r   r   r   r   r   r   __all__r   r%   r#   <module>r      s                   ! ! ! ! ! !            . - - - - - , , , , , , , , 1 1 1 1 1 1 - - - - - - 6 6 6 6 6 6 
	H	%	% U\ e T V[Vb    *% % % % % % % %    BI   $    ",   6    29   0( ( ( ( (bi ( ( (X! ! ! ! !bi ! ! !J p  p  p  p  p	  p  p  pF % % % % % % % %, &
 &
 &
 &
 &
/ &
 &
 &
R   )
 )
 )
 )
 )
'@ )
 )
 )
X   =
 =
 =
 =
 =
2M =
 =
 =
@ u
t
tr%   