
     `iB              	          d Z ddlmZ ddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ  ej        e          Zd-dej        dededej        fdZ G d dej                  Z G d dej                   Z! G d dej                  Z" G d dej                  Z# G d dej                  Z$ G d d ej                  Z%e G d! d"e                      Z&e G d# d$e&                      Z' ed%&           G d' d(e&                      Z( ed)&           G d* d+e&e                      Z)g d,Z*dS ).zPyTorch ConvNext model.    )OptionalN)nn   )ACT2FN)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging)BackboneMixin)can_return_tuple   )ConvNextConfig        Finput	drop_probtrainingreturnc                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutputs          /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/convnext/modeling_convnext.py	drop_pathr$   (   s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FM    c                   j     e Zd ZdZd	dee         ddf fdZdej        dej        fdZ	de
fdZ xZS )
ConvNextDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                 V    t                                                       || _        d S N)super__init__r   )selfr   	__class__s     r#   r+   zConvNextDropPath.__init__@   s$    "r%   hidden_statesc                 8    t          || j        | j                  S r)   )r$   r   r   )r,   r.   s     r#   forwardzConvNextDropPath.forwardD   s    FFFr%   c                     d| j          S )Nzp=)r   )r,   s    r#   
extra_reprzConvNextDropPath.extra_reprG   s    $DN$$$r%   r)   )__name__
__module____qualname____doc__r   floatr+   r   Tensorr0   strr2   __classcell__r-   s   @r#   r'   r'   =   s        bb# #(5/ #T # # # # # #GU\ Gel G G G G%C % % % % % % % %r%   r'   c                   R     e Zd ZdZddd fd
Zdej        dej        f fdZ xZS )	ConvNextLayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    ư>channels_lastepsdata_formatc                z     t                      j        |fd|i| |dvrt          d|           || _        d S )NrA   )r?   channels_firstzUnsupported data format: )r*   r+   NotImplementedErrorrB   )r,   normalized_shaperA   rB   kwargsr-   s        r#   r+   zConvNextLayerNorm.__init__Q   sY    )==s=f===AAA%&O+&O&OPPP&r%   featuresr   c                    | j         dk    rR|                    dddd          }t                                          |          }|                    dddd          }n!t                                          |          }|S )z
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        rD   r      r   r   )rB   permuter*   r0   )r,   rH   r-   s     r#   r0   zConvNextLayerNorm.forwardW   sw    
 ///''1a33Hwwx00H''1a33HHwwx00Hr%   	r3   r4   r5   r6   r+   r   r8   r0   r:   r;   s   @r#   r=   r=   K   s         
 15/ ' ' ' ' ' ' '           r%   r=   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZ	S )ConvNextEmbeddingszThis class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    c                    t                                                       t          j        |j        |j        d         |j        |j                  | _        t          |j        d         dd          | _	        |j        | _        d S )Nr   kernel_sizestrider>   rD   r@   )
r*   r+   r   Conv2dnum_channelshidden_sizes
patch_sizepatch_embeddingsr=   	layernormr,   configr-   s     r#   r+   zConvNextEmbeddings.__init__j   s     "	!4Q!7VEV_e_p!
 !
 !
 +6+>q+AtYijjj"/r%   pixel_valuesr   c                     |j         d         }|| j        k    rt          d          |                     |          }|                     |          }|S )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)r   rT   
ValueErrorrW   rX   )r,   r[   rT   
embeddingss       r#   r0   zConvNextEmbeddings.forwardr   s^    #)!,4,,,w   **<88
^^J//
r%   )
r3   r4   r5   r6   r+   r   FloatTensorr8   r0   r:   r;   s   @r#   rN   rN   e   si         0 0 0 0 0E$5 %,        r%   rN   c                   H     e Zd ZdZd fd	Zdej        dej        fdZ xZS )ConvNextLayera3  This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
    H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextConfig`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    r   c                 0   t                                                       t          j        ||dd|          | _        t          |d          | _        t          j        |d|z            | _        t          |j
                 | _        t          j        d|z  |          | _        |j        dk    r0t          j        |j        t          j        |          z  d	          nd | _        |d
k    rt%          |          nt          j                    | _        d S )N   r   )rQ   paddinggroupsr>   rA      r   T)requires_gradr   )r*   r+   r   rS   dwconvr=   rX   Linearpwconv1r   
hidden_actactpwconv2layer_scale_init_value	Parameterr   oneslayer_scale_parameterr'   Identityr$   )r,   rZ   dimr$   r-   s       r#   r+   zConvNextLayer.__init__   s    iSa3OOO*3D999ya#g..&+,yS#.. ,q00 L6CHX\]]]] 	"
 9BC))444R[]]r%   rH   r   c                    |}|                      |          }|                    dddd          }|                     |          }|                     |          }|                     |          }|                     |          }| j        
| j        |z  }|                    dddd          }||                     |          z   }|S )Nr   rJ   r   r   )ri   rK   rX   rk   rm   rn   rr   r$   )r,   rH   residuals      r#   r0   zConvNextLayer.forward   s    ;;x((##Aq!Q//>>(++<<))88H%%<<))%11H<H##Aq!Q//dnnX666r%   )r   rL   r;   s   @r#   ra   ra   }   ss         [ [ [ [ [ [         r%   ra   c                   H     e Zd ZdZd fd	Zdej        dej        fdZ xZS )	ConvNextStagea  ConvNeXT stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextConfig`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`list[float]`): Stochastic depth rates for each layer.
    rJ   Nc           	         t                                                       |k    s|dk    rBt          j        t	          |dd          t          j        |||          g          | _        nt          j                    | _        pdg|z  t          j        fdt          |          D                       | _        d S )Nr   r>   rD   r@   rP   r   c                 @    g | ]}t          |                    S ))rt   r$   )ra   ).0jrZ   drop_path_ratesout_channelss     r#   
<listcomp>z*ConvNextStage.__init__.<locals>.<listcomp>   s.    iiiWX]6|q?QRRRiiir%   )	r*   r+   r   
ModuleListr=   rS   downsampling_layerrangelayers)	r,   rZ   in_channelsr~   rQ   rR   depthr}   r-   s	    ` `   `r#   r+   zConvNextStage.__init__   s    ,&&&1**&(m%ktIYZZZIk<[Y_```' 'D## ')mooD#):cUU]miiiiii\abg\h\hiii
 
r%   rH   r   c                 Z    | j         D ]} ||          }| j        D ]} ||          }|S r)   )r   r   )r,   rH   layers      r#   r0   zConvNextStage.forward   sH    , 	' 	'EuXHH[ 	' 	'EuXHHr%   )rJ   rJ   rJ   NrL   r;   s   @r#   rx   rx      sm         
 
 
 
 
 
"         r%   rx   c                   L     e Zd Z fdZ	 ddej        dee         defdZ	 xZ
S )ConvNextEncoderc           
         t                                                       t          j                    | _        d t          j        d|j        t          |j	                  d          
                    |j	                  D             }|j        d         }t          |j                  D ]Y}|j        |         }t          ||||dk    rdnd|j	        |         ||                   }| j                            |           |}Zd S )Nc                 6    g | ]}|                                 S  )tolist)r{   xs     r#   r   z,ConvNextEncoder.__init__.<locals>.<listcomp>   s0     
 
 
 HHJJ
 
 
r%   r   cpu)r   rJ   r   )r   r~   rR   r   r}   )r*   r+   r   r   stagesr   linspacedrop_path_ratesumdepthssplitrU   r   
num_stagesrx   append)r,   rZ   r}   prev_chsiout_chsstager-   s          r#   r+   zConvNextEncoder.__init__   s   moo
 
^Av'<c&->P>PY^___eeflfstt
 
 
 &q)v()) 	 	A)!,G!$$EEqqqmA& / 2  E Ku%%%HH	 	r%   Fr.   output_hidden_statesr   c                     |r|gnd }| j         D ]$} ||          }||                    |           %t          ||          S )N)last_hidden_stater.   )r   r   r   )r,   r.   r   all_hidden_stateslayer_modules        r#   r0   zConvNextEncoder.forward   se     0DM]OO K 	8 	8L(L77M ,!((777-]noooor%   )F)r3   r4   r5   r+   r   r8   r   boolr   r0   r:   r;   s   @r#   r   r      s            , SX
p 
p"\
pAI$
p	'
p 
p 
p 
p 
p 
p 
p 
pr%   r   c                   2    e Zd ZU eed<   dZdZdgZi Zd Z	dS )ConvNextPreTrainedModelrZ   convnextr[   ra   c                 L   t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j        t          f          r?|j	        j        
                                 |j        j                            d           dS t          |t                    r2|j        -|j        j                            | j        j                   dS dS dS )zInitialize the weightsr   )meanstdNg      ?)
isinstancer   rj   rS   weightdatanormal_rZ   initializer_rangebiaszero_	LayerNormr=   fill_ra   rr   ro   )r,   modules     r#   _init_weightsz%ConvNextPreTrainedModel._init_weights   s   fry")455 	\ M&&CT[5R&SSS{& &&((((( '&/@ ABB 	\K""$$$M$$S))))).. 	\+7,1778Z[[[[[	\ 	\77r%   N)
r3   r4   r5   r   __annotations__base_model_prefixmain_input_name_no_split_modules_can_record_outputsr   r   r%   r#   r   r      sP         "$O()\ \ \ \ \r%   r   c            	       x     e Zd Z fdZee	 ddeej                 dee	         de
fd                        Z xZS )ConvNextModelc                 &   t                                          |           || _        t          |          | _        t          |          | _        t          j        |j	        d         |j
                  | _        |                                  d S )Nrf   )r*   r+   rZ   rN   r^   r   encoderr   r   rU   layer_norm_epsrX   	post_initrY   s     r#   r+   zConvNextModel.__init__	  s{       ,V44&v.. f&9"&=6CXYYY 	r%   Nr[   r   r   c                 (   || j         j        }|t          d          |                     |          }|                     ||          }|j        }|                     |                    ddg                    }t          |||j	                  S )Nz You have to specify pixel_valuesr   r   )r   pooler_outputr.   )
rZ   r   r]   r^   r   r   rX   r   r	   r.   )r,   r[   r   embedding_outputencoder_outputsr   pooled_outputs          r#   r0   zConvNextModel.forward  s    
  '#';#C ?@@@??<88:>,,3G ;G ;
 ;
 ,= '8'='=r2h'G'GHH7/')7
 
 
 	
r%   NN)r3   r4   r5   r+   r   r   r   r   r_   r   r	   r0   r:   r;   s   @r#   r   r     s             gk
 
$U%67
V^_cVd
	1
 
 
 ^ 
 
 
 
 
r%   r   z
    ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc            	            e Zd ZdZ fdZee	 ddeej	                 deej
                 defd                        Z xZS )	ConvNextForImageClassificationFc                 N   t                                          |           |j        | _        t          |          | _        |j        dk    r+t          j        |j        d         |j                  | _        nt          j	                    | _        | 
                                 d S )Nr   r   )r*   r+   
num_labelsr   r   r   rj   rU   
classifierrs   r   rY   s     r#   r+   z'ConvNextForImageClassification.__init__:  s        +%f-- q   i(;B(?ARSSDOO kmmDO 	r%   Nr[   labelsr   c                      | j         |fi |}|j        }|                     |          }d}||                     ||| j                  }t          |||j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   pooled_logitsrZ   )losslogitsr.   )r   r   r   loss_functionrZ   r
   r.   )r,   r[   r   rG   outputsr   r   r   s           r#   r0   z&ConvNextForImageClassification.forwardI  s     =JDM,<a<aZ`<a<a-//%%V6RVR]%^^D3!/
 
 
 	
r%   r   )r3   r4   r5   accepts_loss_kwargsr+   r   r   r   r   r_   
LongTensorr
   r0   r:   r;   s   @r#   r   r   1  s               ei
 
$U%67
HPQVQaHb
	-
 
 
 ^ 
 
 
 
 
r%   r   zQ
    ConvNeXt backbone, to be used with frameworks like DETR and MaskFormer.
    c            	       p     e Zd ZdZ fdZee	 ddej        de	e
         defd                        Z xZS )	ConvNextBackboneFc                    t                                          |           t                                          |           t          |          | _        t          |          | _        |j        d         g|j        z   | _        i }t          | j
        | j                  D ]\  }}t          |d          ||<   t          j        |          | _        |                                  d S )Nr   rD   )rB   )r*   r+   _init_backbonerN   r^   r   r   rU   num_featureszip_out_featureschannelsr=   r   
ModuleDicthidden_states_normsr   )r,   rZ   r   r   rT   r-   s        r#   r+   zConvNextBackbone.__init__k  s       v&&&,V44&v..#034v7JJ !#&t'94=#I#I 	g 	gE<):<Ue)f)f)f&&#%=1D#E#E  	r%   Nr[   r   r   c                 j   || j         j        }|                     |          }|                     |d          }|j        }g }t          | j        |          D ]9\  }}|| j        v r+ | j        |         |          }|	                    |           :t          t          |          |r|nd          S )ah  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnext-tiny-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr   )feature_mapsr.   )rZ   r   r^   r   r.   r   stage_namesout_featuresr   r   r   tuple)	r,   r[   r   r   r   r.   r   r   hidden_states	            r#   r0   zConvNextBackbone.forward|  s    2  '#';#C ??<88<@LLIYptL<u<u-#&t'7#G#G 	2 	2E<)))>t7>|LL##L111|,,+?I--T
 
 
 	
r%   r)   )r3   r4   r5   has_attentionsr+   r   r   r   r8   r   r   r   r0   r:   r;   s   @r#   r   r   c  s         N    "  04'
 '
l'
 'tn'
 
	'
 '
 '
 ^ '
 '
 '
 '
 '
r%   r   )r   r   r   r   )r   F)+r6   typingr   r   r   activationsr   modeling_outputsr   r   r	   r
   modeling_utilsr   utilsr   r   utils.backbone_utilsr   utils.genericr   configuration_convnextr   
get_loggerr3   loggerr8   r7   r   r$   Moduler'   r   r=   rN   ra   rx   r   r   r   r   r   __all__r   r%   r#   <module>r      sx                  ! ! ! ! ! !            . - - - - - , , , , , , , , 1 1 1 1 1 1 - - - - - - 2 2 2 2 2 2 
	H	%	% U\ e T V[Vb    *% % % % %ry % % %       4       0( ( ( ( (BI ( ( (V! ! ! ! !BI ! ! !H p  p  p  p  pbi  p  p  pF \ \ \ \ \o \ \ \. &
 &
 &
 &
 &
+ &
 &
 &
R   )
 )
 )
 )
 )
%< )
 )
 )
X   
=
 =
 =
 =
 =
. =
 =
 
=
@ m
l
lr%   