
     `i              	       |   d Z ddlZddlZddlmZ ddlmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ  ej        e          Ze ed           G d de                                  Ze ed           G d de                                  Ze ed           G d de                                  Z e ed           G d de                                  Z! G d de
j"                  Z# G d de
j"                  Z$dAd"e	j%        d#e&d$e'd%e	j%        fd&Z( G d' d(e
j"                  Z) G d) d*e
j"                  Z* G d+ d,e
j"                  Z+ G d- d.e
j"                  Z, G d/ d0e          Z- G d1 d2e
j"                  Z.e G d3 d4e                      Z/e G d5 d6e/                      Z0 ed7           G d8 d9e/                      Z1 ed:           G d; d<e/                      Z2 ed=           G d> d?e/e                      Z3g d@Z4dS )BzPyTorch FocalNet model.    N)	dataclass)OptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BackboneOutput)PreTrainedModel)ModelOutputauto_docstringlogging)BackboneMixin   )FocalNetConfigzC
    FocalNet encoder's outputs, with potential hidden states.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	ee
ej                          ed<   dZee
ej                          ed<   dS )FocalNetEncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_statehidden_statesreshaped_hidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tupler        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/focalnet/modeling_focalnet.pyr   r   %   sr           6:x 129998<M8E%"345<<<AEHU5+<%=>EEEEEr!   r   zZ
    FocalNet model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej                          ed<   dZeeej                          ed<   dS )FocalNetModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_outputr   r   )r   r   r   r   r   r   r   r   r   r%   r   r   r   r    r!   r"   r$   r$   :   s         	 	 6:x 1299915M8E-.5558<M8E%"345<<<AEHU5+<%=>EEEEEr!   r$   z.
    FocalNet masked image model outputs.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej                          ed<   dZeeej                          ed<   dS )!FocalNetMaskedImageModelingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlossreconstructionr   r   )r   r   r   r   r(   r   r   r   r   r)   r   r   r   r    r!   r"   r'   r'   R   s           )-D(5$
%,,,26NHU./6668<M8E%"345<<<AEHU5+<%=>EEEEEr!   r'   z4
    FocalNet outputs for image classification.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej                          ed<   dZeeej                          ed<   dS )FocalNetImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr(   logitsr   r   )r   r   r   r   r(   r   r   r   r   r,   r   r   r   r    r!   r"   r+   r+   l   s           )-D(5$
%,,,*.FHU&'...8<M8E%"345<<<AEHU5+<%=>EEEEEr!   r+   c                   ~     e Zd ZdZd	 fd	Z	 d
deej                 deej                 de	ej
                 fdZ xZS )FocalNetEmbeddingszX
    Construct the patch embeddings and layernorm. Optionally, also the mask token.
    Fc           	         t                                                       t          ||j        |j        |j        |j        |j        d          | _        | j        j	        | _
        |r-t          j        t          j        dd|j                            nd | _        t          j        |j        |j                  | _        t          j        |j                  | _        d S )NT)config
image_size
patch_sizenum_channels	embed_dimuse_conv_embedis_stemr   eps)super__init__FocalNetPatchEmbeddingsr1   r2   r3   r4   r5   patch_embeddings	grid_size
patch_gridr   	Parameterr   zeros
mask_token	LayerNormlayer_norm_epsnormDropouthidden_dropout_probdropout)selfr0   use_mask_token	__class__s      r"   r:   zFocalNetEmbeddings.__init__   s     7((,&!0!
 !
 !
 /9O]g",u{1a9I'J'JKKKcgL!1v7LMMM	z&"<==r!   Npixel_valuesbool_masked_posreturnc                 f   |                      |          \  }}|                     |          }|                                \  }}}|R| j                            ||d          }|                    d                              |          }	|d|	z
  z  ||	z  z   }|                     |          }||fS )N      ?)r<   rD   sizerA   expand	unsqueezetype_asrG   )
rH   rK   rL   
embeddingsoutput_dimensions
batch_sizeseq_len_mask_tokensmasks
             r"   forwardzFocalNetEmbeddings.forward   s     )-(=(=l(K(K%
%YYz**
!+!2!2
GQ&/00WbIIK",,R0088EED#sTz2[45GGJ\\*--
,,,r!   )FN)r   r   r   r   r:   r   r   r   
BoolTensorr   Tensorr\   __classcell__rJ   s   @r"   r.   r.      s         > > > > > >& hl- -$U%67-JRSXScJd-	u|	- - - - - - - -r!   r.   c                   x     e Zd Z	 	 	 d fd	Zd Zdeej                 deej	        ee
         f         fdZ xZS )r;   Fc	                    t                                                       t          |t          j        j                  r|n||f}t          |t          j        j                  r|n||f}|d         |d         z  |d         |d         z  z  }	|| _        || _        || _        |	| _	        |d         |d         z  |d         |d         z  f| _
        |r.|rd}
d}d}nd}
d}d}t          j        |||
||          | _        nt          j        ||||          | _        |r"t          j        ||j        	          | _        d S d | _        d S )
Nr   r            r   )kernel_sizestridepadding)rg   rh   r7   )r9   r:   
isinstancecollectionsabcIterabler1   r2   r3   num_patchesr=   r   Conv2d
projectionrB   rC   rD   )rH   r0   r1   r2   r3   r4   add_normr5   r6   rn   rg   ri   rh   rJ   s                r"   r:   z FocalNetPatchEmbeddings.__init__   sq    	#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&$Q-:a=8*Q-:VW=:XY 	l  ii[Y`  DOO !iiZ`jkkkDO 	YF4IJJJDIIIDIIIr!   c                 Z   || j         d         z  dk    r@d| j         d         || j         d         z  z
  f}t          j                            ||          }|| j         d         z  dk    rBddd| j         d         || j         d         z  z
  f}t          j                            ||          }|S )Nr   r   )r2   r   
functionalpad)rH   rK   heightwidth
pad_valuess        r"   	maybe_padz!FocalNetPatchEmbeddings.maybe_pad   s    4?1%%**T_Q/%$/!:L2LLMJ=,,\:FFLDOA&&!++Q4?1#5QRAS8S#STJ=,,\:FFLr!   rK   rM   c                 X   |j         \  }}}}|| j        k    rt          d          |                     |||          }|                     |          }|j         \  }}}}||f}|                    d                              dd          }| j        |                     |          }||fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.re   r   )shaper3   
ValueErrorrx   rp   flatten	transposerD   )rH   rK   rY   r3   ru   rv   rU   rV   s           r"   r\   zFocalNetPatchEmbeddings.forward   s    )5);&<4,,,w   ~~lFEBB__\22
(.1fe#UO''**44Q::
9 :..J,,,r!   )FFF)r   r   r   r:   rx   r   r   r   r   r_   intr\   r`   ra   s   @r"   r;   r;      s         ( ( ( ( ( (T  -HU->$? -E%,X]^aXbJbDc - - - - - - - -r!   r;           Finput	drop_probtrainingrM   c                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)rz   ndimr   randr   r   floor_div)r   r   r   	keep_probrz   random_tensoroutputs          r"   	drop_pathr      s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FMr!   c                   j     e Zd ZdZd	dee         ddf fdZdej        dej        fdZ	de
fdZ xZS )
FocalNetDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rM   c                 V    t                                                       || _        d S r]   )r9   r:   r   )rH   r   rJ   s     r"   r:   zFocalNetDropPath.__init__  s$    "r!   r   c                 8    t          || j        | j                  S r]   )r   r   r   )rH   r   s     r"   r\   zFocalNetDropPath.forward  s    FFFr!   c                     d| j          S )Nzp=)r   rH   s    r"   
extra_reprzFocalNetDropPath.extra_repr  s    $DN$$$r!   r]   )r   r   r   r   r   floatr:   r   r_   r\   strr   r`   ra   s   @r"   r   r     s        bb# #(5/ #T # # # # # #GU\ Gel G G G G%C % % % % % % % %r!   r   c                   &     e Zd Zd fd	Zd Z xZS )FocalNetModulationre   Tr   c                    t                                                       || _        |j        |         | _        |j        |         | _        || _        |j        | _        |j	        | _	        t          j        |d|z  | j        dz   z   |          | _        t          j        ||dd|          | _        t          j                    | _        t          j        ||          | _        t          j        |          | _        t          j                    | _        g | _        t/          | j                  D ]}| j        |z  | j        z   }| j                            t          j        t          j        |||d||dz  d          t          j                                         | j                            |           | j        r"t          j        ||j                  | _        d S d S )Nre   r   )bias)rg   rh   r   F)rg   rh   groupsri   r   r7   )r9   r:   dimfocal_windowsfocal_windowfocal_levelsfocal_levelfocal_factor use_post_layernorm_in_modulationnormalize_modulatorr   Linearprojection_inro   projection_contextGELU
activationprojection_outrE   projection_dropout
ModuleListfocal_layerskernel_sizesrangeappend
SequentialrB   rC   	layernorm)
rH   r0   indexr   r   r   r   krg   rJ   s
            r"   r:   zFocalNetModulation.__init__  s   "07!.u5(060W-#)#= YsAGt7G!7K,LSWXXX"$)C!ATX"Y"Y"Y')) iS11"$*-?"@"@MOOt'(( 
	2 
	2A+a/$2CCK$$ISk!CYdhiYipu   GII	    $$[11110 	J\#63HIIIDNNN	J 	Jr!   c                 \   |j         d         }|                     |                              dddd                                          }t	          j        |||| j        dz   fd          \  }}}d}t          | j                  D ]/} | j        |         |          }|||dd||dz   f         z  z   }0| 	                    |
                    dd          
                    dd                    }	||	|dd| j        df         z  z   }| j        r|| j        dz   z  }|                     |          }
||
z  }|                    dddd                                          }| j        r|                     |          }|                     |          }|                     |          }|S )	z
        Args:
            hidden_state:
                Input features with shape of (batch_size, height, width, num_channels)
        rO   r   r   r   re   NT)keepdim)rz   r   permute
contiguousr   splitr   r   r   r   meanr   r   r   r   r   r   )rH   hidden_stater3   xqctxgatesctx_alllevel
ctx_global	modulatorx_outs               r"   r\   zFocalNetModulation.forward;  s    $)"- |,,44Q1a@@KKMMAlDDTWXDX'Y[\]]3 4+,, 	B 	BE*$#E*3//CeAAAuuqy/@,@&A AAGG__SXXaX%>%>%C%CAt%C%T%TUU
Jqqq$2B2D2D/D)EEE # 	7!1A!56G ++G44	IaAq))44660 	*NN5))E ##E**''..r!   )re   Tr   r   r   r   r:   r\   r`   ra   s   @r"   r   r     sS        J J J J J JB" " " " " " "r!   r   c                   &     e Zd Zd fd	Zd Z xZS )FocalNetMlpNr   c                     t                                                       |p|}|p|}t          j        ||          | _        t
          |j                 | _        t          j        ||          | _        t          j	        |          | _
        d S r]   )r9   r:   r   r   fc1r   
hidden_actr   fc2rE   drop)rH   r0   in_featureshidden_featuresout_featuresr   rJ   s         r"   r:   zFocalNetMlp.__init__a  sw    #2{)8[9[/:: !239_l;;Jt$$			r!   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r]   )r   r   r   r   )rH   r   s     r"   r\   zFocalNetMlp.forwardj  s]    xx--|44yy..xx--yy..r!   )NNr   r   ra   s   @r"   r   r   `  sL        % % % % % %      r!   r   c                   *     e Zd ZdZd fd	Zd Z xZS )FocalNetLayera  Focal Modulation Network layer (block).

    Args:
        config (`FocalNetConfig`):
            Model config.
        index (`int`):
            Layer index.
        dim (`int`):
            Number of input channels.
        input_resolution (`tuple[int]`):
            Input resolution.
        drop_path (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate.
    r   c                     t                                                       || _        || _        || _        |j        | _        |j        | _        t          j	        ||j
                  | _        t          |||| j                  | _        |dk    rt          |          nt          j                    | _        t          j	        ||j
                  | _        t%          ||j        z            }t)          |||| j                  | _        d| _        d| _        |j        rlt          j        |j        t7          j        |          z  d          | _        t          j        |j        t7          j        |          z  d          | _        d S d S )Nr7   )r0   r   r   r   r   )r0   r   r   r   rP   T)requires_grad)r9   r:   r0   r   input_resolutionrF   r   use_post_layernormr   rB   rC   norm1r   
modulationr   Identityr   norm2r~   	mlp_ratior   mlpgamma_1gamma_2use_layerscaler?   layerscale_valuer   ones)rH   r0   r   r   r   r   mlp_hidden_dimrJ   s          r"   r:   zFocalNetLayer.__init__  sm     0 .	"(";\#6+@AAA
,#y	
 
 
 9BC))444R[]]\#6+@AAA
S6#3344f#~dhdmnnn  	g<(?%*S//(QaefffDL<(?%*S//(QaefffDLLL	g 	gr!   c           	      V   |\  }}|j         \  }}}|}| j        r|n|                     |          }|                    ||||          }|                     |                              |||z  |          }| j        s|n|                     |          }||                     | j        |z            z   }||                     | j        | j        r(|                     | 	                    |                    n'| 	                    |                     |                    z            z   }|S r]   )
rz   r   r   viewr   r   r   r   r   r   )	rH   r   input_dimensionsru   rv   rW   rY   r3   shortcuts	            r"   r\   zFocalNetLayer.forward  s0   (&2&8#
A| (,'>\||DJJ|D\D\#((VULQQ|4499*funVbcc+/+B`||

S_H`H`  $..1L"M"MM#dnnL595Lttzz$((<00111RVRZRZ[_[e[efr[s[sRtRtv'
 '
 

 r!   )r   )r   r   r   r   r:   r\   r`   ra   s   @r"   r   r   s  s]         g g g g g g@      r!   r   c                   b     e Zd Z fdZdej        deeef         deej                 fdZ xZ	S )FocalNetStagec           
        	 t                                                       | _        t          j                  | _        fdt          | j                  D             }|         | j        dz
  k     r|dz            nd }| j        dz
  k     rt          nd }d t          j	        dj
        t          j                  d          D             }|t          j        d                    t          j        d dz                               	t          j        	fdt          j                           D                       | _        | |d|d	j        d
          | _        nd | _        d
| _        d S )Nc                 *    g | ]}j         d |z  z  S )re   )r4   ).0ir0   s     r"   
<listcomp>z*FocalNetStage.__init__.<locals>.<listcomp>  s%    OOO1V%A.OOOr!   r   c                 6    g | ]}|                                 S r    )item)r   r   s     r"   r   z*FocalNetStage.__init__.<locals>.<listcomp>  s     lllAqvvxxlllr!   r   cpu)r   c                 r    g | ]3}t          t          t                    r|         n           4S ))r0   r   r   r   r   )r   rj   list)r   r   r0   r   r   r   r   s     r"   r   z*FocalNetStage.__init__.<locals>.<listcomp>  s`     	 	 	  !%5.8D.I.IXilly  	 	 	r!   re   TF)r0   r1   r2   r3   r4   rq   r5   r6   )r9   r:   r0   lendepths
num_stagesr   r;   r   linspacedrop_path_ratesumr   r   layersr5   
downsamplepointing)rH   r0   r   r   r4   out_dimr   dprr   r   rJ   s    ```    @@r"   r:   zFocalNetStage.__init__  s   fm,,OOOOdo8N8NOOO	+04?Q3F+F+F)EAI&&T1619L1L1L,,SW
 ml63H#fmJ\J\ej!k!k!klllFM&5&122S{QR{9S5T5TTU	m	 	 	 	 	 	 	 	 v}U344	 	 	
 
 !(j+ !%4	 	 	DOO #DOr!   r   r   rM   c                    |\  }}| j         D ]} |||          }|}| j        U|\  }}|                    dd                              |j        d         d||          }|                     |          \  }}n||||f}|||f}|S )Nr   re   r   rO   )r   r   r}   reshaperz   )	rH   r   r   ru   rv   layer_module!hidden_states_before_downsamplingrV   stage_outputss	            r"   r\   zFocalNetStage.forward  s    ( K 	J 	JL(L8HIIMM,9)?&,MFE)33Aq99AA17:B M 04}/M/M,M,, "( >&(IK\]r!   )
r   r   r   r:   r   r_   r   r~   r\   r`   ra   s   @r"   r   r     sw        * * * * *XU\ U3PS8_ Y^_d_kYl        r!   r   c                        e Zd Z fdZ	 	 	 ddej        deeef         dee	         dee	         dee	         d	e
eef         fd
Z xZS )FocalNetEncoderc                 
   t                                                       t          j                  | _        | _        t          j        fdt          | j                  D                       | _	        d| _
        d S )Nc           
      h    g | ].}t          |d          d|z  z  d         d|z  z  f          /S )r   re   r   )r0   r   r   )r   )r   i_layerr0   r=   s     r"   r   z,FocalNetEncoder.__init__.<locals>.<listcomp>  se         !!&/lq'z&BIaLUVX_U_D`%a    r!   F)r9   r:   r   r   r   r0   r   r   r   stagesgradient_checkpointing)rH   r0   r=   rJ   s    ``r"   r:   zFocalNetEncoder.__init__  s    fm,,m      %T_55  	
 	
 ',###r!   FTr   r   output_hidden_states(output_hidden_states_before_downsamplingreturn_dictrM   c                    |rdnd }|rdnd }|r?|j         \  }}	}
 |j        |g||
R  }|                    dddd          }||fz  }||fz  }t          | j                  D ]\  }} |||          }|d         }|d         }|d         }|d         |d         f}|rP|rN|j         \  }}	}
 |j        |g|d         |d         f|
R  }|                    dddd          }||fz  }||fz  }|rA|s?|j         \  }}	}
 |j        |g||
R  }|                    dddd          }||fz  }||fz  }|st          d ||fD                       S t          |||	          S )
Nr    r   r   r   re   rO   c              3      K   | ]}||V  	d S r]   r    )r   vs     r"   	<genexpr>z*FocalNetEncoder.forward.<locals>.<genexpr><  s"      XXq!-----XXr!   )r   r   r   )rz   r   r   	enumerater  r   r   )rH   r   r   r	  r
  r  all_hidden_statesall_reshaped_hidden_statesrW   rY   hidden_sizereshaped_hidden_stater   stage_moduler  r   rV   s                    r"   r\   zFocalNetEncoder.forward  s`    #7@BBD+?%IRRT" 	C)6)<&J;$6M$6z$bDT$bVa$b$b$b!$9$A$A!Q1$M$M!-!11&+@*BB&(55 	G 	GOA|(L8HIIM)!,M0=a0@- -a 0 1" 57H7LM# G(P G-N-T*
A{ )O(I(N)"3A"68I!8L!M)OZ) ) )% )>(E(EaAq(Q(Q%!&G%II!*/D.FF**% G.V G-:-@*
A{(:(::(fHX(fZe(f(f(f%(=(E(EaAq(Q(Q%!m%55!*/D.FF* 	YXX]4E$FXXXXXX$++#=
 
 
 	
r!   )FFT)r   r   r   r:   r   r_   r   r~   r   boolr   r   r\   r`   ra   s   @r"   r  r    s        , , , , ,, 05CH&*5
 5
|5
  S/5
 'tn	5

 3;4.5
 d^5
 
u++	,5
 5
 5
 5
 5
 5
 5
 5
r!   r  c                   2    e Zd ZU eed<   dZdZdZdgZd Z	dS )FocalNetPreTrainedModelr0   focalnetrK   Tr   c                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS t          |t                    r)|j         |j        j        
                                 dS dS t          |t                    r`| j        j        rV|j        j                            | j        j                   |j        j                            | j        j                   dS dS dS )zInitialize the weightsr   )r   stdNrP   )rj   r   r   ro   weightdatanormal_r0   initializer_ranger   zero_rB   fill_r.   rA   r   r   r   r   r   )rH   modules     r"   _init_weightsz%FocalNetPreTrainedModel._init_weightsM  sy   fry")455 	H M&&CT[5R&SSS{& &&((((( '&-- 		HK""$$$M$$S))))) 233 	H ,!&,,..... -,.. 	H{) H#))$+*FGGG#))$+*FGGGGG	H 	HH Hr!   N)
r   r   r   r   r   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr$  r    r!   r"   r  r  E  sP         "$O&*#()H H H H Hr!   r  c                        e Zd Zd fd	Zd Ze	 	 	 	 ddeej                 deej	                 dee
         d	ee
         d
eeef         f
d            Z xZS )FocalNetModelTFc                    t                                          |           || _        t          |j                  | _        t          |j        d| j        dz
  z  z            | _        t          ||          | _
        t          || j
        j                  | _        t          j        | j        |j                  | _        |rt          j        d          nd| _        |                                  dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        re   r   )rI   r7   N)r9   r:   r0   r   r   r   r~   r4   num_featuresr.   rU   r  r>   encoderr   rB   rC   r   AdaptiveAvgPool1dpooler	post_init)rH   r0   add_pooling_layerrI   rJ   s       r"   r:   zFocalNetModel.__init__c  s     	   fm,, 0119L3M MNN,VNSSS&vt/IJJd&7V=RSSS1BLb*1--- 	r!   c                     | j         j        S r]   )rU   r<   r   s    r"   get_input_embeddingsz"FocalNetModel.get_input_embeddingsx  s    //r!   NrK   rL   r	  r  rM   c                    ||n| j         j        }||n| j         j        }|t          d          |                     ||          \  }}|                     ||||          }|d         }|                     |          }d}	| j        >|                     |                    dd                    }	t          j
        |	d          }	|s||	f|dd         z   }
|
S t          ||	|j        |j                  S )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rL   r	  r  r   r   re   )r   r%   r   r   )r0   r	  use_return_dictr{   rU   r-  r   r/  r}   r   r|   r$   r   r   )rH   rK   rL   r	  r  embedding_outputr   encoder_outputssequence_outputpooled_outputr   s              r"   r\   zFocalNetModel.forward{  s1    %9$D  $+Jj 	 &1%<kk$+B]?@@@-1__\[j_-k-k**,,!5#	 ' 
 
 *!,..99;" KK(A(A!Q(G(GHHM!M-;;M 	%}58KKFM"-')7#2#I	
 
 
 	
r!   )TFNNNN)r   r   r   r:   r3  r   r   r   r   r^   r  r   r   r$   r\   r`   ra   s   @r"   r*  r*  a  s             *0 0 0  596:/3&*.
 .
u01.
 "%"23.
 'tn	.

 d^.
 
u))	*.
 .
 .
 ^.
 .
 .
 .
 .
r!   r*  a  
    FocalNet Model with a decoder on top for masked image modeling.

    This follows the same implementation as in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                        e Zd Z fdZe	 	 	 	 d	deej                 deej                 dee	         dee	         de
eef         f
d            Z xZS )
FocalNetForMaskedImageModelingc                    t                                          |           t          |dd          | _        t	          |j                  | _        t          |j        d| j        dz
  z  z            }t          j
        t          j        ||j        dz  |j        z  d          t          j        |j                            | _        |                                  d S )NFT)r1  rI   re   r   )in_channelsout_channelsrg   )r9   r:   r*  r  r   r   r   r~   r4   r   r   ro   encoder_strider3   PixelShuffledecoderr0  )rH   r0   r,  rJ   s      r"   r:   z'FocalNetForMaskedImageModeling.__init__  s       %fVZ[[[fm,,6+aDOa4G.HHII}I(v7La7ORXRe7est   OF122	
 
 	r!   NrK   rL   r	  r  rM   c                    ||n| j         j        }|                     ||||          }|d         }|                    dd          }|j        \  }}}	t          j        |	dz            x}
}|                    |||
|          }|                     |          }d}|| j         j	        | j         j
        z  }|                    d||          }|                    | j         j
        d                              | j         j
        d                              d                                          }t          j                            ||d	          }||z                                  |                                d
z   z  | j         j        z  }|s|f|dd         z   }||f|z   n|S t'          |||j        |j                  S )a?  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, FocalNetConfig, FocalNetForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-base-simmim-window6-192")
        >>> config = FocalNetConfig()
        >>> model = FocalNetForMaskedImageModeling(config)

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```N)rL   r	  r  r   r   re   g      ?rO   none)	reductiongh㈵>)r(   r)   r   r   )r0   r6  r  r}   rz   mathfloorr   rC  r1   r2   repeat_interleaverS   r   r   rs   l1_lossr   r3   r'   r   r   )rH   rK   rL   r	  r  outputsr9  rW   r3   sequence_lengthru   rv   reconstructed_pixel_valuesmasked_im_lossrQ   r[   reconstruction_lossr   s                     r"   r\   z&FocalNetForMaskedImageModeling.forward  s   H &1%<kk$+B]--+!5#	   
 
 "!*)33Aq994C4I1
L/OS$8999)11*lFTYZZ &*\\/%B%B"&;)T[-CCD-55b$EEO11$+2H!LL""4;#91==1	  #%-"7"7F`lr"7"s"s1D8==??488::PTCTUX\XcXppN 	Z02WQRR[@F3A3M^%..SYY05!/#*#A	
 
 
 	
r!   r;  )r   r   r   r:   r   r   r   r   r^   r  r   r   r'   r\   r`   ra   s   @r"   r=  r=    s            "  596:/3&*L
 L
u01L
 "%"23L
 'tn	L

 d^L
 
u77	8L
 L
 L
 ^L
 L
 L
 L
 L
r!   r=  z
    FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
    ImageNet.
    c                        e Zd Z fdZe	 	 	 	 d	deej                 deej                 dee	         dee	         de
eef         f
d            Z xZS )
FocalNetForImageClassificationc                 @   t                                          |           |j        | _        t          |          | _        |j        dk    r$t          j        | j        j        |j                  nt          j                    | _	        | 
                                 d S )Nr   )r9   r:   
num_labelsr*  r  r   r   r,  r   
classifierr0  rH   r0   rJ   s     r"   r:   z'FocalNetForImageClassification.__init__%  s        +%f-- IOHY\]H]H]BIdm0&2CDDDcecncpcp 	
 	r!   NrK   labelsr	  r  rM   c                 :   ||n| j         j        }|                     |||          }|d         }|                     |          }d}||                     ||| j                   }|s|f|dd         z   }	||f|	z   n|	S t          |||j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr5  r   re   )r(   r,   r   r   )r0   r6  r  rT  loss_functionr+   r   r   )
rH   rK   rV  r	  r  rK  r:  r,   r(   r   s
             r"   r\   z&FocalNetForImageClassification.forward3  s     &1%<kk$+B]--!5#   
 
  
//%%ffdkBBD 	FY,F)-)9TGf$$vE,!/#*#A	
 
 
 	
r!   r;  )r   r   r   r:   r   r   r   r   
LongTensorr  r   r   r+   r\   r`   ra   s   @r"   rQ  rQ    s              59-1/3&*&
 &
u01&
 )*&
 'tn	&

 d^&
 
u33	4&
 &
 &
 ^&
 &
 &
 &
 &
r!   rQ  zG
    FocalNet backbone, to be used with frameworks like X-Decoder.
    c            
       x     e Zd ZdZdef fdZe	 	 d
dej        de	e
         de	e
         defd	            Z xZS )FocalNetBackboneFr0   c                    t                                          |           t                                          |           |j        g|j        z   | _        t          |          | _        |                                  d S r]   )	r9   r:   _init_backboner4   hidden_sizesr,  r*  r  r0  rU  s     r"   r:   zFocalNetBackbone.__init__e  sp       v&&&#-.1DD%f-- 	r!   NrK   r	  r  rM   c                 @   ||n| j         j        }||n| j         j        }|                     |dd          }|j        }d}t          | j                  D ]\  }}|| j        v r|||         fz  }|s|f}	|r|	|j        fz  }	|	S t          ||r|j        ndd          S )aj  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
        >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr5  r    )feature_mapsr   
attentions)
r0   r6  r	  r  r   r  stage_namesr   r   r
   )
rH   rK   r	  r  rK  r   r`  idxstager   s
             r"   r\   zFocalNetBackbone.forwardo  s    2 &1%<kk$+B]$8$D  $+Jj 	 --4UY-ZZ6#D$455 	6 	6JC)))s!3 55 	"_F# 37022M%3GQ'//T
 
 
 	
r!   )NN)r   r   r   has_attentionsr   r:   r   r   r_   r   r  r
   r\   r`   ra   s   @r"   r[  r[  ]  s         N~        04&*	0
 0
l0
 'tn0
 d^	0

 
0
 0
 0
 ^0
 0
 0
 0
 0
r!   r[  )rQ  r=  r[  r*  r  )r   F)5r   collections.abcrk   rG  dataclassesr   typingr   r   r   r   activationsr   modeling_layersr	   modeling_outputsr
   modeling_utilsr   utilsr   r   r   utils.backbone_utilsr   configuration_focalnetr   
get_loggerr   loggerr   r$   r'   r+   Moduler.   r;   r_   r   r  r   r   r   r   r   r   r  r  r*  r=  rQ  r[  __all__r    r!   r"   <module>rt     s          ! ! ! ! ! ! " " " " " " " "        ! ! ! ! ! ! 9 9 9 9 9 9 . . . . . . - - - - - - 9 9 9 9 9 9 9 9 9 9 1 1 1 1 1 1 2 2 2 2 2 2 
	H	%	%   
F F F F FK F F  F   
F F F F F+ F F  F$   
F F F F F F F  F(   
F F F F FK F F  F(%- %- %- %- %- %- %- %-PD- D- D- D- D-bi D- D- D-P U\ e T V[Vb    *% % % % %ry % % %D D D D D D D DN    ")   &B B B B BBI B B BJ? ? ? ? ?. ? ? ?DH
 H
 H
 H
 H
bi H
 H
 H
V H H H H Ho H H H6 H
 H
 H
 H
 H
+ H
 H
 H
V   _
 _
 _
 _
 _
%< _
 _
 _
D   7
 7
 7
 7
 7
%< 7
 7
 7
t   
>
 >
 >
 >
 >
. >
 >
 
>
B  r!   