
     `i              	       t   d Z ddlZddlZddlZddlmZ ddlmZm	Z	 ddl
Z
ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'  e"j(        e)          Z*e e!d           G d de                                  Z+dQde
j        de,de-de
j        fdZ. G d dej/                  Z0 G d dej/                  Z1 G d  d!ej/                  Z2 G d" d#ej/                  Z3 G d$ d%e3          Z4 G d& d'ej/                  Z5e3e4d(Z6 G d) d*ej/                  Z7 G d+ d,ej/                  Z8 G d- d.ej/                  Z9 G d/ d0e          Z: G d1 d2ej/                  Z; G d3 d4ej/                  Z<e! G d5 d6e                      Z=e! G d7 d8e=                      Z> G d9 d:ej/                  Z? e!d;           G d< d=e=                      Z@ e!d>           G d? d@e=                      ZA G dA dBej/                  ZB G dC dDej/                  ZC G dE dFej/                  ZD G dG dHej/                  ZE G dI dJej/                  ZFe! G dK dLe=                      ZG e!dM           G dN dOe=e%                      ZHg dPZIdS )RzPyTorch BEiT model.    N)	dataclass)OptionalUnion)Tensornn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedLMOutputSemanticSegmenterOutput)PreTrainedModel)#compile_compatible_method_lru_cache find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int)BackboneMixin   )
BeitConfigz-
    Class for outputs of [`BeitModel`].
    )custom_introc                       e Zd ZdZdS )BeitModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)__name__
__module____qualname____doc__     z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/beit/modeling_beit.pyr   r   /   s           r$   r           Finput	drop_probtrainingreturnc                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r&   r   r   r   )dtypedevice)shapendimtorchrandr-   r.   floor_div)r'   r(   r)   	keep_probr/   random_tensoroutputs          r%   	drop_pathr8   >   s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FMr$   c                   j     e Zd ZdZd	dee         ddf fdZdej        dej        fdZ	de
fdZ xZS )
BeitDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr(   r*   c                 V    t                                                       || _        d S N)super__init__r(   )selfr(   	__class__s     r%   r>   zBeitDropPath.__init__U   s$    "r$   hidden_statesc                 8    t          || j        | j                  S r<   )r8   r(   r)   r?   rA   s     r%   forwardzBeitDropPath.forwardY   s    FFFr$   c                     d| j          S )Nzp=)r(   r?   s    r%   
extra_reprzBeitDropPath.extra_repr\   s    $DN$$$r$   r<   )r   r    r!   r"   r   floatr>   r1   r   rD   strrG   __classcell__r@   s   @r%   r:   r:   R   s        bb# #(5/ #T # # # # # #GU\ Gel G G G G%C % % % % % % % %r$   r:   c            	            e Zd ZdZdeddf fdZdej        dededej        fd	Z		 	 dd
ej        de
ej                 de
e         dej        fdZ xZS )BeitEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    configr*   Nc                    t                                                       t          j        t	          j        dd|j                            | _        |j        r3t          j        t	          j        dd|j                            | _	        nd | _	        t          |          | _        |j        | _        t          |j        t          j        j                  r|j        n|j        |j        f| _        | j        j        }|j        r6t          j        t	          j        d|dz   |j                            | _        nd | _        t          j        |j                  | _        d S )Nr   )r=   r>   r   	Parameterr1   zeroshidden_size	cls_tokenuse_mask_token
mask_tokenBeitPatchEmbeddingspatch_embeddings
patch_size
isinstance
image_sizecollectionsabcIterablenum_patches use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probdropout)r?   rN   r^   r@   s      r%   r>   zBeitEmbeddings.__init__h   s(   ek!Q8J&K&KLL  	# l5;q!V=O+P+PQQDOO"DO 3F ; ; + &+[_-EFF8F#V%67 	
 +72 	,')|EK;QR?TZTf4g4g'h'hD$$'+D$z&"<==r$   
embeddingsheightwidthc                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        z  }	|| j        z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r	      bicubicFsizemodealign_cornersdim)r/   r`   r1   jit
is_tracingrX   r   reshapepermuter   
functionalinterpolateviewcat)r?   rd   re   rf   r^   num_positionsclass_pos_embedpatch_pos_embedrp   
new_height	new_widthsqrt_num_positionss               r%   interpolate_pos_encodingz'BeitEmbeddings.interpolate_pos_encoding   sr    !&q)A-06q9A= y##%% 	,+*F*F6UZ??++2111bqb592111abb59r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr$   pixel_valuesbool_masked_posr   c                 B   | j         |t          j        d           |j        \  }}}}|                     |          \  }\  }}	|                                \  }
}}|R| j                            |
|d          }|                    d          	                    |          }|d|z
  z  ||z  z   }| j
                            |
dd          }t          j        ||fd          }| j         ||                     |||          z   }|                     |          }|||	ffS )Nz`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always interpolated to the input image size. The argument will be removed in transformers v4.51.0.rh   r   ro   )r`   warningswarnr/   rW   rl   rU   expand	unsqueezetype_asrS   r1   rx   r   rc   )r?   r   r   r   _re   rf   rd   patch_heightpatch_width
batch_sizeseq_lenmask_tokensw
cls_tokenss                  r%   rD   zBeitEmbeddings.forward   sD    #/4L4XMn  
 +01fe262G2G2U2U/
/\;!+!2!2
GQ&/00WbIIK))"--55kBBA#q1u-a?J^**:r2>>
Y
J7Q???
#/#d&C&CJPVX]&^&^^J\\*--
L+666r$   NN)r   r    r!   r"   r   r>   r1   r   intr   r   
BoolTensorboolrD   rJ   rK   s   @r%   rM   rM   b   s         
>z >d > > > > > >.&D5< &D &DUX &D]b]i &D &D &D &DV 7;37	7 7l7 "%"237 #+4.	7
 
7 7 7 7 7 7 7 7r$   rM   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )rV   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|d         |d         z  |d         |d         z  f}|| _        || _        || _        || _
        || _        t          j        ||||          | _        d S )Nr   r   kernel_sizestride)r=   r>   rZ   rX   num_channelsrR   rY   r[   r\   r]   r^   patch_shaper   Conv2d
projection)	r?   rN   rZ   rX   r   rR   r^   r   r@   s	           r%   r>   zBeitPatchEmbeddings.__init__   s   !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&)L+:^hiiir$   r   r*   c                 
   |j         \  }}}}|| j        k    rt          d          |                     |          }|j         d         |j         d         }}|                    d                              dd          }|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.ri   r	   r   )r/   r   
ValueErrorr   flatten	transpose)	r?   r   r   r   re   rf   rd   r   r   s	            r%   rD   zBeitPatchEmbeddings.forward   s    2>2D/
L&%4,,,w   __\22
$.$4Q$79I!9Lk''**44Q::
L+666r$   )	r   r    r!   r"   r>   r1   r   rD   rJ   rK   s   @r%   rV   rV      sm         j j j j j"7EL 7U\ 7 7 7 7 7 7 7 7r$   rV   c                       e Zd Zddedee         ddf fdZ	 	 	 	 	 ddej        deej                 d	e	d
eej                 de	deee
                  deeej                 eej        ej        f         f         fdZ xZS )BeitSelfAttentionNrN   window_sizer*   c                    t                                                       || _        |j        |j        z  dk    r0t          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        t          j        |j        | j	                  | _        t          j        |j        | j	        d          | _        t          j        |j        | j	                  | _        t          j        |j                  | _        t%          |          | _        | j        rt)          ||          | _        d S d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .F)biasr   )r=   r>   rN   rR   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluera   attention_probs_dropout_probrc   r   has_relative_position_biasBeitRelativePositionBiasrelative_position_biasr?   rN   r   r@   s      r%   r>   zBeitSelfAttention.__init__   se    ::a??PVXhHiHi?76#5 7 737 7 7  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1C%PPPYv143EFF
z&"EFF*.{*;*;'* 	d*B6Wb*c*c*cD'''	d 	dr$   FrA   	head_maskoutput_attentionsr   r   
resolutionc                 r   |j         \  }}}	|                     |                              |d| j        | j                                      dd          }
|                     |                              |d| j        | j                                      dd          }|                     |                              |d| j        | j                                      dd          }t          j	        |
|                    dd                    }|t          j        | j                  z  }| j        rI|\  }}|| j        j        z  || j        j        z  f}||                     |||j         d                   z   }|||z   }t           j                            |d          }|                     |          }|||z  }t          j	        ||          }|                    dddd                                          }|                                d d         | j        fz   } |j        | }|r||fn|f}|S )	Nrh   r   ri   dim_sizero   r   r	   )r/   r   rw   r   r   r   r   r   r1   matmulmathsqrtr   rN   rX   r   r   ru   softmaxrc   rt   
contiguousrl   r   )r?   rA   r   r   r   r   r   r   
seq_lengthr   query_layer	key_layervalue_layerattention_scoresre   rf   r   attention_probscontext_layernew_context_layer_shapeoutputss                        r%   rD   zBeitSelfAttention.forward  sG    %2$7!
JJJ}%%T*b$":D<TUUYq!__ 	 HH]##T*b$":D<TUUYq!__ 	 JJ}%%T*b$":D<TUUYq!__ 	 !<Y5H5HR5P5PQQ+di8P.Q.QQ * 	&MFE!T[%;;UdkF\=\]K/$2M2M5@STU@V 3N 3 3  
 "-/2HH -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CD6G]=/22mM]r$   r<   NFNFN)r   r    r!   r   r   tupler>   r1   r   r   r   r   rD   rJ   rK   s   @r%   r   r      s       d dz d dSW d d d d d d4 -1"'9=).+/> >|> EL)>  	>
 !) 6> #'> U3Z(> 
uU\"E%,*D$EE	F> > > > > > > >r$   r   c                        e Zd Z	 	 	 	 	 ddej        deej                 dedeej                 dedeee                  d	e	eej                 eej        ej        f         f         f fd
Z
 xZS )BeitSdpaSelfAttentionNFrA   r   r   r   r   r   r*   c           	      ~   |s|At                               d           t                                          ||||||          S |j        \  }}}	|                     |                              |d| j        | j                  	                    dd          }
| 
                    |                              |d| j        | j                  	                    dd          }|                     |                              |d| j        | j                  	                    dd          }d }| j        rF|\  }}|| j        j        z  || j        j        z  f}|                     |||j        d                   }|
||}n||z  }dt!          j        | j                  z  }t$          j        j                            |
|||| j        r| j        j        ndd|	          }|                    d
ddd                                          }|                                d d         | j        fz   } |j        | }|d fS )Na  `BeitSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)rA   r   r   r   r   r   rh   r   ri   r   r&   F)	attn_mask	dropout_p	is_causalscaler   r	   r   )loggerwarning_oncer=   rD   r/   r   rw   r   r   r   r   r   r   rN   rX   r   r   r   r1   r   ru   scaled_dot_product_attentionr)   r   rt   r   rl   r   )r?   rA   r   r   r   r   r   r   r   r   r   r   r   	attn_biasre   rf   r   scalingr   r   r@   s                       r%   rD   zBeitSdpaSelfAttention.forwardH  sp     		 5w   77??+#"3'=)A% #    %2$7!
JJJ}%%T*b$":D<TUUYq!__ 	 HH]##T*b$":D<TUUYq!__ 	 JJ}%%T*b$":D<TUUYq!__ 	 	* 	&MFE!T[%;;UdkF\=\]K335@STU@V 4  I
 "- 2		33	di 8999+HHBF-Xdk>>UX I 
 
 &--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CDd""r$   r   )r   r    r!   r1   r   r   r   r   r   r   rD   rJ   rK   s   @r%   r   r   G  s         -1"'9=).+/F# F#|F# EL)F#  	F#
 !) 6F# #'F# U3Z(F# 
uU\"E%,*D$EE	FF# F# F# F# F# F# F# F# F# F#r$   r   c                   `     e Zd ZdZdeddf fdZd	dej        dej        dej        fdZ xZ	S )
BeitSelfOutputz
    The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rN   r*   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S r<   )	r=   r>   r   r   rR   densera   rb   rc   r?   rN   r@   s     r%   r>   zBeitSelfOutput.__init__  sJ    Yv163EFF
z&"<==r$   rA   input_tensorc                 Z    |                      |          }|                     |          }|S r<   r   rc   )r?   rA   r   gammas       r%   rD   zBeitSelfOutput.forward  *    

=11]33r$   r<   )
r   r    r!   r"   r   r>   r1   r   rD   rJ   rK   s   @r%   r   r     s         
>z >d > > > > > >
 U\  ^c^j        r$   r   )eagersdpac                       e Zd Zddedee         ddf fdZd Z	 	 	 	 	 ddej	        d	eej	                 d
e
deej	                 de
deee                  deeej	                 eej	        ej	        f         f         fdZ xZS )BeitAttentionNrN   r   r*   c                     t                                                       t          |j                 ||          | _        t          |          | _        t                      | _        d S )Nr   )	r=   r>   BEIT_SELF_ATTENTION_CLASSES_attn_implementation	attentionr   r7   setpruned_headsr   s      r%   r>   zBeitAttention.__init__  sX    4V5PQRXfqrrr$V,,EEr$   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   ro   )lenr   r   r   r   r   r   r   r   r   r7   r   r   union)r?   headsindexs      r%   prune_headszBeitAttention.prune_heads  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r$   FrA   r   r   r   r   r   c                     |                      ||||||          }|                     |d         |          }|f|dd          z   }	|	S )Nr   r   )r   r7   )
r?   rA   r   r   r   r   r   self_outputsattention_outputr   s
             r%   rD   zBeitAttention.forward  s]     ~~9&79OQiku
 
  ;;|AFF#%QRR(88r$   r<   r   )r   r    r!   r   r   r   r>   r   r1   r   r   r   r   rD   rJ   rK   s   @r%   r   r     s       " "z " "SW " " " " " "; ; ;* -1"'9=).+/ | EL)  	
 !) 6 #' U3Z( 
uU\"E%,*D$EE	F       r$   r   c                   L     e Zd Zdeddf fdZdej        dej        fdZ xZS )BeitIntermediaterN   r*   Nc                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r<   )r=   r>   r   r   rR   intermediate_sizer   rY   
hidden_actrI   r
   intermediate_act_fnr   s     r%   r>   zBeitIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r$   rA   c                 Z    |                      |          }|                     |          }|S r<   )r   r   rC   s     r%   rD   zBeitIntermediate.forward  s,    

=1100??r$   	r   r    r!   r   r>   r1   r   rD   rJ   rK   s   @r%   r   r     sq        9z 9d 9 9 9 9 9 9U\ el        r$   r   c                   L     e Zd Zdeddf fdZdej        dej        fdZ xZS )
BeitOutputrN   r*   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S r<   )
r=   r>   r   r   r   rR   r   ra   rb   rc   r   s     r%   r>   zBeitOutput.__init__  sJ    Yv79KLL
z&"<==r$   rA   c                 Z    |                      |          }|                     |          }|S r<   r   rC   s     r%   rD   zBeitOutput.forward  r   r$   r   rK   s   @r%   r   r     sq        >z >d > > > > > >
U\ el        r$   r   c                       e Zd ZdZddedee         deddf fdZ	 	 	 	 	 dd
e	j
        dee	j
                 dedee	j
                 dedeeeef                  deee	j
                 ee	j
        e	j
        f         f         fdZ xZS )	BeitLayerz?This corresponds to the Block class in the timm implementation.Nr&   rN   r   drop_path_rater*   c                    t                                                       |j        | _        d| _        t	          ||          | _        t          |          | _        t          |          | _	        t          j        |j        |j                  | _        |dk    rt          |          nt          j                    | _        t          j        |j        |j                  | _        |j        }|dk    rlt          j        |t+          j        |j                  z  d          | _        t          j        |t+          j        |j                  z  d          | _        d S d\  | _        | _        d S )	Nr   r   epsr&   r   T)requires_gradr   )r=   r>   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r7   r   	LayerNormrR   layer_norm_epslayernorm_beforer:   Identityr8   layernorm_afterlayer_scale_init_valuerP   r1   oneslambda_1lambda_2)r?   rN   r   r  init_valuesr@   s        r%   r>   zBeitLayer.__init__  s:   '-'E$&v;GGG,V44 (( "V-?VEZ [ [ [9G#9M9Mn555SUS^S`S`!|F,>FDYZZZ3??Luz&BT7U7U)UeijjjDMLuz&BT7U7U)UeijjjDMMM+5(DM4===r$   FrA   r   r   r   r   r   c                    |                      |                     |          |||||          }|d         }|dd          }	| j        
| j        |z  }|                     |          |z   }|                     |          }
|                     |
          }
|                     |
          }
| j        
| j        |
z  }
|                     |
          |z   }
|
f|	z   }	|	S )N)r   r   r   r   r   r   )r   r  r  r8   r  r  r7   r  )r?   rA   r   r   r   r   r   self_attention_outputsr   r   layer_outputs              r%   rD   zBeitLayer.forward  s     "&!!-00/#9%=! "0 "
 "
 2!4(, =$#}/?? '788=H ++M::((66{{<00=$=<7L ~~l33mC/G+r$   )Nr&   r   )r   r    r!   r"   r   r   r   rH   r>   r1   r   r   r   r   rD   rJ   rK   s   @r%   r  r    s       II6 6z 6 6`e 6pt 6 6 6 6 6 6* -1"'9=).04) )|) EL))  	)
 !) 6) #') U38_-) 
uU\"E%,*D$EE	F) ) ) ) ) ) ) )r$   r  c                        e Zd Zdededdf fdZ ed          deeef         dej	        fd            Z
dd
edej	        fdZ xZS )r   rN   r   r*   Nc                    t                                                       || _        d|d         z  dz
  d|d         z  dz
  z  dz   | _        t	          j        t          j        | j        |j                            | _	        d S )Nri   r   r   r	   )
r=   r>   r   num_relative_distancer   rP   r1   rQ   r   relative_position_bias_tabler   s      r%   r>   z!BeitRelativePositionBias.__init__4  s    &&'+a.&81&<[QR^ASVWAW%X[\%\",.LK2F4NOO-
 -
)))r$   
   )maxsizec                 ^   d|d         z  dz
  d|d         z  dz
  z  dz   }|d         |d         z  }t          j        t          j        |d                   t          j        |d                   d          }t          j        |          }t          j        |d          }|dddddf         |dddddf         z
  }|                    ddd                                          }|dddddfxx         |d         dz
  z  cc<   |dddddfxx         |d         dz
  z  cc<   |dddddfxx         d|d         z  dz
  z  cc<   t          j        |dz   fdz  |j                  }|	                    d	          |ddddf<   |dz
  |dddf<   |dz
  |dddf<   |dz
  |d
<   |S )z
        This method creates the relative position index, modified to support arbitrary window sizes,
        as introduced in [MiDaS v3.1](https://huggingface.co/papers/2307.14460).
        ri   r   r   r	   ij)indexingN)rl   r-   rh   )r   r   )
r1   meshgridarangestackr   rt   r   rQ   r-   sum)	r?   r   r  window_areagridcoordscoords_flattenrelative_coordsrelative_position_indexs	            r%    generate_relative_position_indexz9BeitRelativePositionBias.generate_relative_position_index=  s    "#[^!3a!7AA<NQR<R SVW W "!n{1~5~el;q>::ELUV<X<XcghhhT""vq11(AAAt4~aaaqqqj7QQ)11!Q::EEGG111a   KNQ$66   111a   KNQ$66   111a   AA$6$::   "'+K!O3E3IQ`Qf"g"g"g*9*=*=b*A*AABB')>)B122&)>)BA&(=(A%&&r$   Fr   c                 0   d| j         d         z  dz
  }d| j         d         z  dz
  }d|d         z  dz
  }d|d         z  dz
  }| j        }| j        }	||z  dz   }
|d|	dz
           }|                    d||d                              dddd          }t
          j                            |t          |          t          |          fd          }|                    dddd                              |
dz
  d          }t          j
        |||	dz
  d         g          }|                     |          }||                    d                   }|                    |d         |d         z  dz   |d         |d         z  dz   d          }|                    ddd                                          }|rKt
          j                            |                    d          ||fdd	
                              d          }|                    d          S )zu
        Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
        ri   r   r   r	   Nrh   bilinear)rl   rm   Frk   )r   r  r  rs   rt   r   ru   rv   r   r1   rx   r,  rw   r   r   squeeze)r?   r   r   r   
old_height	old_widthr|   r}    old_relative_position_bias_tableold_num_relative_distancenew_num_relative_distanceold_sub_tablenew_sub_table new_relative_position_bias_tabler+  r   s                   r%   rD   z BeitRelativePositionBias.forwardV  s`    )!,,q0
(++a/	Q'!+
A&*	+/+L($($>!$.$:Q$>!89X;TWX;X9XY%--aJKKSSTUWXZ[]^__11:!6!6	)8L8L MT^ 2 
 
 &--aAq99AAB[^_B_acdd+09<=VYZ=Z=\=\]^,
 ,
( #'"G"G"T"T!ABYB^B^_aBbBb!c "8!<!<N[^+a/Q+a.1PST1TVX"
 "
 "8!?!?1a!H!H!S!S!U!U# 	%']%>%>&0033)#	 &? & &
 gajj # &//222r$   )FN)r   r    r!   r   r   r>   r   r   r1   r   r,  r   rD   rJ   rK   s   @r%   r   r   3  s        
z 
 
$ 
 
 
 
 
 
 )(444'E#s(O 'PUP\ ' ' ' 54'0-3 -3T -3]b]i -3 -3 -3 -3 -3 -3 -3 -3r$   r   c                        e Zd Zddedee         ddf fdZ	 	 	 	 	 	 ddej        d	eej                 d
e	de	de	deee
e
f                  de	deeef         fdZ xZS )BeitEncoderNrN   r   r*   c                    t                                                       | _        j        | _        | j        rt                    | _        d t          j        dj	        j
        d          D             t          j        fdt          j
                  D                       | _        d| _        d S )Nr   c                 6    g | ]}|                                 S r#   )item).0xs     r%   
<listcomp>z(BeitEncoder.__init__.<locals>.<listcomp>  s     rrrAqvvxxrrrr$   r   cpu)r.   c                 R    g | ]#}t          j        rnd |                   $S )N)r   r  )r  use_relative_position_bias)r=  irN   dprr   s     r%   r?  z(BeitEncoder.__init__.<locals>.<listcomp>  sT         /5/P ZVZ#&q6    r$   F)r=   r>   rN   !use_shared_relative_position_biasr   r   r   r1   linspacer  num_hidden_layersr   
ModuleListrangelayergradient_checkpointing)r?   rN   r   rD  r@   s    ``@r%   r>   zBeitEncoder.__init__  s    *0*R'* 	d*B6Wb*c*c*cD' sr63H&Jbkp!q!q!qrrr]      v788  	
 	

 ',###r$   FTrA   r   r   output_hidden_statesr   r   return_dictc           	         |rdnd }|rdnd }	t          | j                  D ]\  }
}|r||fz   }| j        rG|\  }}|| j        j        z  || j        j        z  f}|                     |||j        d                   }nd }|||
         nd } |||||||          }|d         }|r|	|d         fz   }	|r||fz   }|st          d |||	fD                       S t          |||	          S )Nr#   r   )r   r   )r   r   r   r   r   r   c              3      K   | ]}||V  	d S r<   r#   )r=  vs     r%   	<genexpr>z&BeitEncoder.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr$   )last_hidden_staterA   
attentions)		enumeraterJ  r   rN   rX   r   r/   r   r   )r?   rA   r   r   rL  r   r   rM  all_hidden_statesall_self_attentionsrC  layer_modulere   rf   r   r   layer_head_masklayer_outputss                     r%   rD   zBeitEncoder.forward  s    #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!. . *%)??$+J`A`a)-)D)D:R]j]pqr]s *E * *&& *.&.7.CillO(L)"3'=)A%  M *!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r$   r<   )NFFFNT)r   r    r!   r   r   r   r>   r1   r   r   r   r   r   rD   rJ   rK   s   @r%   r9  r9    s        , ,z , ,SW , , , , , ,0 -1"'%*).04 3
 3
|3
 EL)3
  	3

 #3
 #'3
 U38_-3
 3
 
uo%	&3
 3
 3
 3
 3
 3
 3
 3
r$   r9  c                   <    e Zd ZU eed<   dZdZdZdgZdgZ	dZ
d ZdS )	BeitPreTrainedModelrN   beitr   Tr  z.*relative_position_index.*c                    t          |t          j        t          j        t          j        f          rT|j        j                            d| j        j	                   |j
         |j
        j                                         dS dS t          |t          j                  r_|j        j                            d| j        j	                   |j        +|j        j        |j                                                  dS dS t          |t          j                  r?|j
        j                                         |j        j                            d           dS t          |t                     rl|j        j                                         |j        |j        j                                         |j         |j        j                                         dS dS t          |t(                    r |j        j                                         dS t          |t,                    r[|j        V|j        j                            | j        j                   |j        j                            | j        j                   dS dS dS )zInitialize the weightsr&   )meanstdNg      ?)rY   r   r   r   ConvTranspose2dweightdatanormal_rN   initializer_ranger   zero_	Embeddingpadding_idxr  fill_rM   rS   rU   r`   r   r  r  r  r  r  )r?   modules     r%   _init_weightsz!BeitPreTrainedModel._init_weights  sJ   fry")R5GHII 	O M&&CT[5R&SSS{& &&((((( '&-- 	OM&&CT[5R&SSS!-"6#56<<>>>>> .--- 	OK""$$$M$$S)))))// 	O!''))) ,!&,,...)5*/5577777 65 899 	O/4::<<<<<	** 	O*$**4;+MNNN$**4;+MNNNNN	O 	O**r$   N)r   r    r!   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_sdparj  r#   r$   r%   r[  r[    s]         $O&*#$*H)I&NO O O O Or$   r[  c                        e Zd Zddededdf fdZd Zd Ze	 	 	 	 	 	 dd
e	j
        dee	j                 dee	j
                 dee         dee         dedee         deeef         fd            Z xZS )	BeitModelTrN   add_pooling_layerr*   Nc                    t                                          |           || _        t          |          | _        t          || j        j        j                  | _        |j	        rt          j                    nt          j        |j        |j                  | _        |rt!          |          nd| _        |                                  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   r  N)r=   r>   rN   rM   rd   r9  rW   r   encoderuse_mean_poolingr   r  r  rR   r  	layernorm
BeitPoolerpooler	post_init)r?   rN   rt  r@   s      r%   r>   zBeitModel.__init__  s    
 	   (00"6t7W7cddd $4uBKMMM",vGY_e_t:u:u:u 	 ->Gj(((4 	r$   c                     | j         j        S r<   rd   rW   rF   s    r%   get_input_embeddingszBeitModel.get_input_embeddings      //r$   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrv  rJ  r   r   )r?   heads_to_prunerJ  r   s       r%   _prune_headszBeitModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr$   Fr   r   r   r   rL  r   rM  c           	         ||n| j         j        }||n| j         j        }||n| j         j        }|                     || j         j                  }|                     ||          \  }}	|j        dd         }
|                     |||||
||          }|d         }| 	                    |          }| j
        | 
                    |          nd}|s|||fn|f}||dd         z   S t          |||j        |j                  S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        N)r   ri   )r   r   rL  r   rM  r   r   r   )rR  pooler_outputrA   rS  )rN   r   rL  use_return_dictget_head_maskrG  rd   r/   rv  rx  rz  r   rA   rS  )r?   r   r   r   r   rL  r   rM  embedding_outputr   r   encoder_outputssequence_outputpooled_outputhead_outputss                  r%   rD   zBeitModel.forward  sW    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] &&y$+2OPP	"oolOo\\!!'+
,,/!5!#%= ' 
 
 *!,..998<8OO444UY 	6?L?XO];;_n^pL/!"""555)-')7&1	
 
 
 	
r$   )T)NNNNFN)r   r    r!   r   r   r>   r~  r  r   r1   r   r   r   r   r   r   rD   rJ   rK   s   @r%   rs  rs    s'        z d d      &0 0 0C C C  7;,0,0/3).&*4
 4
l4
 "%"234
 EL)	4

 $D>4
 'tn4
 #'4
 d^4
 
u00	14
 4
 4
 ^4
 4
 4
 4
 4
r$   rs  c                   L     e Zd Zdeddf fdZdej        dej        fdZ xZS )ry  rN   r*   Nc                     t                                                       |j        r t          j        |j        |j                  nd | _        d S )Nr  )r=   r>   rw  r   r  rR   r  rx  r   s     r%   r>   zBeitPooler.__init__R  sJ    KQKblBL+1FGGGGhl 	r$   rA   c                     | j         :|d d dd d d f         }|                      |                    d                    }n|d d df         }|S )Nr   r   )rx  r^  )r?   rA   patch_tokensr  s       r%   rD   zBeitPooler.forwardX  sa    >%(ABB2L NN<+<+<Q+?+?@@MM *!!!Q$/Mr$   r   rK   s   @r%   ry  ry  Q  sq        
z 
d 
 
 
 
 
 
	U\ 	el 	 	 	 	 	 	 	 	r$   ry  a  
    Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
    visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
    predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
    will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.
    c                       e Zd Zdeddf fdZd Ze	 	 	 	 	 	 	 	 ddeej	                 deej
                 d	eej	                 d
eej	                 dee         dee         dedee         deeef         fd            Z xZS )BeitForMaskedImageModelingrN   r*   Nc                 H   t                                          |           |j        | _        t          |d          | _        t          j        |j        |j                  | _	        t          j
        |j        |j                  | _        |                                  d S )NFrt  r  )r=   r>   
num_labelsrs  r\  r   r  rR   r  rx  r   
vocab_sizelm_headr{  r   s     r%   r>   z#BeitForMaskedImageModeling.__init__m  s        +f>>>	 f&8f>STTTy!3V5FGG 	r$   c                     d S r<   r#   rF   s    r%   get_output_embeddingsz0BeitForMaskedImageModeling.get_output_embeddingsz  s    tr$   Fr   r   r   labelsr   rL  r   rM  c	           	         ||n| j         j        }|                     |||||||          }	|	d         }
|                     |
          }
|                     |
ddddf                   }d}| t                      } |||         |          }|s|f|	dd         z   }||f|z   n|S t          |||	j        |	j                  S )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
        >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, logits = outputs.loss, outputs.logits
        >>> list(logits.shape)
        [1, 196, 8192]
        ```N)r   r   r   rL  r   rM  r   r   losslogitsrA   rS  )	rN   r  r\  rx  r  r   r   rA   rS  )r?   r   r   r   r  r   rL  r   rM  r   r  prediction_scoresmasked_lm_lossloss_fctr7   s                  r%   rD   z"BeitForMaskedImageModeling.forward}  s   X &1%<kk$+B]))+/!5%=#  
 
 "!*..99 LLABB)?@@'))H%X&7&H&QQN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
r$   )NNNNNNFN)r   r    r!   r   r>   r  r   r   r1   r   r   r   r   r   r   rD   rJ   rK   s   @r%   r  r  d  s6       z d          046:,0)-,0/3).&*I
 I
u|,I
 "%"23I
 EL)	I

 &I
 $D>I
 'tnI
 #'I
 d^I
 
un$	%I
 I
 I
 ^I
 I
 I
 I
 I
r$   r  z
    Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
    hidden states of the patch tokens) e.g. for ImageNet.
    c                        e Zd Zdeddf fdZe	 	 	 	 	 	 	 ddeej                 deej                 deej                 d	ee	         d
ee	         de	dee	         de
eef         fd            Z xZS )BeitForImageClassificationrN   r*   Nc                 :   t                                          |           |j        | _        t          |d          | _        |j        dk    rt          j        |j        |j                  nt          j                    | _	        | 
                                 d S )NTr  r   )r=   r>   r  rs  r\  r   r   rR   r  
classifierr{  r   s     r%   r>   z#BeitForImageClassification.__init__  s        +f===	 OUN_bcNcNc")F$68IJJJikitiviv 	r$   Fr   r   r  r   rL  r   rM  c                 R   ||n| j         j        }|                     ||||||          }|r|j        n|d         }	|                     |	          }
d}||                     ||
| j                   }|s|
f|dd         z   }||f|z   n|S t          ||
|j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   rL  r   rM  r   ri   r  )	rN   r  r\  r  r  loss_functionr   rA   rS  )r?   r   r   r  r   rL  r   rM  r   r  r  r  r7   s                r%   rD   z"BeitForImageClassification.forward  s    " &1%<kk$+B]))/!5%=#  
 
 2=L--'!*//%%ffdkBBD 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r$   NNNNNFN)r   r    r!   r   r>   r   r   r1   r   r   r   r   r   rD   rJ   rK   s   @r%   r  r    s       
z 
d 
 
 
 
 
 
  04,0)-,0/3).&*+
 +
u|,+
 EL)+
 &	+

 $D>+
 'tn+
 #'+
 d^+
 
u++	,+
 +
 +
 ^+
 +
 +
 +
 +
r$   r  c                        e Zd ZdZ	 	 	 ddededeeeeef         f         deeeeef         ef         d	ed
eeeeef         f         ddf fdZ	de
j        de
j        fdZ xZS )BeitConvModuleaD  
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    r   Fr   in_channelsout_channelsr   paddingr   dilationr*   Nc                     t                                                       t          j        ||||||          | _        t          j        |          | _        t          j                    | _        d S )N)r  r  r   r  r   r  )	r=   r>   r   r   convBatchNorm2dbnReLU
activation)r?   r  r  r   r  r   r  r@   s          r%   r>   zBeitConvModule.__init__  si     	I#%#
 
 
	 ...'))r$   r'   c                     |                      |          }|                     |          }|                     |          }|S r<   )r  r  r  )r?   r'   r7   s      r%   rD   zBeitConvModule.forward)  s8    5!!((r$   )r   Fr   )r   r    r!   r"   r   r   r   rI   r   r>   r1   r   rD   rJ   rK   s   @r%   r  r    s          5601$ $$ $ 3c3h/0	$
 sE#s(OS01$ $ U38_,-$ 
$ $ $ $ $ $*U\ el        r$   r  c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZS )	BeitPyramidPoolingBlock
pool_scaler  channelsr*   Nc                    t                                                       t          j        |          t	          ||d          g| _        t          | j                  D ](\  }}|                     t          |          |           )d S )Nr   r   )	r=   r>   r   AdaptiveAvgPool2dr  layersrT  
add_modulerI   )r?   r  r  r  rC  rJ  r@   s         r%   r>   z BeitPyramidPoolingBlock.__init__2  s     ,,;a@@@
 "$+.. 	+ 	+HAuOOCFFE****	+ 	+r$   r'   c                 4    |}| j         D ]} ||          }|S r<   )r  )r?   r'   hidden_staterJ  s       r%   rD   zBeitPyramidPoolingBlock.forward;  s/    [ 	/ 	/E 5..LLr$   )	r   r    r!   r   r>   r1   r   rD   rJ   rK   s   @r%   r  r  1  s        +3 +S +C +D + + + + + +U\ el        r$   r  c            
       x     e Zd ZdZdeedf         dedededdf
 fd	Zd
ej	        de
ej	                 fdZ xZS )BeitPyramidPoolingModulea  
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    pool_scales.r  r  rn   r*   Nc                 V   t                                                       || _        || _        || _        || _        g | _        t          |          D ]T\  }}t          |||          }| j        	                    |           | 
                    t          |          |           Ud S )N)r  r  r  )r=   r>   r  rn   r  r  blocksrT  r  appendr  rI   )	r?   r  r  r  rn   rC  r  blockr@   s	           r%   r>   z!BeitPyramidPoolingModule.__init__P  s    &*& &{33 	+ 	+MAz+z{emnnnEKu%%%OOCFFE****	+ 	+r$   r>  c                     g }| j         D ]d} ||          }t          j                            ||                                dd          d| j                  }|                    |           e|S )Nri   r.  rk   )r  r   ru   rv   rl   rn   r  )r?   r>  ppm_outsppmppm_outupsampled_ppm_outs         r%   rD   z BeitPyramidPoolingModule.forward\  s{    ; 	/ 	/Cc!ffG " 9 9affhhqrrl4K] !: ! ! OO-....r$   )r   r    r!   r"   r   r   r   r>   r1   r   listrD   rJ   rK   s   @r%   r  r  B  s         
+E#s(O 
+# 
+QT 
+ei 
+nr 
+ 
+ 
+ 
+ 
+ 
+ $u|*<        r$   r  c                   V     e Zd ZdZdeddf fdZd Zdej        dej        fdZ	 xZ
S )	BeitUperHeadz
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://huggingface.co/papers/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    rN   r*   Nc                    t                                                       |j        | _        |j        gdz  | _        |j        | _        d| _        t          j        | j        |j	        d          | _
        t          | j        | j        d         | j        | j                  | _        t          | j        d         t          | j                  | j        z  z   | j        dd          | _        t          j                    | _        t          j                    | _        | j        d d         D ]j}t          || j        d          }t          | j        | j        dd          }| j                            |           | j                            |           kt          t          | j                  | j        z  | j        dd          | _        d S )	N   Fr   r  rh   )rn   r	   r   r  )r=   r>   r  rR   r  r  rn   r   r   r  r  r  psp_modulesr  r   
bottleneckrH  lateral_convs	fpn_convsr  fpn_bottleneck)r?   rN   r  l_convfpn_convr@   s        r%   r>   zBeitUperHead.__init__o  s   !-"./!3*")DM63DRSTTT 4R M,	
 
 
 )R 3t'7#8#84=#HHM	
 
 
  ]__+CRC0 	, 	,K#KANNNF%dmT]PQ[\]]]H%%f---N!!(++++, !!DM1M	
 
 
r$   c                     |d         }|g}|                     |                     |                     t          j        |d          }|                     |          }|S )Nrh   r   ro   )extendr  r1   rx   r  )r?   inputsr>  psp_outsr7   s        r%   psp_forwardzBeitUperHead.psp_forward  s\    2J3((++,,,9X1---**r$   encoder_hidden_statesc                 B    fdt           j                  D                                                                           t	                    }t          |dz
  dd          D ]Z}|dz
           j        dd          }|dz
           t          j        	                    |         |d j
                  z   |dz
  <   [ fdt          |dz
            D             }|                    d                    t          |dz
  dd          D ]F}t          j        	                    ||         |d         j        dd          d j
                  ||<   Gt          j        |d	          }                     |          }                     |          }|S )
Nc                 8    g | ]\  }} ||                   S r#   r#   )r=  rC  lateral_convr  s      r%   r?  z(BeitUperHead.forward.<locals>.<listcomp>  s-    pppq,LL!6q!9::pppr$   r   r   rh   ri   r.  rk   c                 H    g | ]} j         |         |                   S r#   )r  )r=  rC  lateralsr?   s     r%   r?  z(BeitUperHead.forward.<locals>.<listcomp>  s/    \\\q%DN1%hqk22\\\r$   ro   )rT  r  r  r  r   rI  r/   r   ru   rv   rn   r1   rx   r  r  )r?   r  used_backbone_levelsrC  
prev_shapefpn_outsr7   r  s   ``     @r%   rD   zBeitUperHead.forward  s   ppppR[\`\nRoRoppp(()>??@@@  #8}}+a/B77 	 	A!!a%.qrr2J&q1uo0I0I*:TM_ 1J 1 1 HQUOO
 ]\\\\EBVYZBZ<[<[\\\%%%+a/B77 	 	A-33(1+"3ABB"7jX\Xj 4  HQKK 9X1---$$X..((r$   )r   r    r!   r"   r   r>   r  r1   r   rD   rJ   rK   s   @r%   r  r  g  s         $
z $
d $
 $
 $
 $
 $
 $
L  U\ el        r$   r  c                        e Zd ZdZ	 ddedededeeeeef         f         d	d
f
 fdZde	j
        d	e	j
        fdZ xZS )BeitFCNHeada  
    Fully Convolution Networks for Semantic Segmentation. This head is implemented of
    [FCNNet](https://huggingface.co/papers/1411.4038>).

    Args:
        config (BeitConfig): Configuration.
        in_channels
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.


    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    ri   r	   r   rN   in_indexr   r  r*   Nc           
         t                                                       |j        | _        |j        | _        |j        | _        |j        | _	        || _
        |dz  |z  }g }|                    t          | j        | j        |||                     t          | j        dz
            D ]3}|                    t          | j        | j        |||                     4| j        dk    rt          j                    | _        nt          j        | | _        | j	        r-t          | j        | j        z   | j        ||dz            | _        t          j        | j        |j        d          | _        d S )Nri   )r   r  r  r   r   r  r  )r=   r>   rR   r  auxiliary_channelsr  auxiliary_num_convs	num_convsauxiliary_concat_inputconcat_inputr  r  r  rI  r   r  convs
Sequentialconv_catr   r  r  )	r?   rN   r  r   r  conv_paddingr  rC  r@   s	           r%   r>   zBeitFCNHead.__init__  s    	!-13"9 #q(H4 $-[R^iq  	
 	
 	

 t~)** 	 	ALLM4=kS_jr     
 >QDJJ.DJ 	* 4=0$-[bmqrbr  DM )DM63DRSTTTr$   r  c                     || j                  }|                     |          }| j        r+|                     t	          j        ||gd                    }|                     |          }|S )Nr   ro   )r  r  r  r  r1   rx   r  )r?   r  rA   r7   s       r%   rD   zBeitFCNHead.forward  sf    -dm<M** 	N]]59mV-D!#L#L#LMMF((r$   )ri   r	   r   )r   r    r!   r"   r   r   r   r   r>   r1   r   rD   rJ   rK   s   @r%   r  r    s          tu U  U  U,/ UBE UUZ[^`efiknfn`o[oUp U	 U  U  U  U  U  UDU\ el        r$   r  c                        e Zd Zdeddf fdZd Ze	 	 	 	 	 	 	 ddeej	                 deej	                 d	eej	                 d
ee
         dee
         de
dee
         deeef         fd            Z xZS )BeitForSemanticSegmentationrN   r*   Nc                 P   t                                          |           |j        | _        t          |d          | _        t          | j        j                  dk    rt          d          t          j
        t          j        |j        |j        dd          t          j        |j                  t          j                    t          j        |j        |j        dd                    | _        t          j
        t          j        |j        |j        dd                    | _        t          j                    | _        t          j        dd          | _        t+          |          | _        |j        rt1          |          nd | _        |                                  d S )NFr  r  zBeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.ri   r   )r=   r>   r  rs  r\  r   rN   out_indicesr   r   r  r`  rR   r  GELUfpn1fpn2r  fpn3	MaxPool2dfpn4r  decode_headuse_auxiliary_headr  auxiliary_headr{  r   s     r%   r>   z$BeitForSemanticSegmentation.__init__  sw       +f>>>	 t{&''1,,-  
 Mv163EST]^___N6-..GIIv163EST]^___	
 
	 Mv163EST]^___
 
	 KMM	LQq999	 (//5;5NXk&111TX 	r$   c                 Z   t           j                            ||j        dd          dd          }|0t           j                            ||j        dd          dd          }t	          | j        j                  } |||          }|}| |||          }	|| j        j        |	z  z  }|S )Nr   r.  Frk   )ignore_index)r   ru   rv   r/   r   rN   semantic_loss_ignore_indexauxiliary_loss_weight)
r?   r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsr  	main_lossr  auxiliary_losss
             r%   compute_lossz(BeitForSemanticSegmentation.compute_loss  s    =44bcc*5 5 
 
 ')+)B)B v|BCC'8zY^ *C * *& $1WXXXH-v66	'%X&@&IINDK5FFDr$   Fr   r   r  r   rL  r   rM  c                 \    ||n j         j        }||n j         j        }| j         j        dk    rt	          d                               |||d||          }|r|j        n|d         }	 fdt          |	          D             }
|j        d          j         j	         j         j
        z  fd|
D             }
 j         j         j         j        g}t          t!          |
                    D ]} ||         |
|                   |
|<                        |
          }d} j                             |
          }d}|                     |||          }|s)|r|f|dd         z   }n|f|d	d         z   }||f|z   n|S t)          |||r|j        nd|j        
          S )aD  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
        >>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  c                 <    g | ]\  }}|d z   j         j        v |S r,   )rN   r  )r=  idxfeaturer?   s      r%   r?  z7BeitForSemanticSegmentation.forward.<locals>.<listcomp>c  s5    wwwWTWZ[T[_c_j_vTvTvGTvTvTvr$   r   c                     g | ]>}|d d dd d d f                              ddd                              d          ?S )Nr   r   ri   rh   )rt   rs   )r=  r>  r   patch_resolutions     r%   r?  z7BeitForSemanticSegmentation.forward.<locals>.<listcomp>f  sd     
 
 
ijAaaaQQQhK1a((00RAQScdd
 
 
r$   ri   r  )rN   r  rL  r  r   r\  rA   rT  r/   rZ   rX   r  r  r  r   rI  r   r  r  r  r   rS  )r?   r   r   r  r   rL  r   rM  r   r  featuresopsrC  r  r  r  r7   r   r  s   `                @@r%   rD   z#BeitForSemanticSegmentation.forward,  sB   D &1%<kk$+B]$8$D  $+Jj 	 $+"8A"="=NOOO))/!%%=#  
 
 :E T 5 5'RS* xwww	:O0P0Pwww!'*
;1T[5KK
 
 
 
 
nv
 
 

 y$)TY	:s8}}%% 	. 	.A #a&!--HQKK!!(++*#228<<$$V-=vFFD 	F# 1 WQRR[0 WQRR[0)-)9TGf$$vE&3GQ'//T)	
 
 
 	
r$   r  )r   r    r!   r   r>   r  r   r   r1   r   r   r   r   r   rD   rJ   rK   s   @r%   r  r    s"       z d      @  &  04,0)-,0/3).&*X
 X
u|,X
 EL)X
 &	X

 $D>X
 'tnX
 #'X
 d^X
 
u--	.X
 X
 X
 ^X
 X
 X
 X
 X
r$   r  zM
    BEiT backbone, to be used with frameworks like DETR and MaskFormer.
    c                   |     e Zd Z fdZd Ze	 	 	 d
dedee         dee         dee         de	f
d	            Z
 xZS )BeitBackbonec                    t                                                     t                                                     fdt          j        dz             D             | _        t                    | _        t          | j        j	        j
                  | _        j        rt          | j        j                  dk    rt!          d          j        }t%          j        t%          j        ||dd          t%          j        |j                  t%          j                    t%          j        ||dd                    | _        t%          j        t%          j        ||dd                    | _        t%          j                    | _        t%          j        dd          | _        |                                  d S )	Nc                     g | ]	}j         
S r#   )rR   )r=  r   rN   s     r%   r?  z)BeitBackbone.__init__.<locals>.<listcomp>  s    ]]]AV/]]]r$   r   r   r  zBeitBackbone requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.ri   r   r  )r=   r>   _init_backbonerI  rG  num_featuresrM   rd   r9  rW   r   rv  add_fpnr   rN   r  r   rR   r   r  r`  r  batch_norm_epsr  r  r  r  r  r  r   r{  )r?   rN   rR   r@   s    ` r%   r>   zBeitBackbone.__init__  s      v&&&]]]]v?WZ[?[9\9\]]](00"6t7W7cddd> 	>4;*++q00 1  
 !,K";STUUU{0EFFF		";STUUU	 DI b&8k_`ij&k&k&kllDIDI1===DI 	r$   c                     | j         j        S r<   r}  rF   s    r%   r~  z!BeitBackbone.get_input_embeddings  r  r$   Nr   rL  r   rM  r*   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|j        d         }|                     |          \  }\  }}|j        dd         }	|                     |d||	|          }
|r|
j        n|
d         }d}t          | j	        |          D ]`\  }}|| j
        v rR| j         j        r@|ddddddf         }|                    ddd          }|                    |d||          }||fz  }a| j         j        ry|                     |d                   |                     |d                   |                     |d                   |                     |d	                   g}t'          |          }|s!|r|f|
dd         z   }n|f|
dd         z   }|S t)          ||r|
j        nd|
j        
          S )a:  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```Nr   ri   T)rL  r   r   rM  r   r#   rh   r	   )feature_mapsrA   rS  )rN   r  rL  r   r/   rd   rv  rA   zipstage_namesout_featuresreshape_hidden_statesrt   rs   r  r  r  r  r   r   r   rS  )r?   r   rL  r   rM  r   r  r   r   r   r   rA   r   stager  r7   s                   r%   rD   zBeitBackbone.forward  s>   @ &1%<kk$+B]$8$D  $+Jj 	 2C1N--TXT_Tq!'*
8<8U8U55<!'+
,,!%/!#  
 
 2=L--'!*#&t'7#G#G 	0 	0E<)));4 c#/122qqq#9L#/#7#71a#@#@L#/#7#7
BVa#b#bL/; 	/		,q/**		,q/**		,q/**		,q/**	L !..L 	# 7&7122;6&7122;6M%3GQ'//T)
 
 
 	
r$   )NNN)r   r    r!   r>   r~  r   r   r   r   r   rD   rJ   rK   s   @r%   r  r    s            <0 0 0  04,0&*Q
 Q
Q
 'tnQ
 $D>	Q

 d^Q
 
Q
 Q
 Q
 ^Q
 Q
 Q
 Q
 Q
r$   r  )r  r  r  rs  r[  r  )r&   F)Jr"   collections.abcr[   r   r   dataclassesr   typingr   r   r1   r   r   torch.nnr   activationsr
   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   utils.backbone_utilsr   configuration_beitr   
get_loggerr   r   r   rH   r   r8   Moduler:   rM   rV   r   r   r   r   r   r   r   r  r   r9  r[  rs  ry  r  r  r  r  r  r  r  r  r  __all__r#   r$   r%   <module>r5     sY           ! ! ! ! ! ! " " " " " " " "          % % % % % % ! ! ! ! ! ! 9 9 9 9 9 9                . - - - - - v v v v v v v v v v 7 7 7 7 7 7 7 7 7 7 1 1 1 1 1 1 * * * * * * 
	H	%	%   
    !;     U\ e T V[Vb    (% % % % %29 % % % c7 c7 c7 c7 c7RY c7 c7 c7L#7 #7 #7 #7 #7") #7 #7 #7LV V V V V	 V V VrG# G# G# G# G#- G# G# G#T    RY   & !  ) ) ) ) )BI ) ) )X    ry    
 
 
 
 
 
 
 
> > > > >* > > >BP3 P3 P3 P3 P3ry P3 P3 P3fI
 I
 I
 I
 I
") I
 I
 I
X #O #O #O #O #O/ #O #O #OL T
 T
 T
 T
 T
# T
 T
 T
n       &   [
 [
 [
 [
 [
!4 [
 [
 [
|   9
 9
 9
 9
 9
!4 9
 9
 9
x" " " " "RY " " "J    bi   "" " " " "ry " " "JR R R R R29 R R Rj8 8 8 8 8") 8 8 8v M
 M
 M
 M
 M
"5 M
 M
 M
`   
t
 t
 t
 t
 t
& t
 t
 
t
n  r$   