
"""PyTorch Data2VecVision model."""

import collections.abc
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
    compile_compatible_method_lru_cache,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from ...utils import auto_docstring, logging, torch_int
from .configuration_data2vec_vision import Data2VecVisionConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`Data2VecVisionModel`].
    """
)
class Data2VecVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    """


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # work with tensors of arbitrary rank, not just 2D ConvNets: one random value per sample
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output
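
# Illustrative sketch of the stochastic-depth behaviour implemented above (shapes and values are
# examples only, not part of the model definition): with `drop_prob=0.1` and `training=True`, each
# sample keeps its residual branch with probability 0.9 and is rescaled by 1 / 0.9 so the expected
# value of the activations is unchanged.
#
#     x = torch.ones(8, 197, 768)
#     out = drop_path(x, drop_prob=0.1, training=True)        # per sample: all zeros or all ~1.111
#     out_eval = drop_path(x, drop_prob=0.1, training=False)  # identity at inference time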


class Data2VecVisionDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"


class Data2VecVisionEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = Data2VecVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
    ) -> torch.Tensor:
        if interpolate_pos_encoding is not None:
            warnings.warn(
                "`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always "
                "interpolated to the input image size. The argument will be removed in transformers v4.51.0.",
                FutureWarning,
            )

        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)


class Data2VecVisionPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)
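
# Illustrative shape walk-through (values assume the default base configuration: image_size=224,
# patch_size=16, hidden_size=768): the embedding module turns pixel values into a 197-token
# sequence, i.e. 224/16 = 14 patches per side, 14 * 14 = 196 patch tokens plus one [CLS] token.
#
#     config = Data2VecVisionConfig()
#     embeddings = Data2VecVisionEmbeddings(config)
#     pixel_values = torch.rand(2, 3, 224, 224)
#     tokens, (patch_height, patch_width) = embeddings(pixel_values)
#     # tokens.shape == (2, 197, 768); patch_height == patch_width == 14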


class Data2VecVisionSelfAttention(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.has_relative_position_bias = bool(window_size)
        if self.has_relative_position_bias:
            self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int]] = None,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Add relative position bias if present (per-layer bias).
        if self.has_relative_position_bias:
            height, width = resolution
            window_size = (height // self.config.patch_size, width // self.config.patch_size)
            attention_scores = attention_scores + self.relative_position_bias(
                window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1]
            )

        # Add shared relative position bias if provided (bias shared across layers).
        if relative_position_bias is not None:
            attention_scores = attention_scores + relative_position_bias

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class Data2VecVisionSdpaSelfAttention(Data2VecVisionSelfAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int]] = None,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        if output_attentions or head_mask is not None:
            logger.warning_once(
                "`Data2VecVisionSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` "
                "does not support `output_attentions=True` or `head_mask`. Falling back to the manual attention "
                "implementation, but specifying the manual implementation will be required from Transformers "
                "version v5.0.0 onwards. This warning can be removed using the argument "
                '`attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                head_mask=head_mask,
                output_attentions=output_attentions,
                relative_position_bias=relative_position_bias,
                interpolate_pos_encoding=interpolate_pos_encoding,
                resolution=resolution,
            )

        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        attn_bias = None
        if self.has_relative_position_bias:
            height, width = resolution
            window_size = (height // self.config.patch_size, width // self.config.patch_size)
            attn_bias = self.relative_position_bias(
                window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1]
            )

        if relative_position_bias is not None:
            if attn_bias is None:
                attn_bias = relative_position_bias
            else:
                attn_bias += relative_position_bias

        scaling = 1 / math.sqrt(self.attention_head_size)
        context_layer = torch.nn.functional.scaled_dot_product_attention(
            query_layer,
            key_layer,
            value_layer,
            attn_mask=attn_bias,
            dropout_p=self.config.attention_probs_dropout_prob if self.training else 0.0,
            is_causal=False,
            scale=scaling,
        )
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer, None


class Data2VecVisionSelfOutput(nn.Module):
    """
    The residual connection is defined in Data2VecVisionLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


DATA2VEC_VISION_SELF_ATTENTION_CLASSES = {
    "eager": Data2VecVisionSelfAttention,
    "sdpa": Data2VecVisionSdpaSelfAttention,
}


class Data2VecVisionAttention(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.attention = DATA2VEC_VISION_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, window_size=window_size
        )
        self.output = Data2VecVisionSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int]] = None,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        self_outputs = self.attention(
            hidden_states, head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding, resolution
        )

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class Data2VecVisionIntermediate(nn.Module):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class Data2VecVisionOutput(nn.Module):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class Data2VecVisionLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(
        self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0
    ) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = Data2VecVisionAttention(config, window_size=window_size)
        self.intermediate = Data2VecVisionIntermediate(config)
        self.output = Data2VecVisionOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.drop_path = Data2VecVisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        if init_values > 0:
            self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
            self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        else:
            self.lambda_1, self.lambda_2 = None, None

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int, int]] = None,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in Data2VecVision, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
            interpolate_pos_encoding=interpolate_pos_encoding,
            resolution=resolution,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # apply lambda_1 if present
        if self.lambda_1 is not None:
            attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = self.drop_path(attention_output) + hidden_states

        # in Data2VecVision, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.intermediate(layer_output)
        layer_output = self.output(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = self.drop_path(layer_output) + hidden_states

        outputs = (layer_output,) + outputs

        return outputs


class Data2VecVisionRelativePositionBias(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: tuple) -> None:
        super().__init__()
        self.window_size = window_size
        # +3 for cls-to-token, token-to-cls and cls-to-cls distances
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, config.num_attention_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH

    @compile_compatible_method_lru_cache(maxsize=10)
    def generate_relative_position_index(self, window_size: tuple[int, int]) -> torch.Tensor:
        """
        This method creates the relative position index, modified to support arbitrary window sizes,
        as introduced in [MiDaS v3.1](https://huggingface.co/papers/2307.14460).
        """
        num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        # get pair-wise relative position index for each token inside the window
        window_area = window_size[0] * window_size[1]
        grid = torch.meshgrid(torch.arange(window_size[0]), torch.arange(window_size[1]), indexing="ij")
        coords = torch.stack(grid)  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = num_relative_distance - 3
        relative_position_index[0:, 0] = num_relative_distance - 2
        relative_position_index[0, 0] = num_relative_distance - 1
        return relative_position_index

    def forward(
        self, window_size, interpolate_pos_encoding: bool = False, dim_size: Optional[int] = None
    ) -> torch.Tensor:
        """
        Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
        """
        old_height = 2 * self.window_size[0] - 1
        old_width = 2 * self.window_size[1] - 1

        new_height = 2 * window_size[0] - 1
        new_width = 2 * window_size[1] - 1

        old_relative_position_bias_table = self.relative_position_bias_table

        old_num_relative_distance = self.num_relative_distance
        new_num_relative_distance = new_height * new_width + 3

        old_sub_table = old_relative_position_bias_table[: old_num_relative_distance - 3]

        old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2)
        new_sub_table = nn.functional.interpolate(
            old_sub_table, size=(torch_int(new_height), torch_int(new_width)), mode="bilinear"
        )
        new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1)

        new_relative_position_bias_table = torch.cat(
            [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3 :]]
        )

        relative_position_index = self.generate_relative_position_index(window_size)
        relative_position_bias = new_relative_position_bias_table[relative_position_index.view(-1)]
        relative_position_bias = relative_position_bias.view(
            window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1
        )  # Wh*Ww, Wh*Ww, nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww

        if interpolate_pos_encoding:
            relative_position_bias = nn.functional.interpolate(
                relative_position_bias.unsqueeze(1),
                size=(dim_size, dim_size),
                mode="bilinear",
                align_corners=False,
            ).squeeze(1)

        return relative_position_bias.unsqueeze(0)


class Data2VecVisionEncoder(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.config = config
        self.has_relative_position_bias = config.use_shared_relative_position_bias
        if self.has_relative_position_bias:
            self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")]
        self.layer = nn.ModuleList(
            [
                Data2VecVisionLayer(
                    config,
                    window_size=window_size if config.use_relative_position_bias else None,
                    drop_path_rate=dpr[i],
                )
                for i in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int, int]] = None,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.has_relative_position_bias:
                height, width = resolution
                window_size = (height // self.config.patch_size, width // self.config.patch_size)
                relative_position_bias = self.relative_position_bias(
                    window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1]
                )
            else:
                relative_position_bias = None

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                head_mask=layer_head_mask,
                output_attentions=output_attentions,
                relative_position_bias=relative_position_bias,
                interpolate_pos_encoding=interpolate_pos_encoding,
                resolution=resolution,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class Data2VecVisionPreTrainedModel(PreTrainedModel):
    config: Data2VecVisionConfig
    base_model_prefix = "data2vec_vision"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Data2VecVisionLayer"]
    _keys_to_ignore_on_load_unexpected = [r".*relative_position_index.*"]
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, Data2VecVisionEmbeddings):
            module.cls_token.data.zero_()
            if module.mask_token is not None:
                module.mask_token.data.zero_()
            if module.position_embeddings is not None:
                module.position_embeddings.data.zero_()
        elif isinstance(module, Data2VecVisionRelativePositionBias):
            module.relative_position_bias_table.data.zero_()
        elif isinstance(module, Data2VecVisionLayer):
            if module.lambda_1 is not None:
                module.lambda_1.data.fill_(self.config.layer_scale_init_value)
                module.lambda_2.data.fill_(self.config.layer_scale_init_value)


@auto_docstring
class Data2VecVisionModel(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = False) -> None:
        r"""
        add_pooling_layer (bool, *optional*, defaults to `False`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = Data2VecVisionEmbeddings(config)
        self.encoder = Data2VecVisionEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )
        self.pooler = Data2VecVisionPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Data2VecVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
        resolution = pixel_values.shape[2:]

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            resolution=resolution,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return Data2VecVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Data2VecVisionPooler(nn.Module):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.layernorm = (
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if self.layernorm is not None:
            # Mean pool the final hidden states of the patch tokens
            patch_tokens = hidden_states[:, 1:, :]
            pooled_output = self.layernorm(patch_tokens.mean(1))
        else:
            # Pool by simply taking the final hidden state of the [CLS] token
            pooled_output = hidden_states[:, 0]

        return pooled_output
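
# Example usage of the bare model for feature extraction (illustrative sketch only; the checkpoint
# name is assumed to be one of the published Data2Vec-Vision checkpoints on the Hub and can be
# swapped for any compatible one):
#
#     from transformers import AutoImageProcessor, Data2VecVisionModel
#     from PIL import Image
#
#     processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
#     model = Data2VecVisionModel.from_pretrained("facebook/data2vec-vision-base")
#     inputs = processor(images=Image.open("example.jpg"), return_tensors="pt")
#     outputs = model(**inputs)
#     # outputs.last_hidden_state has shape (1, 197, 768) for a 224x224 input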


@auto_docstring(
    custom_intro="""
    Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
    the final hidden states of the patch tokens) e.g. for ImageNet.
    """
)
class Data2VecVisionForImageClassification(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=True)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.data2vec_vision(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
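
# Example usage of the classification head (illustrative sketch only; "facebook/data2vec-vision-base-ft1k"
# is assumed to be an ImageNet-1k fine-tuned checkpoint and `image` a PIL image loaded beforehand):
#
#     from transformers import AutoImageProcessor, Data2VecVisionForImageClassification
#
#     processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base-ft1k")
#     model = Data2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base-ft1k")
#     inputs = processor(images=image, return_tensors="pt")
#     logits = model(**inputs).logits
#     predicted_class = model.config.id2label[logits.argmax(-1).item()]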


class Data2VecVisionConvModule(nn.Module):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, tuple[int, int]],
        padding: Union[int, tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias,
            dilation=dilation,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = self.conv(input)
        output = self.bn(output)
        output = self.activation(output)

        return output


class Data2VecVisionPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            Data2VecVisionConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state


class Data2VecVisionPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, pool_scales: tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        self.blocks = []
        for i, pool_scale in enumerate(pool_scales):
            block = Data2VecVisionPyramidPoolingBlock(
                pool_scale=pool_scale, in_channels=in_channels, channels=channels
            )
            self.blocks.append(block)
            self.add_module(str(i), block)

    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
        ppm_outs = []
        for ppm in self.blocks:
            ppm_out = ppm(x)
            upsampled_ppm_out = nn.functional.interpolate(
                ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
            )
            ppm_outs.append(upsampled_ppm_out)
        return ppm_outs
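
# Illustrative sketch of the pyramid pooling output shapes (assumptions: the default pool scales
# (1, 2, 3, 6) and a 768-channel backbone feature map; `.eval()` is needed because BatchNorm cannot
# compute batch statistics on 1x1 pooled maps in training mode):
#
#     ppm = Data2VecVisionPyramidPoolingModule((1, 2, 3, 6), in_channels=768, channels=768, align_corners=False).eval()
#     feature_map = torch.rand(1, 768, 14, 14)
#     outs = ppm(feature_map)
#     # `outs` is a list of 4 tensors, each (1, 768, 14, 14): every scale is pooled to 1x1, 2x2, 3x3
#     # or 6x6, projected by a 1x1 conv, then upsampled back to the 14x14 input resolution.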


class Data2VecVisionUperHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://huggingface.co/papers/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()

        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = [config.hidden_size] * 4  # e.g. [768, 768, 768, 768]
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = Data2VecVisionPyramidPoolingModule(
            self.pool_scales,
            self.in_channels[-1],
            self.channels,
            align_corners=self.align_corners,
        )
        self.bottleneck = Data2VecVisionConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = Data2VecVisionConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = Data2VecVisionConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = Data2VecVisionConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]

        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output
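
# Illustrative sketch of the UPerNet head contract (assumptions: hidden_size=768 and a 512x512 input
# whose backbone features have been re-scaled by the FPN modules defined further below; `config` is a
# `Data2VecVisionConfig` instance): the head consumes four feature maps at strides 4, 8, 16 and 32
# and returns logits at 1/4 of the input resolution.
#
#     head = Data2VecVisionUperHead(config).eval()
#     features = [
#         torch.rand(1, 768, 128, 128),  # stride 4
#         torch.rand(1, 768, 64, 64),    # stride 8
#         torch.rand(1, 768, 32, 32),    # stride 16
#         torch.rand(1, 768, 16, 16),    # stride 32
#     ]
#     logits = head(features)  # shape (1, config.num_labels, 128, 128)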


class Data2VecVisionFCNHead(nn.Module):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
    [FCNNet](https://huggingface.co/papers/1411.4038).

    Args:
        config (Data2VecVisionConfig): Configuration.
        in_channels
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.


    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        config: Data2VecVisionConfig,
        in_index: int = 2,
        kernel_size: int = 3,
        dilation: Union[int, tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.in_channels = config.hidden_size
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation
        convs = []
        convs.append(
            Data2VecVisionConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        )
        for i in range(self.num_convs - 1):
            convs.append(
                Data2VecVisionConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            self.conv_cat = Data2VecVisionConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output


@auto_docstring
class Data2VecVisionForSemanticSegmentation(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=False)

        if len(self.config.out_indices) != 4:
            raise ValueError(
                "Data2VecVisionForSemanticSegmentation requires config.out_indices to be a list of 4 integers, "
                "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
                "a base-sized architecture."
            )

        # FPNs: rescale the stride-16 backbone features to strides 4, 8, 16 and 32
        self.fpn1 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
            nn.BatchNorm2d(config.hidden_size),
            nn.GELU(),
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn2 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn3 = nn.Identity()
        self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Semantic segmentation head(s)
        self.decode_head = Data2VecVisionUperHead(config)
        self.auxiliary_head = Data2VecVisionFCNHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    def compute_loss(self, logits, auxiliary_logits, labels):
        # upsample logits to the images' original size
        upsampled_logits = nn.functional.interpolate(
            logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
        )
        if auxiliary_logits is not None:
            upsampled_auxiliary_logits = nn.functional.interpolate(
                auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
            )
        # compute weighted loss
        loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
        main_loss = loss_fct(upsampled_logits, labels)
        loss = main_loss
        if auxiliary_logits is not None:
            auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
            loss += self.config.auxiliary_loss_weight * auxiliary_loss

        return loss

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Data2VecVisionForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
        >>> model = Data2VecVisionForSemanticSegmentation.from_pretrained("facebook/data2vec-vision-base")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        if labels is not None and self.config.num_labels == 1:
            raise ValueError("The number of labels should be greater than one")

        outputs = self.data2vec_vision(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=True,  # we need the intermediate hidden states
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )
        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # only keep the hidden states of the requested layers (+1 accounts for the embedding output)
        features = [feature for idx, feature in enumerate(encoder_hidden_states) if idx + 1 in self.config.out_indices]
        batch_size = pixel_values.shape[0]
        patch_resolution = self.config.image_size // self.config.patch_size
        # drop the [CLS] token and reshape to (batch_size, hidden_size, patch_resolution, patch_resolution)
        features = [
            x[:, 1:, :].permute(0, 2, 1).reshape(batch_size, -1, patch_resolution, patch_resolution) for x in features
        ]

        # apply the FPN necks to bring the feature maps to different scales
        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for i in range(len(features)):
            features[i] = ops[i](features[i])

        logits = self.decode_head(features)
        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(features)

        loss = None
        if labels is not None:
            loss = self.compute_loss(logits, auxiliary_logits, labels)

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


__all__ = [
    "Data2VecVisionForImageClassification",
    "Data2VecVisionForSemanticSegmentation",
    "Data2VecVisionModel",
    "Data2VecVisionPreTrainedModel",
]
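# Illustrative sketch (not part of the original module): post-processing the semantic
# segmentation logits into a per-pixel label map. It mirrors the checkpoint and image used
# in the forward docstring; note that "facebook/data2vec-vision-base" is a plain backbone,
# so its segmentation head is randomly initialized and a fine-tuned checkpoint would
# normally be used here instead.
def _segmentation_map_demo() -> None:
    import requests
    from PIL import Image
    from transformers import AutoImageProcessor

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
    model = Data2VecVisionForSemanticSegmentation.from_pretrained("facebook/data2vec-vision-base")

    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits  # (batch, num_labels, ~height / 4, ~width / 4)

    # upsample to the original image size and take the most likely class per pixel
    upsampled = nn.functional.interpolate(logits, size=image.size[::-1], mode="bilinear", align_corners=False)
    segmentation_map = upsampled.argmax(dim=1)[0]  # (height, width), dtype torch.int64
    print(segmentation_map.shape)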