
     `ip                        d Z ddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZmZmZmZ ddl m!Z!m"Z" ddl#m$Z$  ej%        e&          Z' G d de	j(                  Z) G d de	j(                  Z*	 d9de	j(        dej+        dej+        dej+        deej+                 de,de,fdZ- G d de	j(                  Z. G d d e	j(                  Z/ G d! d"e	j(                  Z0 G d# d$e	j(                  Z1 G d% d&e	j(                  Z2 G d' d(e          Z3 G d) d*e	j(                  Z4e G d+ d,e                      Z5e G d- d.e5                      Z6 G d/ d0e	j(                  Z7 ed12           G d3 d4e5                      Z8 ed52           G d6 d7e5                      Z9g d8Z:dS ):zPyTorch ViT model.    N)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstringlogging	torch_int)can_return_tuplecheck_model_inputs   )	ViTConfigc            	            e Zd ZdZddedef fdZdej        de	de	d	ej        fd
Z
	 	 ddej        deej                 ded	ej        fdZ xZS )ViTEmbeddingszb
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    Fconfiguse_mask_tokenc                 $   t                                                       t          j        t	          j        dd|j                            | _        |r-t          j        t	          j        dd|j                            nd | _	        t          |          | _        | j        j        }t          j        t	          j        d|dz   |j                            | _        t          j        |j                  | _        |j        | _        || _        d S )Nr   )super__init__r   	Parametertorchrandnhidden_size	cls_tokenzeros
mask_tokenViTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_sizer   )selfr   r   r+   	__class__s       x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/vit/modeling_vit.pyr!   zViTEmbeddings.__init__0   s    ek!Q8J&K&KLLQ_i",u{1a9K'L'LMMMei 26 : :+7#%<A{QPVPb0c0c#d#d z&"<== +    
embeddingsheightwidthreturnc                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        z  }	|| j        z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   N      ?r   r      bicubicF)sizemodealign_cornersdim)shaper,   r#   jit
is_tracingr0   r   reshapepermuter   
functionalinterpolateviewcat)r1   r5   r6   r7   r+   num_positionsclass_pos_embedpatch_pos_embedrB   
new_height	new_widthsqrt_num_positionss               r3   interpolate_pos_encodingz&ViTEmbeddings.interpolate_pos_encoding<   sr    !&q)A-06q9A= y##%% 	,+*F*F6UZ??++2111bqb592111abb59r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr4   Npixel_valuesbool_masked_posrR   c                    |j         \  }}}}|                     ||          }|_|j         d         }	| j                            ||	d          }
|                    d                              |
          }|d|z
  z  |
|z  z   }| j                            |dd          }t          j        ||fd          }|r|| 	                    |||          z   }n
|| j
        z   }|                     |          }|S )N)rR   r   r:         ?rA   )rC   r*   r(   expand	unsqueezetype_asr&   r#   rK   rR   r,   r/   )r1   rS   rT   rR   
batch_sizenum_channelsr6   r7   r5   
seq_lengthmask_tokensmask
cls_tokenss                r3   forwardzViTEmbeddings.forwardd   s    3?2D/
L&%**<Rj*kk
&#)!,J/00ZLLK",,R0088EED#sTz2[45GGJ ^**:r2>>
Y
J7Q???
 $ 	?#d&C&CJPVX]&^&^^JJ#d&>>J\\*--
r4   F)NF)__name__
__module____qualname____doc__r   boolr!   r#   TensorintrR   r   
BoolTensorr`   __classcell__r2   s   @r3   r   r   +   s         
 
y 
$ 
 
 
 
 
 
&D5< &D &DUX &D]b]i &D &D &D &DV 7;).	 l "%"23 #'	
 
       r4   r   c                   R     e Zd ZdZdef fdZd	dej        dedej        fdZ	 xZ
S )
r)   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    r   c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _        || _
        t          j        ||||          | _        d S )Nr   r   )kernel_sizestride)r    r!   
image_sizer0   r[   r%   
isinstancecollectionsabcIterabler+   r   Conv2d
projection)r1   r   rp   r0   r[   r%   r+   r2   s          r3   r!   zViTPatchEmbeddings.__init__   s    !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&)L+:^hiiir4   FrS   rR   r8   c                    |j         \  }}}}|| j        k    rt          d| j         d| d          |sT|| j        d         k    s|| j        d         k    r2t          d| d| d| j        d          d| j        d          d		          |                     |                              d
                              dd
          }|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).r<   )rC   r[   
ValueErrorrp   rv   flatten	transpose)r1   rS   rR   rZ   r[   r6   r7   r5   s           r3   r`   zViTPatchEmbeddings.forward   s*   2>2D/
L&%4,,,I!.I I9EI I I   ( 	+++u8J/J/J E E E% E E+E E.2oa.@E E E   __\22::1==GG1MM
r4   ra   )rb   rc   rd   re   r   r!   r#   rg   rf   r`   rj   rk   s   @r3   r)   r)      s         jy j j j j j j EL D ]b]i        r4   r)           modulequerykeyvalueattention_maskscalingr/   c                    t          j        ||                    dd                    |z  }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }|||z  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nr:   )rB   dtype)ptrainingr   r<   )r#   matmulr|   r   rH   softmaxfloat32tor   r/   r   
contiguous)
r~   r   r   r   r   r   r/   kwargsattn_weightsattn_outputs
             r3   eager_attention_forwardr      s     <s}}R'<'<==GL =((2U](SSVVW\WbccL =((6?([[L !#n4,|U33K''1--88::K$$r4   c            	            e Zd Zdef fdZ	 ddej        deej                 deej        ej        f         fdZ	 xZ
S )	ViTSelfAttentionr   c                    t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          || _        |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        |j
        | _        | j        dz  | _        d| _        t          j        |j        | j	        |j                  | _        t          j        |j        | j	        |j                  | _        t          j        |j        | j	        |j                  | _        d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads rx   g      F)bias)r    r!   r%   num_attention_headshasattrrz   r   rh   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   r1   r   r2   s     r3   r!   zViTSelfAttention.__init__   sB    ::a??PVXhHiHi?76#5 7 737 7 7  
 #)#= #&v'9F<V'V#W#W !58PP"?/5Yv143EFO\\\
9V/1C&/ZZZYv143EFO\\\


r4   Nhidden_states	head_maskr8   c           
         |j         d         }|d| j        | j        f} |                     |          j        |                     dd          } |                     |          j        |                     dd          } |                     |          j        |                     dd          }t          }| j	        j
        dk    rt          | j	        j
                 } || ||||| j        | j        | j        sdn| j                  \  }	}
|	                                d d         | j        fz   }|	                    |          }	|	|
fS )	Nr   r:   r   r<   eagerr}   )r   r   r/   r   )rC   r   r   r   rJ   r|   r   r   r   r   _attn_implementationr   r   r   r   r   r>   r   rF   )r1   r   r   rZ   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes               r3   r`   zViTSelfAttention.forward   sY    #(+
D$<d>VV	0DHH]++0)<FFq!LL	4djj//4i@JJ1aPP4djj//4i@JJ1aPP(?;+w66"9$+:Z"[)<)<nL#}CCC$2C	*
 	*
 	*
& #0"4"4"6"6ss";t?Q>S"S%--.EFFo--r4   N)rb   rc   rd   r   r!   r#   rg   r   tupler`   rj   rk   s   @r3   r   r      s        ]y ] ] ] ] ] ]* PT. ."\.6>u|6L.	u|U\)	*. . . . . . . .r4   r   c                   Z     e Zd ZdZdef fdZdej        dej        dej        fdZ xZ	S )ViTSelfOutputz
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S r   )	r    r!   r   r   r%   denser-   r.   r/   r   s     r3   r!   zViTSelfOutput.__init__  sJ    Yv163EFF
z&"<==r4   r   input_tensorr8   c                 Z    |                      |          }|                     |          }|S r   r   r/   r1   r   r   s      r3   r`   zViTSelfOutput.forward  s*    

=11]33r4   )
rb   rc   rd   re   r   r!   r#   rg   r`   rj   rk   s   @r3   r   r      s         
>y > > > > > >
U\  RWR^        r4   r   c                   |     e Zd Zdef fdZdee         fdZd
dej	        de
ej	                 dej	        fd	Z xZS )ViTAttentionr   c                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r   )r    r!   r   	attentionr   outputsetpruned_headsr   s     r3   r!   zViTAttention.__init__  sI    )&11#F++EEr4   headsc                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   rA   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)r1   r   indexs      r3   prune_headszViTAttention.prune_heads  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r4   Nr   r   r8   c                 d    |                      ||          \  }}|                     ||          }|S r   )r   r   )r1   r   r   self_attn_output_r   s         r3   r`   zViTAttention.forward&  s4    "nn]IFF!-}==r4   r   )rb   rc   rd   r   r!   r   rh   r   r#   rg   r   r`   rj   rk   s   @r3   r   r     s        "y " " " " " ";S ; ; ; ;$ U\ hu|>T `e`l        r4   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )ViTIntermediater   c                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r    r!   r   r   r%   intermediate_sizer   rq   
hidden_actstrr   intermediate_act_fnr   s     r3   r!   zViTIntermediate.__init__-  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r4   r   r8   c                 Z    |                      |          }|                     |          }|S r   )r   r   )r1   r   s     r3   r`   zViTIntermediate.forward5  s,    

=1100??r4   	rb   rc   rd   r   r!   r#   rg   r`   rj   rk   s   @r3   r   r   ,  sj        9y 9 9 9 9 9 9U\ el        r4   r   c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZS )	ViTOutputr   c                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S r   )
r    r!   r   r   r   r%   r   r-   r.   r/   r   s     r3   r!   zViTOutput.__init__<  sJ    Yv79KLL
z&"<==r4   r   r   r8   c                 d    |                      |          }|                     |          }||z   }|S r   r   r   s      r3   r`   zViTOutput.forwardA  s4    

=11]33%4r4   r   rk   s   @r3   r   r   ;  su        >y > > > > > >
U\  RWR^        r4   r   c                   h     e Zd ZdZdef fdZd	dej        deej                 dej        fdZ	 xZ
S )
ViTLayerz?This corresponds to the Block class in the timm implementation.r   c                 z   t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |          | _	        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )Nr   eps)r    r!   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr%   layer_norm_epslayernorm_beforelayernorm_afterr   s     r3   r!   zViTLayer.__init__K  s    '-'E$%f--+F33'' "V-?VEZ [ [ [!|F,>FDYZZZr4   Nr   r   r8   c                     |                      |          }|                     ||          }||z   }|                     |          }|                     |          }|                     ||          }|S r   )r   r   r   r   r   )r1   r   r   hidden_states_normattention_outputlayer_outputs         r3   r`   zViTLayer.forwardU  sz    !22=AA>>*<iHH )=8 ++M::((66 {{<??r4   r   )rb   rc   rd   re   r   r!   r#   rg   r   r`   rj   rk   s   @r3   r   r   H  s        II[y [ [ [ [ [ [ U\ hu|>T `e`l        r4   r   c                   Z     e Zd Zdef fdZddej        deej                 defdZ	 xZ
S )	
ViTEncoderr   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S  )r   ).0r   r   s     r3   
<listcomp>z'ViTEncoder.__init__.<locals>.<listcomp>j  s!    #^#^#^HV$4$4#^#^#^r4   F)	r    r!   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   s    `r3   r!   zViTEncoder.__init__g  s`    ]#^#^#^#^eFD\>]>]#^#^#^__
&+###r4   Nr   r   r8   c                     t          | j                  D ]\  }}|||         nd } |||          }t          |          S )N)last_hidden_state)	enumerater   r
   )r1   r   r   ilayer_modulelayer_head_masks         r3   r`   zViTEncoder.forwardm  sW    (44 	I 	IOA|.7.CillO(LHHMM????r4   r   )rb   rc   rd   r   r!   r#   rg   r   r
   r`   rj   rk   s   @r3   r   r   f  s        ,y , , , , , ,@ @U\ @hu|>T @`o @ @ @ @ @ @ @ @r4   r   c                       e Zd ZU eed<   dZdZdZddgZdZ	dZ
dZdZeedZdeej        ej        ej        f         fd	Zd
S )ViTPreTrainedModelr   vitrS   Tr   r   )r   
attentionsr~   c                    t          |t          j        t          j        f          rt          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        |j         |j        j                                         dS dS t          |t          j                  r?|j        j                                         |j        j                            d           dS t          |t$                    rt          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        t          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        |j        "|j        j                                         dS dS dS )zInitialize the weightsr}   )meanstdNrV   )rq   r   r   ru   inittrunc_normal_weightdatar   r#   r   r   initializer_ranger   r   zero_r   fill_r   r,   r&   r(   )r1   r~   s     r3   _init_weightsz ViTPreTrainedModel._init_weights  s   fry")455 	/ "$!6!6"%%em443DKDa "7 " "b$%% M {& &&((((( '&-- 	/K""$$$M$$S))))).. 	/.0g.C.C*/225=AAK1 /D / / b+122	 &+ %'G$9$9 %((77K1 %: % % b!'((	 !  ,!&,,.....	/ 	/ -,r4   N)rb   rc   rd   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr   r   r   ru   r   r  r   r4   r3   r   r   u  s         $O&*#(*5N"&!& 
/E")RY*L$M / / / / / /r4   r   c                       e Zd Zddededef fdZdefdZd	ee	e
e	         f         fd
Z ed          e	 	 	 	 ddeej                 deej                 deej                 dee         dee         defd                        Z xZS )ViTModelTFr   add_pooling_layerr   c                 N   t                                          |           || _        t          ||          | _        t          |          | _        t          j        |j	        |j
                  | _        |rt          |          nd| _        |                                  dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   r   N)r    r!   r   r   r5   r   encoderr   r   r%   r   	layernorm	ViTPoolerpooler	post_init)r1   r   r  r   r2   s       r3   r!   zViTModel.__init__  s     	   '~NNN!&))f&8f>STTT+<Fi'''$ 	r4   r8   c                     | j         j        S r   )r5   r*   )r1   s    r3   get_input_embeddingszViTModel.get_input_embeddings  s    //r4   heads_to_prunec                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )r1   r  r   r   s       r3   _prune_headszViTModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr4   )tie_last_hidden_statesNrS   rT   r   rR   r   c                    |t          d          |                     || j        j                  }| j        j        j        j        j        }|j        |k    r|	                    |          }|                     |||          }| 
                    ||          }|j        }	|                     |	          }	| j        |                     |	          nd}
t          |	|
          S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rT   rR   )r   )r   pooler_output)rz   get_head_maskr   r   r5   r*   rv   r   r   r   r  r   r  r  r   )r1   rS   rT   r   rR   r   expected_dtypeembedding_outputencoder_outputssequence_outputpooled_outputs              r3   r`   zViTModel.forward  s     ?@@@ &&y$+2OPP	 9DKQ//'??>::L??/Tl + 
 
 ,0<<8HT]<+^+^);..998<8OO444UY)O[hiiiir4   )TFNNNN)rb   rc   rd   r   rf   r!   r)   r  dictrh   listr  r   r   r   r#   rg   ri   r   r   r   r`   rj   rk   s   @r3   r  r    sV        y T Z^      &0&8 0 0 0 0C4T#Y+? C C C C u555 046:,037&j &ju|,&j "%"23&j EL)	&j
 #+4.&j +,&j 
$&j &j &j ^ 65&j &j &j &j &jr4   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )r  r   c                     t                                                       t          j        |j        |j                  | _        t          |j                 | _	        d S r   )
r    r!   r   r   r%   pooler_output_sizer   r   
pooler_act
activationr   s     r3   r!   zViTPooler.__init__  sE    Yv163LMM
 !23r4   r   r8   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   r/  )r1   r   first_token_tensorr'  s       r3   r`   zViTPooler.forward  s@     +111a40

#56666r4   r   rk   s   @r3   r  r    sj        4y 4 4 4 4 4 4
U\ el        r4   r  ac  
    ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    )custom_introc                        e Zd Zdef fdZee	 	 	 	 ddeej	                 deej
                 deej	                 dee         dee         d	efd
                        Z xZS )ViTForMaskedImageModelingr   c                 V   t                                          |           t          |dd          | _        t	          j        t	          j        |j        |j        dz  |j	        z  d          t	          j
        |j                            | _        |                                  d S )NFT)r  r   r<   r   )in_channelsout_channelsrn   )r    r!   r  r   r   
Sequentialru   r%   encoder_strider[   PixelShuffledecoderr  r   s     r3   r!   z"ViTForMaskedImageModeling.__init__
  s       FeDQQQ}I".#2A58KK  
 OF122
 
 	r4   NrS   rT   r   rR   r   r8   c                    |D| j         j        | j         j        k    r*t          d| j         j         d| j         j         d           | j        |f|||d|}|j        }|ddddf         }|j        \  }}	}
t          j        |	dz            x}}|	                    dd	d          
                    ||
||          }|                     |          }d}|| j         j        | j         j        z  }|
                    d
||          }|                    | j         j        d                              | j         j        d	                              d                                          }t           j                            ||d          }||z                                  |                                dz   z  | j         j        z  }t+          |||j        |j                  S )a+  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```NzWhen `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that the reconstructed image has the same dimensions as the input. Got `patch_size` = z and `encoder_stride` = rx   )rT   r   rR   r   r;   r   r<   r:   none)	reductiongh㈵>)lossreconstructionr   r   )r   r0   r9  rz   r   r   rC   mathfloorrG   rF   r;  rp   repeat_interleaverX   r   r   rH   l1_losssumr[   r   r   r   )r1   rS   rT   r   rR   r   outputsr&  rZ   sequence_lengthr[   r6   r7   reconstructed_pixel_valuesmasked_im_lossr>   r^   reconstruction_losss                     r3   r`   z!ViTForMaskedImageModeling.forward  s!   L &DK,BdkF`,`,`t&*k&<t tVZVaVpt t t   /7dh/
+%=	/
 /

 /
 /
 "3 *!!!QRR%04C4I1
O\OS$8999)11!Q::BB:|]cejkk &*\\/%B%B"&;)T[-CCD-55b$EEO11$+2H!LL""4;#91==1	  #%-"7"7F`lr"7"s"s1D8==??488::PTCTUX\XcXppN(5!/)	
 
 
 	
r4   r(  )rb   rc   rd   r   r!   r   r   r   r#   rg   ri   rf   r   r   r   r`   rj   rk   s   @r3   r4  r4    s        y      "  046:,037P
 P
u|,P
 "%"23P
 EL)	P

 #+4.P
 +,P
 
#P
 P
 P
 ^ P
 P
 P
 P
 P
r4   r4  a  
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                        e Zd Zdef fdZee	 	 	 	 ddeej	                 deej	                 deej	                 dee
         dee         d	efd
                        Z xZS )ViTForImageClassificationr   c                 :   t                                          |           |j        | _        t          |d          | _        |j        dk    rt          j        |j        |j                  nt          j                    | _	        | 
                                 d S )NF)r  r   )r    r!   
num_labelsr  r   r   r   r%   Identity
classifierr  r   s     r3   r!   z"ViTForImageClassification.__init__  s        +Fe<<< OUN_bcNcNc")F$68IJJJikitiviv 	r4   NrS   r   labelsrR   r   r8   c                      | j         |f||d|}|j        }|dddddf         }|                     |          }	d}
| | j        ||	| j        fi |}
t          |
|	|j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   rR   Nr   )r?  logitsr   r   )r   r   rP  loss_functionr   r   r   r   )r1   rS   r   rQ  rR   r   rF  r&  r'  rS  r?  s              r3   r`   z!ViTForImageClassification.forward  s    " /7dh/
%=/
 /
 	/
 /
 "3'1aaa0//%4%ffdkLLVLLD$!/)	
 
 
 	
r4   r(  )rb   rc   rd   r   r!   r   r   r   r#   rg   rf   r   r   r   r`   rj   rk   s   @r3   rL  rL  p  s        
y 
 
 
 
 
 
  04,0)-37#
 #
u|,#
 EL)#
 &	#

 #+4.#
 +,#
 
#
 #
 #
 ^ #
 #
 #
 #
 #
r4   rL  )rL  r4  r  r   )r}   );re   collections.abcrr   rA  typingr   r   r   r#   r   activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.genericr   r   configuration_vitr   
get_loggerrb   loggerModuler   r)   rg   floatr   r   r   r   r   r   r   r   r   r  r  r4  rL  __all__r   r4   r3   <module>re     s          , , , , , , , , , ,        ! ! ! ! ! ! 9 9 9 9 9 9            G F F F F F F F & & & & & & Q Q Q Q Q Q Q Q K K K K K K K K K K K K A A A A A A A A ( ( ( ( ( ( 
	H	%	%U U U U UBI U U Up$ $ $ $ $ $ $ $\ % %I%<% 
% <	%
 U\*% % % % % %<1. 1. 1. 1. 1.ry 1. 1. 1.h    BI   "    29   >    bi   
 
 
 
 
	 
 
 
    )   <@ @ @ @ @ @ @ @ */ */ */ */ */ */ */ */Z Gj Gj Gj Gj Gj! Gj Gj GjT    	    	  d
 d
 d
 d
 d
 2 d
 d
 d
N   2
 2
 2
 2
 2
 2 2
 2
 2
j g
f
fr4   