
     `i                     B   d Z ddlZddlmZ ddlmZmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&  e j'        e(          Z) G d de
j*                  Z+ G d de
j*                  Z,	 d@de
j*        de	j-        de	j-        de	j-        dee	j-                 de.de.fdZ/ G d de
j*                  Z0 G d  d!e
j*                  Z1 G d" d#e
j*                  Z2 G d$ d%e
j*                  Z3 G d& d'e
j*                  Z4 G d( d)e          Z5 G d* d+e
j*                  Z6e G d, d-e                      Z7e G d. d/e7                      Z8 G d0 d1e
j*                  Z9 ed23           G d4 d5e7                      Z: ed63           G d7 d8e7                      Z;e ed93           G d: d;e                                  Z< ed<3           G d= d>e7                      Z=g d?Z>dS )AzPyTorch DeiT model.    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplecheck_model_inputs   )
DeiTConfigc            	            e Zd ZdZddededdf fdZdej        d	e	d
e	dej        fdZ
	 	 ddej        deej                 dedej        fdZ xZS )DeiTEmbeddingszv
    Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.
    Fconfiguse_mask_tokenreturnNc                 z   t                                                       t          j        t	          j        dd|j                            | _        t          j        t	          j        dd|j                            | _        |r-t          j        t	          j        dd|j                            nd | _	        t          |          | _        | j        j        }t          j        t	          j        d|dz   |j                            | _        t          j        |j                  | _        |j        | _        d S )Nr      )super__init__r   	Parametertorchzeroshidden_size	cls_tokendistillation_token
mask_tokenDeiTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_size)selfr   r    r/   	__class__s       z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/deit/modeling_deit.pyr%   zDeiTEmbeddings.__init__0   s    ek!Q8J&K&KLL"$,u{1aAS/T/T"U"UQ_i",u{1a9K'L'LMMMei 3F ; ;+7#%<A{QPVPb0c0c#d#d z&"<== +    
embeddingsheightwidthc                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        z  }	|| j        z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and 2 class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r#   N      ?r   r   bicubicF)sizemodealign_cornersdim)shaper0   r'   jit
is_tracingr4   r   reshapepermuter   
functionalinterpolateviewcat)r5   r9   r:   r;   r/   num_positionsclass_and_dist_pos_embedpatch_pos_embedrD   
new_height	new_widthsqrt_num_positionss               r7   interpolate_pos_encodingz'DeiTEmbeddings.interpolate_pos_encoding<   st    !&q)A-06q9A= y##%% 	,+*F*F6UZ??++#'#;AAArrE#B 2111abb59r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy2OD!LLLLr8   pixel_valuesbool_masked_posrT   c                 6   |j         \  }}}}|                     |          }|                                \  }}	}|R| j                            ||	d          }
|                    d                              |
          }|d|z
  z  |
|z  z   }| j                            |dd          }| j                            |dd          }t          j
        |||fd          }| j        }|r|                     |||          }||z   }|                     |          }|S )Nr=         ?r   rC   )rE   r.   r@   r,   expand	unsqueezetype_asr*   r+   r'   rM   r0   rT   r3   )r5   rU   rV   rT   _r:   r;   r9   
batch_size
seq_lengthmask_tokensmask
cls_tokensdistillation_tokensposition_embeddings                  r7   forwardzDeiTEmbeddings.forwardd   s4    +01fe**<88
$.OO$5$5!
J&/00ZLLK",,R0088EED#sTz2[45GGJ^**:r2>>
"5<<ZRPPY
,?LRSTTT
!5# 	Z!%!>!>z6SX!Y!Y"44
\\*--
r8   )F)NF)__name__
__module____qualname____doc__r   boolr%   r'   TensorintrT   r   
BoolTensorrd   __classcell__r6   s   @r7   r   r   +   s         
, 
,z 
,4 
,D 
, 
, 
, 
, 
, 
,&M5< &M &MUX &M]b]i &M &M &M &MV 7;).	 l "%"23 #'	
 
       r8   r   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )r-   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _        || _
        t          j        ||||          | _        d S )Nr   r   )kernel_sizestride)r$   r%   
image_sizer4   num_channelsr)   
isinstancecollectionsabcIterabler/   r   Conv2d
projection)r5   r   rs   r4   rt   r)   r/   r6   s          r7   r%   zDeiTPatchEmbeddings.__init__   s    !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&)L+:^hiiir8   rU   r!   c                     |j         \  }}}}|| j        k    rt          d          |                     |                              d                              dd          }|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r#   r   )rE   rt   
ValueErrorrz   flatten	transpose)r5   rU   r]   rt   r:   r;   xs          r7   rd   zDeiTPatchEmbeddings.forward   sl    2>2D/
L&%4,,,w   OOL))11!44>>q!DDr8   )	re   rf   rg   rh   r%   r'   rj   rd   rm   rn   s   @r7   r-   r-      sm         j j j j jEL U\        r8   r-           modulequerykeyvalueattention_maskscalingr3   c                    t          j        ||                    dd                    |z  }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }|||z  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nr=   )rD   dtype)ptrainingr   r#   )r'   matmulr~   r   rJ   softmaxfloat32tor   r3   r   
contiguous)
r   r   r   r   r   r   r3   kwargsattn_weightsattn_outputs
             r7   eager_attention_forwardr      s     <s}}R'<'<==GL =((2U](SSVVW\WbccL =((6?([[L !#n4,|U33K''1--88::K$$r8   c            	            e Zd Zdef fdZ	 ddej        deej                 deej        ej        f         fdZ	 xZ
S )	DeiTSelfAttentionr   c                    t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          || _        |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        |j
        | _        | j        dz  | _        d| _        t          j        |j        | j	        |j                  | _        t          j        |j        | j	        |j                  | _        t          j        |j        | j	        |j                  | _        d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r$   r%   r)   num_attention_headshasattrr|   r   rk   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   r5   r   r6   s     r7   r%   zDeiTSelfAttention.__init__   sB    ::a??PVXhHiHi?76#5 7 737 7 7  
 #)#= #&v'9F<V'V#W#W !58PP"?/5Yv143EFO\\\
9V/1C&/ZZZYv143EFO\\\


r8   Nhidden_states	head_maskr!   c           
         |j         d         }|d| j        | j        f} |                     |          j        |                     dd          } |                     |          j        |                     dd          } |                     |          j        |                     dd          }t          }| j	        j
        dk    rt          | j	        j
                 } || ||||| j        | j        | j        sdn| j                  \  }	}
|	                                d d         | j        fz   }|	                    |          }	|	|
fS )	Nr   r=   r   r#   eagerr   )r   r   r3   r   )rE   r   r   r   rL   r~   r   r   r   r   _attn_implementationr   r   r   r   r   r@   r   rH   )r5   r   r   r]   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes               r7   rd   zDeiTSelfAttention.forward   sY    #(+
D$<d>VV	0DHH]++0)<FFq!LL	4djj//4i@JJ1aPP4djj//4i@JJ1aPP(?;+w66"9$+:Z"[)<)<nL#}CCC$2C	*
 	*
 	*
& #0"4"4"6"6ss";t?Q>S"S%--.EFFo--r8   N)re   rf   rg   r   r%   r'   rj   r   tuplerd   rm   rn   s   @r7   r   r      s        ]z ] ] ] ] ] ]* PT. ."\.6>u|6L.	u|U\)	*. . . . . . . .r8   r   c                   Z     e Zd ZdZdef fdZdej        dej        dej        fdZ xZ	S )DeiTSelfOutputz
    The residual connection is defined in DeiTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S r   )	r$   r%   r   r   r)   denser1   r2   r3   r   s     r7   r%   zDeiTSelfOutput.__init__   sJ    Yv163EFF
z&"<==r8   r   input_tensorr!   c                 Z    |                      |          }|                     |          }|S r   r   r3   r5   r   r   s      r7   rd   zDeiTSelfOutput.forward  s*    

=11]33r8   )
re   rf   rg   rh   r   r%   r'   rj   rd   rm   rn   s   @r7   r   r      s         
>z > > > > > >
U\  RWR^        r8   r   c                   |     e Zd Zdef fdZdee         fdZd
dej	        de
ej	                 dej	        fd	Z xZS )DeiTAttentionr   c                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r   )r$   r%   r   	attentionr   outputsetpruned_headsr   s     r7   r%   zDeiTAttention.__init__  sI    *622$V,,EEr8   headsc                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   rC   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)r5   r   indexs      r7   prune_headszDeiTAttention.prune_heads  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r8   Nr   r   r!   c                 d    |                      ||          \  }}|                     ||          }|S r   )r   r   )r5   r   r   self_attn_outputr\   r   s         r7   rd   zDeiTAttention.forward$  s4    "nn]IFF!-}==r8   r   )re   rf   rg   r   r%   r   rk   r   r'   rj   r   rd   rm   rn   s   @r7   r   r     s        "z " " " " " ";S ; ; ; ;$ U\ hu|>T `e`l        r8   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )DeiTIntermediater   c                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r$   r%   r   r   r)   intermediate_sizer   ru   
hidden_actstrr	   intermediate_act_fnr   s     r7   r%   zDeiTIntermediate.__init__,  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r8   r   r!   c                 Z    |                      |          }|                     |          }|S r   )r   r   )r5   r   s     r7   rd   zDeiTIntermediate.forward4  s,    

=1100??r8   	re   rf   rg   r   r%   r'   rj   rd   rm   rn   s   @r7   r   r   +  sj        9z 9 9 9 9 9 9U\ el        r8   r   c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZS )
DeiTOutputr   c                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S r   )
r$   r%   r   r   r   r)   r   r1   r2   r3   r   s     r7   r%   zDeiTOutput.__init__<  sJ    Yv79KLL
z&"<==r8   r   r   r!   c                 d    |                      |          }|                     |          }||z   }|S r   r   r   s      r7   rd   zDeiTOutput.forwardA  s4    

=11]33%4r8   r   rn   s   @r7   r   r   ;  su        >z > > > > > >
U\  RWR^        r8   r   c                   h     e Zd ZdZdef fdZd	dej        deej                 dej        fdZ	 xZ
S )
	DeiTLayerz?This corresponds to the Block class in the timm implementation.r   c                 z   t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |          | _	        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )Nr   eps)r$   r%   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr)   layer_norm_epslayernorm_beforelayernorm_afterr   s     r7   r%   zDeiTLayer.__init__L  s    '-'E$&v..,V44 (( "V-?VEZ [ [ [!|F,>FDYZZZr8   Nr   r   r!   c                     |                      |          }|                     ||          }||z   }|                     |          }|                     |          }|                     ||          }|S r   )r   r   r   r   r   )r5   r   r   hidden_states_normattention_outputlayer_outputs         r7   rd   zDeiTLayer.forwardV  sz    !22=AA>>*<iHH )=8 ++M::((66 {{<??r8   r   )re   rf   rg   rh   r   r%   r'   rj   r   rd   rm   rn   s   @r7   r   r   I  s        II[z [ [ [ [ [ [ U\ hu|>T `e`l        r8   r   c                   Z     e Zd Zdef fdZddej        deej                 defdZ	 xZ
S )	DeiTEncoderr   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S  )r   ).0r\   r   s     r7   
<listcomp>z(DeiTEncoder.__init__.<locals>.<listcomp>l  s!    #_#_#_!If$5$5#_#_#_r8   F)	r$   r%   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   s    `r7   r%   zDeiTEncoder.__init__i  s`    ]#_#_#_#_uVE]?^?^#_#_#_``
&+###r8   Nr   r   r!   c                     t          | j                  D ]\  }}|||         nd } |||          }t          |          S )N)last_hidden_state)	enumerater   r   )r5   r   r   ilayer_modulelayer_head_masks         r7   rd   zDeiTEncoder.forwardo  sW    (44 	I 	IOA|.7.CillO(LHHMM????r8   r   )re   rf   rg   r   r%   r'   rj   r   r   rd   rm   rn   s   @r7   r   r   h  s        ,z , , , , , ,@ @U\ @hu|>T @`o @ @ @ @ @ @ @ @r8   r   c                       e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZeedZdeej        ej        ej        f         dd	fd
Zd	S )DeiTPreTrainedModelr   deitrU   Tr   )r   
attentionsr   r!   Nc                 z   t          |t          j        t          j        f          rt          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        |j         |j        j                                         dS dS t          |t          j                  r?|j        j                                         |j        j                            d           dS t          |t$                    r|j        j                                         |j        j                                         |j        j                                         |j        "|j        j                                         dS dS dS )zInitialize the weightsr   )meanstdNrX   )ru   r   r   ry   inittrunc_normal_weightdatar   r'   r   r   initializer_ranger   r   zero_r   fill_r   r*   r0   r+   r,   )r5   r   s     r7   _init_weightsz!DeiTPreTrainedModel._init_weights  s   fry")455 	/ "$!6!6"%%em443DKDa "7 " "b$%% M {& &&((((( '&-- 	/K""$$$M$$S)))))// 	/!'')))&+11333%*00222 ,!&,,.....	/ 	/ -,r8   )re   rf   rg   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr   r   r   ry   r   r  r   r8   r7   r   r   w  s         $O&*#$N"&"' 
/E")RY*L$M /RV / / / / / /r8   r   c                        e Zd Zddedededdf fdZdefd	Zd
 Z e	d          e
	 	 	 	 ddeej                 deej                 deej                 dedee         defd                        Z xZS )	DeiTModelTFr   add_pooling_layerr    r!   Nc                 N   t                                          |           || _        t          ||          | _        t          |          | _        t          j        |j	        |j
                  | _        |rt          |          nd| _        |                                  dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r    r   N)r$   r%   r   r   r9   r   encoderr   r   r)   r   	layernorm
DeiTPoolerpooler	post_init)r5   r   r  r    r6   s       r7   r%   zDeiTModel.__init__  s     	   (OOO"6**f&8f>STTT,=Gj(((4 	r8   c                     | j         j        S r   )r9   r.   )r5   s    r7   get_input_embeddingszDeiTModel.get_input_embeddings  s    //r8   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )r5   heads_to_pruner   r   s       r7   _prune_headszDeiTModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr8   )tie_last_hidden_statesrU   rV   r   rT   r   c                    |t          d          |                     || j        j                  }| j        j        j        j        j        }|j        |k    r|	                    |          }|                     |||          }| 
                    ||          }|j        }	|                     |	          }	| j        |                     |	          nd}
t          |	|
          S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rV   rT   )r   )r   pooler_output)r|   get_head_maskr   r   r9   r.   rz   r  r   r   r  r   r  r  r   )r5   rU   rV   r   rT   r   expected_dtypeembedding_outputencoder_outputssequence_outputpooled_outputs              r7   rd   zDeiTModel.forward  s     ?@@@ &&y$+2OPP	 9DKQ//'??>::L??/Tl + 
 
 ,0<<8HT]<+^+^);..998<8OO444UY)-'
 
 
 	
r8   )TFNNNF)re   rf   rg   r   ri   r%   r-   r  r!  r   r   r   r'   rj   rl   r   r   r   rd   rm   rn   s   @r7   r  r    s0        z d [_ lp      &0&9 0 0 0 0C C C u555 046:,0).(
 (
u|,(
 "%"23(
 EL)	(

 #'(
 +,(
 
$(
 (
 (
 ^ 65(
 (
 (
 (
 (
r8   r  c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )r  r   c                     t                                                       t          j        |j        |j                  | _        t          |j                 | _	        d S r   )
r$   r%   r   r   r)   pooler_output_sizer   r	   
pooler_act
activationr   s     r7   r%   zDeiTPooler.__init__  sE    Yv163LMM
 !23r8   r   r!   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   r0  )r5   r   first_token_tensorr*  s       r7   rd   zDeiTPooler.forward  s@     +111a40

#56666r8   r   rn   s   @r7   r  r    sj        4z 4 4 4 4 4 4
U\ el        r8   r  ad  
    DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    )custom_introc                        e Zd Zdeddf fdZee	 	 	 	 ddeej	                 deej
                 deej	                 d	ed
ee         defd                        Z xZS )DeiTForMaskedImageModelingr   r!   Nc                 V   t                                          |           t          |dd          | _        t	          j        t	          j        |j        |j        dz  |j	        z  d          t	          j
        |j                            | _        |                                  d S )NFT)r  r    r#   r   )in_channelsout_channelsrq   )r$   r%   r  r   r   
Sequentialry   r)   encoder_stridert   PixelShuffledecoderr  r   s     r7   r%   z#DeiTForMaskedImageModeling.__init__  s       fdSSS	}I".#2A58KK  
 OF122
 
 	r8   FrU   rV   r   rT   r   c                 6    | j         |f|||d|}|j        }|ddddf         }|j        \  }}	}
t          |	dz            x}}|                    ddd                              ||
||          }|                     |          }d}|| j        j        | j        j	        z  }|                    d||          }|
                    | j        j	        d          
                    | j        j	        d                              d                                          }t          j                            ||d	          }||z                                  |                                d
z   z  | j        j        z  }t%          |||j        |j                  S )a;  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DeiTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```)rV   r   rT   Nr   r=   r>   r   r#   none)	reductiongh㈵>)lossreconstructionr   r   )r   r   rE   rk   rI   rH   r<  r   rs   r4   repeat_interleaverZ   r   r   rJ   l1_losssumrt   r   r   r   )r5   rU   rV   r   rT   r   outputsr)  r]   sequence_lengthrt   r:   r;   reconstructed_pixel_valuesmasked_im_lossr@   r`   reconstruction_losss                     r7   rd   z"DeiTForMaskedImageModeling.forward  s   L /8di/
+%=	/
 /

 /
 /
 "3 *!!!QrT'24C4I1
O\_c1222)11!Q::BB:|]cejkk &*\\/%B%B"&;)T[-CCD-55b$EEO11$+2H!LL""4;#91==1	  #%-"7"7F`lr"7"s"s1D8==??488::PTCTUX\XcXppN(5!/)	
 
 
 	
r8   r+  )re   rf   rg   r   r%   r   r   r   r'   rj   rl   ri   r   r   r   rd   rm   rn   s   @r7   r5  r5    s        z d      "  046:,0).I
 I
u|,I
 "%"23I
 EL)	I

 #'I
 +,I
 
#I
 I
 I
 ^ I
 I
 I
 I
 I
r8   r5  z
    DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                        e Zd Zdeddf fdZee	 	 	 	 ddeej	                 deej	                 deej	                 d	e
d
ee         defd                        Z xZS )DeiTForImageClassificationr   r!   Nc                 :   t                                          |           |j        | _        t          |d          | _        |j        dk    rt          j        |j        |j                  nt          j                    | _	        | 
                                 d S NF)r  r   )r$   r%   
num_labelsr  r   r   r   r)   Identity
classifierr  r   s     r7   r%   z#DeiTForImageClassification.__init__l  s        +f>>>	 OUN_bcNcNc")F$68IJJJikitiviv 	r8   FrU   r   labelsrT   r   c                      | j         |f||d|}|j        }|                     |dddddf                   }d}	| | j        ||| j        fi |}	t          |	||j        |j                  S )aZ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DeiTForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: Polaroid camera, Polaroid Land camera
        ```r   rT   Nr   )r@  logitsr   r   )r   r   rP  loss_functionr   r   r   r   )
r5   rU   r   rQ  rT   r   rE  r)  rT  r@  s
             r7   rd   z"DeiTForImageClassification.forwardx  s    T /8di/
%=/
 /
 	/
 /
 "3Aqqq!9:: %4%ffdkLLVLLD$!/)	
 
 
 	
r8   r+  )re   rf   rg   r   r%   r   r   r   r'   rj   ri   r   r   r   rd   rm   rn   s   @r7   rK  rK  e  s        
z 
d 
 
 
 
 
 
  04,0)-).=
 =
u|,=
 EL)=
 &	=

 #'=
 +,=
 
=
 =
 =
 ^ =
 =
 =
 =
 =
r8   rK  zC
    Output type of [`DeiTForImageClassificationWithTeacher`].
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeeej                          ed<   dZeeej                          ed<   dS )+DeiTForImageClassificationWithTeacherOutputaj  
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores as the average of the cls_logits and distillation logits.
    cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
        class token).
    distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
        distillation token).
    NrT  
cls_logitsdistillation_logitsr   r   )re   rf   rg   rh   rT  r   r'   FloatTensorr	  rX  rY  r   r   r   r   r8   r7   rW  rW    s         	 	 +/FHU&'....2J*+2227;%"34;;;8<M8E%"345<<<59Ju01299999r8   rW  a  
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::

           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                        e Zd Zdeddf fdZee	 	 	 ddeej	                 deej	                 de
d	ee         def
d
                        Z xZS )%DeiTForImageClassificationWithTeacherr   r!   Nc                    t                                          |           |j        | _        t          |d          | _        |j        dk    rt          j        |j        |j                  nt          j                    | _	        |j        dk    rt          j        |j        |j                  nt          j                    | _
        |                                  d S rM  )r$   r%   rN  r  r   r   r   r)   rO  cls_classifierdistillation_classifierr  r   s     r7   r%   z.DeiTForImageClassificationWithTeacher.__init__  s        +f>>>	 AG@QTU@U@UBIf(&*;<<<[][f[h[h 	 AG@QTU@U@UBIf(&*;<<<[][f[h[h 	$
 	r8   FrU   r   rT   r   c                     | j         |f||d|}|j        }|                     |d d dd d f                   }|                     |d d dd d f                   }||z   dz  }	t	          |	|||j        |j                  S )NrS  r   r   r#   )rT  rX  rY  r   r   )r   r   r^  r_  rW  r   r   )
r5   rU   r   rT   r   rE  r)  rX  rY  rT  s
             r7   rd   z-DeiTForImageClassificationWithTeacher.forward  s     /8di/
%=/
 /
 	/
 /
 "3((Aqqq)ABB
"::?111aQRQRQR7;STT 22a7:! 3!/)
 
 
 	
r8   )NNF)re   rf   rg   r   r%   r   r   r   r'   rj   ri   r   r   rW  rd   rm   rn   s   @r7   r\  r\    s        z d      "  04,0).	
 
u|,
 EL)
 #'	

 +,
 
5
 
 
 ^ 
 
 
 
 
r8   r\  )rK  r\  r5  r  r   )r   )?rh   collections.abcrv   dataclassesr   typingr   r   r   r'   r   activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   utils.genericr   r   configuration_deitr   
get_loggerre   loggerModuler   r-   rj   floatr   r   r   r   r   r   r   r   r   r  r  r5  rK  rW  r\  __all__r   r8   r7   <module>rr     s         ! ! ! ! ! ! , , , , , , , , , ,        ! ! ! ! ! ! 9 9 9 9 9 9            G F F F F F F F & & & & & & Q Q Q Q Q Q Q Q X X X X X X X X X X X X X X A A A A A A A A * * * * * * 
	H	%	%V V V V VRY V V Vr    ")   P % %I%<% 
% <	%
 U\*% % % % % %>1. 1. 1. 1. 1.	 1. 1. 1.j    RY   $    BI   @    ry    
 
 
 
 
 
 
 
    *   >@ @ @ @ @") @ @ @ !/ !/ !/ !/ !// !/ !/ !/H I
 I
 I
 I
 I
# I
 I
 I
Z        	  ]
 ]
 ]
 ]
 ]
!4 ]
 ]
 ]
@   L
 L
 L
 L
 L
!4 L
 L
 L
^   
: : : : :+ : :  :& 
 
 
0
 0
 0
 0
 0
,? 0
 0

 
0
f  r8   