from typing import Optional, Union

import torch
import torch.nn as nn

from transformers.models.ijepa.configuration_ijepa import IJepaConfig

from ...modeling_outputs import BaseModelOutputWithPooling, ImageClassifierOutput
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, torch_int
from ..vit.modeling_vit import ViTEmbeddings, ViTForImageClassification, ViTModel, ViTPreTrainedModel


class IJepaEmbeddings(ViTEmbeddings):
    def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None:
        super().__init__(config, use_mask_token)
        # Remove the cls_token: I-JEPA works on patch tokens only, so the position
        # table covers num_patches entries rather than num_patches + 1.
        del self.cls_token
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size))

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
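
        For example, a checkpoint pre-trained on 224x224 images with patch size 16
        stores a 14x14 grid of positional embeddings (196 positions); on 448x448
        inputs that grid is bicubically resized to 28x28 (784 positions) before
        being added to the patch embeddings. (The numbers are illustrative; the
        target grid is always `height // self.patch_size` by `width // self.patch_size`.)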
        """
        num_patches = embeddings.shape[1]
        num_positions = self.position_embeddings.shape[1]

        # Always interpolate when tracing so the exported model handles dynamic input sizes.
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        patch_pos_embed = self.position_embeddings

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return patch_pos_embed

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        if bool_masked_pos is not None:
            seq_length = embeddings.shape[1]
            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
            # Replace the masked visual tokens by mask_tokens.
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # Add positional encodings (interpolated if requested) to each patch token.
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings
class IJepaPreTrainedModel(ViTPreTrainedModel):
    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to float32 for the truncated-normal init, then cast back, since
            # `trunc_normal_` is not implemented for half precision on CPU.
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, IJepaEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)
            if module.mask_token is not None:
                module.mask_token.data.zero_()
class IJepaModel(IJepaPreTrainedModel, ViTModel):
    def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False):
        r"""
        add_pooling_layer (`bool`, *optional*, defaults to `False`):
            Whether to add a pooling layer.
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
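
        Example (a minimal usage sketch; the `facebook/ijepa_vith14_1k` checkpoint
        name is an assumption for illustration, and a random tensor stands in for a
        processor-prepared image):

        ```python
        >>> import torch
        >>> from transformers import IJepaModel

        >>> model = IJepaModel.from_pretrained("facebook/ijepa_vith14_1k")
        >>> pixel_values = torch.randn(1, 3, 224, 224)  # normally produced by an image processor
        >>> with torch.no_grad():
        ...     features = model(pixel_values).last_hidden_state  # (1, num_patches, hidden_size)
        ```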
        """
        super().__init__(config)
        self.config = config
        self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token)


@auto_docstring(
    custom_intro="""
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
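
    A minimal sketch of that higher-resolution path (the checkpoint name and the
    448x448 input size are assumptions for illustration):

    ```python
    >>> import torch
    >>> from transformers import IJepaForImageClassification

    >>> model = IJepaForImageClassification.from_pretrained("facebook/ijepa_vith14_1k")
    >>> pixel_values = torch.randn(1, 3, 448, 448)  # larger than the pre-training resolution
    >>> logits = model(pixel_values, interpolate_pos_encoding=True).logits
    ```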
    """
)
class IJepaForImageClassification(IJepaPreTrainedModel, ViTForImageClassification):
    def __init__(self, config: IJepaConfig):
        super().__init__(config)
        self.ijepa = IJepaModel(config, add_pooling_layer=False)
        self.post_init()

    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
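
        Example (a minimal sketch; `facebook/ijepa_vith14_1k` is assumed to be a
        backbone checkpoint, so the classification head is randomly initialized and
        the loss and logits below are illustrative only):

        ```python
        >>> import torch
        >>> from transformers import IJepaForImageClassification

        >>> model = IJepaForImageClassification.from_pretrained("facebook/ijepa_vith14_1k", num_labels=2)
        >>> pixel_values = torch.randn(1, 3, 224, 224)  # stand-in for a processor-prepared image
        >>> outputs = model(pixel_values, labels=torch.tensor([1]))
        >>> outputs.loss.backward()  # standard fine-tuning step
        ```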
        """
        outputs: BaseModelOutputWithPooling = self.ijepa(
            pixel_values,
            head_mask=head_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state

        # I-JEPA has no [CLS] token, so mean-pool over the patch dimension before classifying.
        logits = self.classifier(sequence_output.mean(dim=1))

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["IJepaPreTrainedModel", "IJepaModel", "IJepaForImageClassification"]