
     `i                        d dl mZ d dlmZmZmZ d dlZd dlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZmZ dd
lmZmZmZmZ ddlmZ  ej        e          Ze ed           G d de                                  Ze ed           G d de                                  Z G d dej                  Z G d dej                  Z 	 dLdej        dej!        dej!        dej!        deej!                 de"de"fd Z#d! Z$ G d" d#ej                  Z%dMd%ej!        d&e"d'e&d(ej!        fd)Z' G d* d+ej                  Z( G d, d-ej                  Z) G d. d/e          Z* G d0 d1ej                  Z+d2ej!        d3e,ej!                 d(ej!        fd4Z- G d5 d6ej                  Z. G d7 d8ej                  Z/ G d9 d:ej                  Z0 G d; d<ej                  Z1 G d= d>e          Z2 G d? d@e          Z3 G dA dBej                  Z4e G dC dDe                      Z5dE Z6e G dF dGe5                      Z7 edH           G dI dJe5                      Z8g dKZ9dS )N    )	dataclass)CallableOptionalUnionN)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringcan_return_tuplelogging   )VJEPA2ConfigzO
    VJEPA Predictor outputs that also contains the masked encoder outputs
    )custom_introc                       e Zd ZU dZej        ed<   dZeej                 ed<   dZ	ee
ej        df                  ed<   dZee
ej        df                  ed<   dZeej                 ed<   dS )	$VJEPA2WithMaskedInputPredictorOutputa  
    masked_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `context_mask` is provided which is applied on VJEPA2Encoder outputs):
        The masked hidden state of the model.
    target_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `target_mask` is provided which is applied on VJEPA2Encoder outputs):
        The target hidden state of the model.
    last_hidden_stateNmasked_hidden_state.hidden_states
attentionstarget_hidden_state)__name__
__module____qualname____doc__torchFloatTensor__annotations__r   r   r   tupler   r        ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/vjepa2/modeling_vjepa2.pyr   r       s           ((((7;%"34;;;=AM8E%"3S"89:AAA:>Ju0#567>>>7;%"34;;;;;r&   r   zs
    VJEPA outputs that also contains the masked encoder outputs
    Optionally contains the predictor outputs
    c                        e Zd ZU dZej        ed<   dZeej                 ed<   dZ	ee
ej        df                  ed<   dZee
ej        df                  ed<   dZee         ed<    fd	Z xZS )
 VJEPA2WithMaskedInputModelOutputaq  
    masked_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `context_mask` is provided which is applied on VJEPA2Encoder outputs):
        The masked hidden state of the model.
    predictor_output (`VJEPA2WithMaskedInputPredictorOutput`, *optional*):
        The output from the Predictor module.
    r   Nr   .r   r   predictor_outputc                     t          t                                                                }t          |d         t                    r|d                                         |d<   t          |          S )N)listsuperto_tuple
isinstancer   r$   )selfoutput	__class__s     r'   r/   z)VJEPA2WithMaskedInputModelOutput.to_tupleJ   s\    egg&&(())fRj"FGG 	/,,..F2JV}}r&   )r   r   r   r    r!   r"   r#   r   r   r   r$   r   r*   r   r/   __classcell__r3   s   @r'   r)   r)   5   s           ((((7;%"34;;;=AM8E%"3S"89:AAA:>Ju0#567>>>GKhCDKKK        r&   r)   c                   j     e Zd ZdZ	 d
dedef fdZed             Zde	j
        de	j
        fd	Z xZS )VJEPA2PatchEmbeddings3Dz"
    Image to Patch Embedding
       confighidden_sizec                    t                                                       |j        | _        |j        | _        || _        t          j        |j        ||j        |j        |j        f|j        |j        |j        f          | _        d S )N)in_channelsout_channelskernel_sizestride)	r.   __init__
patch_sizetubelet_sizer:   r   Conv3din_chansprojr1   r9   r:   r3   s      r'   r@   z VJEPA2PatchEmbeddings3D.__init__V   s    
 	 +"/&I$,f.?ARS'):F<MN	
 
 
			r&   c                 `    | j         | j        z  | j        | j        z  z  | j        | j        z  z  S Nframes_per_cliprB   	crop_sizerA   r9   s    r'   num_patchesz#VJEPA2PatchEmbeddings3D.num_patchesg   s=     #v'::6#4466#446	
r&   pixel_values_videosreturnc                 ~    |                      |                              d                              dd          }|S )N   r   )rE   flatten	transpose)r1   rN   xs      r'   forwardzVJEPA2PatchEmbeddings3D.forwardo   s7    II)**22155??1EEr&   r8   )r   r   r   r    r   intr@   staticmethodrM   r!   TensorrU   r4   r5   s   @r'   r7   r7   Q   s           
 

 
 
 
 
 
 
" 
 
 \
5< EL        r&   r7   c                   R     e Zd ZdZd	dedef fdZdej        dej        fdZ	 xZ
S )
VJEPA2Embeddings>
    Construct mask token, position and patch embeddings.
    r8   r9   r:   c                     t                                                       || _        || _        t	          ||          | _        | j        j        | _        |j        | _        d S )Nr:   )r.   r@   r9   r:   r7   patch_embeddingsrM   rA   rF   s      r'   r@   zVJEPA2Embeddings.__init__y   sY    & 7K X X X0< +r&   rN   rO   c                 :   |j         d         }|                    ddddd          }|| j        j        k     r#|                    dd| j        j        dd          }| j        j        j        j        }|	                    |          }|                     |          }|S )Nr   r   rQ   r      )dtype)
shapepermuter9   rB   repeatr_   rE   weightrb   to)r1   rN   
num_framestarget_dtype
embeddingss        r'   rU   zVJEPA2Embeddings.forward   s    (.q1
 299!Q1aHH 000"5"<"<Q4;C[]^`a"b"b,18>144<4HH**+>??
r&   rV   )r   r   r   r    r   rW   r@   r!   rY   rU   r4   r5   s   @r'   r[   r[   t   s         , ,| ,# , , , , , ,5< EL        r&   r[           modulequerykeyvalueattention_maskscalingdropoutc                    t          j        ||                    dd                    |z  }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }|||z  }t          j        ||          }	|	                    dd                                          }	|	|fS )Nr,   )dimrb   )ptrainingr   rQ   )r!   matmulrS   r   
functionalsoftmaxfloat32rg   rb   rr   rw   
contiguous)
rl   rm   rn   ro   rp   rq   rr   kwargsattn_weightsattn_outputs
             r'   eager_attention_forwardr      s     <s}}R'<'<==GL =((2U](SSVVW\WbccL =((6?([[L !#n4,|U33K''1--88::K$$r&   c                    |                                  \  }}}}t          j        |dz  | j        | j                  }||dz  z  }dd|z  z  }|                    d          |z  }|                                }|                                }	|                    d          	                    dddd          }|	                    d          	                    dddd          }	| 
                    dd          }
|
                    d	          \  }}t          j        | |fd	          }
|
                    d
          }
| |	z  |
|z  z   S )NrQ   rb   deviceg       @      ?i'  r,   r   )r,   rQ   ru   rt   )sizer!   arangerb   r   	unsqueezesincossqueezere   	unflattenunbindstackrR   )rT   posB	num_headsNDomegafreqemb_sinemb_cosyy1y2s                r'   rotate_queries_or_keysr      s8   Ay!Q
 Laqwqx@@@E	QWE%,E==u$D hhjjGhhjjGoob!!((Aq!44Goob!!((Aq!44G 	
B  AXX"XFBbS"I2&&&A			"AKAK((r&   c                        e Zd Z	 	 ddededef fdZd Zd Zdd
Zd Z		 	 	 dde
ej                 dede
ej                 deeej        ej        f         eej                 f         fdZ xZS )VJEPA2RopeAttentionr8      r9   r:   num_attention_headsc                    t                                                       || _        || _        || _        ||z  dk    rt          d|f d| d          t          ||z            | _        | j        | j        z  | _        t          j
        || j        |j                  | _        t          j
        || j        |j                  | _        t          j
        || j        |j                  | _        t          j
        ||          | _        |j        | _        t          j        | j                  | _        | j        j        | j        j        z  | _        | j        j        | j        j        z  | _        t          d| j        dz  dz  z            | _        t          d| j        dz  dz  z            | _        t          d| j        dz  dz  z            | _        | j        dz  | _        d	| _        d S )
Nr   zThe hidden size z4 is not a multiple of the number of attention heads .biasrQ   r         F)r.   r@   r9   r:   r   
ValueErrorrW   attention_head_sizeall_head_sizer   Linearqkv_biasrm   rn   ro   rE   attention_probs_dropout_probdropout_probDropoutrr   rK   rA   	grid_sizerJ   rB   
grid_depthd_dimh_dimw_dimrq   	is_causal)r1   r9   r:   r   r3   s       r'   r@   zVJEPA2RopeAttention.__init__   s    	&#6 ,,110K> 0 0,0 0 0  
 $'{5H'H#I#I !58PPY{D,>V_UUU
9[$*<6?SSSY{D,>V_UUU
Ik;77	"?z$"344.$+2HH+59QQt71<BCDD
t71<BCDD
t71<BCDD
/5r&   c                 D    t          | j        | j        z            }||z  S rH   )rW   r   )r1   idstokens_per_frames      r'   _get_frame_posz"VJEPA2RopeAttention._get_frame_pos   s%    t~>??&&&r&   c                     t          | j        | j        z            }|                     |          }|||z  z
  }| j        }||z  S rH   )rW   r   r   )r1   r   r   	frame_idstokens_per_rows        r'   _get_height_posz#VJEPA2RopeAttention._get_height_pos   sN    t~>??'',,	$y00n$$r&   Nc                    |j         }|                    d          }|0|                    d                              d| j        d          }nt          j        ||          }t          | j        | j        z            }| 	                    |          }| j        }| 
                    |          }	|||z  z
  ||	z  z
  }
||	|
fS )Nr   r   )r   r   r   re   r   r!   r   rW   r   r   r   )r1   rT   masksr   
token_sizer   r   r   r   
height_ids	width_idss              r'   get_position_idsz$VJEPA2RopeAttention.get_position_ids  s    VVAYY
 //!$$++At/GKKCC,z&999Ct~>??'',,	))#..
 +i77>J;VV	*i//r&   c                    |\  }}}d}t          |d||| j        z   f         |          }|| j        z  }t          |d||| j        z   f         |          }|| j        z  }t          |d||| j        z   f         |          }	|| j        z  }|| j        k     r'|d|d f         }
t          j        |||	|
gd          }nt          j        |||	gd          }|S )Nr   .)r   r,   r   )r   r   r   r   r   r!   cat)r1   qkpos_idsd_maskh_maskw_masksqkdqkhqkwqkrs              r'   apply_rotary_embeddingsz+VJEPA2RopeAttention.apply_rotary_embeddings  s   !($RQTZ-?(?%@fMMM	TZ$RQTZ-?(?%@fMMM	TZ$RQTZ-?(?%@fMMM	TZt'''S!""W+CCc3/R888BBCc?333B	r&   Fposition_maskoutput_attentions	head_maskrO   c           
         |j         \  }}}|                     |                              |d| j        | j                                      dd          }|                     |                              |d| j        | j                                      dd          }	|                     |                              |d| j        | j                                      dd          }
|                     ||          }| 	                    |	|          }	| 	                    ||          }t          }| j        j        dk    rt          | j        j                 } || ||	|
|| j        | j        | j        sdn| j                  \  }}|                                d d         | j        fz   }|                     |                    |                    }|r||fn|f}|S )	Nr,   r   rQ   )r   eagerrk   r   rq   rr   rt   )rc   rm   viewr   r   rS   rn   ro   r   r   r   r9   _attn_implementationr   r   rq   rw   r   r   r   rE   reshape)r1   r   r   r   r   
batch_size
seq_length_query_layer	key_layervalue_layerr   attention_interfacecontext_layerattention_probsnew_context_layer_shapeoutputss                    r'   rU   zVJEPA2RopeAttention.forward)  s    %2$7!
JJJ}%%T*b$":D<TUUYq!__ 	 HH]##T*b$":D<TUUYq!__ 	 JJ}%%T*b$":D<TUUYq!__ 	 '']'KK00GDD	22;HH(?;+w66"9$+:Z"[)<)<nL#}CCC$2C	*
 	*
 	*
& #0"4"4"6"6ss";t?Q>S"S		-"7"78O"P"PQQ6G]=/22mM]r&   )r8   r   rH   )NFN)r   r   r   r   rW   r@   r   r   r   r   r   r!   rY   boolr   r$   rU   r4   r5   s   @r'   r   r      s!         #%	# ## # !	# # # # # #J' ' '% % %0 0 0 0*  ( 15"',00 0  -0  	0
 EL)0 
uU\5</0%2EE	F0 0 0 0 0 0 0 0r&   r   Finput	drop_probrw   rO   c                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    rk   r   r   r   r   )rc   ndimr!   randrb   r   floor_div)r   r   rw   	keep_probrc   random_tensorr2   s          r'   	drop_pathr   ]  s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FMr&   c                   f     e Zd ZdZd	dee         f fdZdej        dej        fdZ	de
fdZ xZS )
VJEPA2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   c                 V    t                                                       || _        d S rH   )r.   r@   r   )r1   r   r3   s     r'   r@   zVJEPA2DropPath.__init__u  s$    "r&   r   rO   c                 8    t          || j        | j                  S rH   )r   r   rw   )r1   r   s     r'   rU   zVJEPA2DropPath.forwardy  s    FFFr&   c                     d| j          S )Nzp=)r   r1   s    r'   
extra_reprzVJEPA2DropPath.extra_repr|  s    $DN$$$r&   rH   )r   r   r   r    r   floatr@   r!   rY   rU   strr   r4   r5   s   @r'   r   r   r  s        bb# #(5/ # # # # # #GU\ Gel G G G G%C % % % % % % % %r&   r   c                   R     e Zd Zd
dededef fdZdej        dej        fd	Z	 xZ
S )	VJEPA2MLPr8         @r9   r:   	mlp_ratioc                    t                                                       |x}}t          ||z            }t          j        ||d          | _        t          |j                 | _        t          j        ||d          | _	        d S NTr   )
r.   r@   rW   r   r   fc1r	   
hidden_act
activationfc2)r1   r9   r:   r   in_featuresout_featureshidden_featuresr3   s          r'   r@   zVJEPA2MLP.__init__  sx    %00lkI5669[/EEE !239_lFFFr&   hidden_staterO   c                     |                      |          }|                     |          }|                     |          }|S rH   )r   r   r  )r1   r  s     r'   rU   zVJEPA2MLP.forward  s;    xx--|44xx--r&   )r8   r   )r   r   r   r   rW   r   r@   r!   rY   rU   r4   r5   s   @r'   r   r     s        G G| G# GQV G G G G G GEL U\        r&   r   c                        e Zd ZdZ	 	 	 	 ddededed	ed
ef
 fdZ	 	 	 ddej	        de
ej	                 de
ej	                 dedeej	        df         f
dZ xZS )VJEPA2LayerzCThis corresponds to the Block class in the original implementation.rk   r8   r   r   r9   drop_path_rater:   r   r   c                    t                                                       || _        || _        || _        || _        t          j        ||j                  | _	        t          |||          | _        |j        dk    rt          |          nt          j                    | _        t          j        ||j                  | _        t#          |||          | _        d S )Nepsrk   )r:   r   )r.   r@   r9   r:   r   r   r   	LayerNormlayer_norm_epsnorm1r   	attentionr	  r   Identityr   norm2r   mlp)r1   r9   r	  r:   r   r   r3   s         r'   r@   zVJEPA2Layer.__init__  s     	&#6 "\+63HIII
,V[BUVV;A;PSV;V;V777\^\g\i\i\+63HIII
V	RRRr&   NFr   r   r   r   rO   .c                 N   |}|                      |          }|                     ||||          }|d         }|                     |          |z   }|}|                     |          }|                     |          }|                     |          |z   }|dd          }|f|z   }|S )N)r   r   r   r   r   )r  r  r   r  r  )	r1   r   r   r   r   residualself_attention_outputsattention_outputr   s	            r'   rU   zVJEPA2Layer.forward  s     !

=11!%'/	 "0 "
 "
 2!4'7888C !

=11//}55@ ), "W,r&   )rk   r8   r   r   )NNF)r   r   r   r    r   r   rW   r@   r!   rY   r   r   r$   rU   r4   r5   s   @r'   r  r    s       MM
 !$#%S SS S 	S
 !S S S S S S S. 15,0"' |  - EL)	
   
u|S 	!       r&   r  c                        e Zd Zdef fdZe	 	 	 	 ddeej                 deej                 de	de	d	e
f
d
            Z xZS )VJEPA2Encoderr9   c                    t                                                       | _        t          j                  | _        fdt          j                  D             t          j	        fdt          j                  D                       | _
        t          j        j        j                  | _        d| _        d S )Nr^   c                 T    g | ]$}j         d k    rj        |z  j         d z
  z  nd%S r   rk   )num_hidden_layersr	  .0ir9   s     r'   
<listcomp>z*VJEPA2Encoder.__init__.<locals>.<listcomp>  sT     
 
 
 LRKcfgKgKgV"Q&&*BQ*FGGmp
 
 
r&   c           	      b    g | ]+}t          |         j        j        j                   ,S )r	  r:   r   r   )r  r:   r   r   r  r   r9   drop_path_ratess     r'   r!  z*VJEPA2Encoder.__init__.<locals>.<listcomp>  sW     	 	 	  #21#5 & 2(.(B$.  	 	 	r&   r  F)r.   r@   r9   r[   r:   rj   ranger  r   
ModuleListlayerr  r  	layernormgradient_checkpointingr1   r9   r%  r3   s    `@r'   r@   zVJEPA2Encoder.__init__  s    *6v?QRRR
 
 
 
6344
 
 
 ]	 	 	 	 	 v788	 	 	
 

 f&8f>STTT&+###r&   NFrN   r   r   output_hidden_statesrO   c                 F   |rdnd }|rdnd }|                      |          }t          | j                  D ]=\  }	}
|r||fz   }|||	         nd } |
|d ||          }|d         }|r||d         fz   }>|                     |          }|r||fz   }t	          |||          S )Nr%   r   r   r   r   r   )rj   	enumerater(  r)  r   )r1   rN   r   r   r,  r}   all_hidden_statesall_self_attentionsr   r   layer_modulelayer_head_masklayer_outputss                r'   rU   zVJEPA2Encoder.forward  s    #7@BBD$5?bb4(;<<(44 		P 		POA|# I$58H$H!.7.CillO(LoO`aaM)!,M  P&9]1=M<O&O#}55 	E 1]4D D++*
 
 
 	
r&   )NNFF)r   r   r   r   r@   r   r   r!   rY   r   r   rU   r4   r5   s   @r'   r  r    s        ,| , , , , , ,0  7;,0"'%*!
 !
%el3!
 EL)!
  	!

 #!
 
!
 !
 !
 !
 !
 !
 !
 !
r&   r  tensorr   c                     g }|D ]t}|                     | j                  }|                    d                              dd|                     d                    }|t          j        | d|          gz  }ut          j        |d          S )z
    Args:
        tensor (`torch.Tensor`):
            Tensor of shape [batch_size, num_patches, feature_dim]
        masks (`List[torch.Tensor]`):
            List of tensors of shape [batch_size, num_patches] containing indices of patches to keep
    r,   r   ru   indexr   r   )rg   r   r   re   r   r!   gatherr   )r5  r   all_masked_tensorsmask	mask_keeps        r'   apply_masksr=    s      M Mwwv}%%NN2&&--aFKKOODD	u|FKKKLL9'Q////r&   c                        e Zd ZdZdef fdZed             Z	 ddej	        de
ej	                 de
ej	                 d	ed
eej	        ej	        f         f
dZ xZS )VJEPA2PredictorEmbeddingsr\   r9   c                 r   t                                                       || _        t          j        |j        |j                  | _        d| _        |j	        | _
        |j        | _        t          j        t          j        | j        dd|j                            | _        |j        | _        || _        d S )Nr   r   )r.   r@   r9   r   r   r:   pred_hidden_sizepredictor_embeddingsnum_mask_tokenspred_zero_init_mask_tokenszero_init_mask_tokenspred_num_mask_tokens	Parameterr!   zerosmask_tokensrA   r1   r9   r3   s     r'   r@   z"VJEPA2PredictorEmbeddings.__init__  s    $&If.@&BY$Z$Z! %+%F"%:<D4H!QPVPg(h(hii +r&   c                     | j         dk    r/| j         | j        z  | j        | j        z  z  | j        | j        z  z  S | j        | j        z  | j        | j        z  z  S )Nr   rI   rL   s    r'   rM   z%VJEPA2PredictorEmbeddings.num_patches(  sm    !A%%'6+>>#v'88:#v'88: $(99f>NRXRc>cddr&   r   r   context_masktarget_mask
mask_indexrO   c                    |                     d          }|                     |          }|| j        z  }| j        |         }|d                                         dz   }|                    ||d          }t          ||          }|                    t          |          dd          }t          j	        ||gd          }	t          j	        |d          }
t          j	        |d          }t          j	        |
|gd          }|	|fS )z
        hidden_states : encoder outputs (context)
        context_mask: tokens of the context (outputs from the encoder)
        target_mask: tokens to predict
        mask_index: index of the target mask to choose (useful for multiclip?)
        r   r   r   )
r   rB  rC  rI  maxre   r=  lenr!   r   )r1   r   rL  rM  rN  r   contexttargetmax_patch_numrj   cmtmr   s                r'   rU   z!VJEPA2PredictorEmbeddings.forward3  s    q!!++M::  $"66
!*- $A**,,q0q-33V[11 ..\!2!2Aq99Y0a888
 Y|+++Y{***	2r(***5  r&   r   )r   r   r   r    r   r@   rX   rM   r!   rY   r-   rW   r$   rU   r4   r5   s   @r'   r?  r?    s         |       e e \e &! &!|&! 5<(&! %,'	&!
 &! 
u|U\)	*&! &! &! &! &! &! &! &!r&   r?  c                        e Zd Zdef fdZddZd Ze	 	 	 ddej	        de
ej	                 d	e
ej	                 d
eej	                 dededefd            Z xZS )VJEPA2Predictorr9   c                    t                                                       | _        d| _        t	                    | _        fdt          j                  D             t          j	        fdt          j                  D                       | _
        t          j        j        j                  | _        t          j        j        j        d          | _        d S )NFc                 T    g | ]$}j         d k    rj        |z  j         d z
  z  nd%S r  )pred_num_hidden_layersr	  r  s     r'   r!  z,VJEPA2Predictor.__init__.<locals>.<listcomp>b  sV     
 
 
  0144 %)V-JQ-NOO	
 
 
r&   c           	      b    g | ]+}t          |         j        j        j                   ,S r#  )r  rA  pred_num_attention_headspred_mlp_ratior$  s     r'   r!  z,VJEPA2Predictor.__init__.<locals>.<listcomp>k  sW     	 	 	  #21#5 & 7(.(G$3  	 	 	r&   r  Tr   )r.   r@   r9   r*  r?  rj   r&  r[  r   r'  r(  r  rA  r  r)  r   r:   rE   r+  s    `@r'   r@   zVJEPA2Predictor.__init__]  s    &+#3F;;
 
 
 
 6899
 
 
 ]	 	 	 	 	 v<==	 	 	
 

 f&=6CXYYYIf5v7IPTUUU			r&   Nc           	         |                     |j                  }t          j        |d|          }|                     |j                  }|                    d                              dd|                    d                    }t          j        |d|          }||d         |                     |j                  }|                    ddddd          }|                    d                              d                              d|                    d          |                    d          d                              d                              dddd|                    d                    }t          j        |d|          }|                    d                              d                              d                              d|                    d          |                    d          |                    d          d          }t          j        |d|          }|                    ddddd          }|||fS )Nr   r7  r,   r   rQ   r   ra   )rg   r   r!   r9  r   expandr   rd   )r1   r   position_masksargsortr   hidden_states_argsort
argsort_4d
argsort_5ds           r'   sort_tokenszVJEPA2Predictor.sort_tokensy  s   **^233n!7KKK **]122 ' 1 1" 5 5 < <R]EWEWXZE[E[ \ \]AVWWW  Yq\%=jj!122G!))!Q1a88I!!!$$1INN1--y~~a/@/@"EE2BB	r(:(:;;  YAZHHHI!!!$$11INN1--y~~a/@/@)..QRBSBSUWXX	  YAZHHHI!))!Q1a88Ini77r&   c                    |                     |j                  }t          j        |d          }|                    d                              dd|                    d                    }t          j        |d|          }|S )Nr   r   r,   r7  )rg   r   r!   rb  r   r`  r   r9  )r1   r   rb  reverse_argsorts       r'   unsort_tokenszVJEPA2Predictor.unsort_tokens  sz    **]122-Q777)33B77>>r2}GYGYZ\G]G]^^]QQQr&   Fencoder_hidden_statesrL  rM  r   r   r,  rO   c                 \   |rdnd }|rdnd }	t          ||          }|j        \  }
}}|                     |||          \  }}t          j        |d          }|                     ||||          \  }}}t          | j                  D ]=\  }}|r||fz   }|||         nd } |||||          }|d         }|r|	|d         fz   }	>|r||fz   }|                     |          }| 	                    ||          }|d d |d f         }| 
                    |          }t          |||	          S )Nr%   r   r   r   r.  )r=  rc   rj   r!   rb  rf  r/  r(  r)  ri  rE   r   )r1   rj  rL  rM  r   r   r,  r}   r0  r1  r   N_ctxtr   r   ra  rb  r   r2  r3  r4  s                       r'   rU   zVJEPA2Predictor.forward  s    #7@BBD$5?bb4 !,,A< P P,261(,8M|]h(i(i%~ -A666373C3CMSacjlu3v3v0~y(44 		P 		POA|# I$58H$H!.7.CillO(LYjkkM)!,M  P&9]1=M<O&O# 	E 1]4D D}55**='BB%aaaj1		-00++*
 
 
 	
r&   rH   )NFF)r   r   r   r   r@   rf  ri  r   r!   rY   r-   r   r   r   rU   r4   r5   s   @r'   rX  rX  \  s        V| V V V V V V88 8 8 8B    -1"'%*0
 0
$|0
 5<(0
 %,'	0

 EL)0
  0
 #0
 
0
 0
 0
 0
 0
 0
 0
 0
r&   rX  c                        e Zd ZdZdef fdZ	 	 ddej        deej                 dee	         d	e
ej        eej                 f         fd
Z xZS )VJEPA2PoolerSelfAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr9   c                    t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   F)r.   r@   r9   r:   	embed_dimr   r   head_dimr   scaleattention_dropoutrr   r   r   r   k_projv_projq_projout_projrJ  s     r'   r@   z"VJEPA2PoolerSelfAttention.__init__  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr&   NFr   rp   r   rO   c           
         |j         \  }}}|                     |          }|                     |          }|                     |          }	|                    ||| j        | j                                      dd          }|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	t          }
| j	        j
        dk    rt          | j	        j
                 }
 |
| |||	|| j        | j        | j        sdn| j                  \  }}|                    |||                                          }|                     |          }|sd}||fS z#Input shape: Batch x Time x Channelr   rQ   r   rk   r   N)rc   rw  ru  rv  r   r   rr  rS   r   r9   r   r   r   rs  rw   rr   r   r|   rx  )r1   r   rp   r   r   r   rq  querieskeysvaluesr   r   r~   s                r'   rU   z!VJEPA2PoolerSelfAttention.forward  s    -:,?)
J	++m,,{{=))]++,,z:t~t}UU__`acdeeyyZOOYYZ[]^__ZT^T]SS]]^_abcc(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*j)LLWWYYmmK00  	 LL((r&   NFr   r   r   r    r   r@   r!   rY   r   r   r$   rU   r4   r5   s   @r'   rn  rn    s        GGB| B B B B B B. 26,1	') ')|') !.') $D>	')
 
u|Xel33	4') ') ') ') ') ') ') ')r&   rn  c                        e Zd ZdZdef fdZ	 	 ddej        dej        dej        d	eej                 d
ee	         de
ej        eej                 f         fdZ xZS )VJEPA2PoolerCrossAttentionz_It's different from other cross-attention layers, doesn't have output projection layer (o_proj)r9   c                 :   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        d| _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S rp  )r.   r@   r9   r:   rq  r   r   rr  r   rs  rt  rr   r   r   r   ru  rv  rw  rJ  s     r'   r@   z#VJEPA2PoolerCrossAttention.__init__  s    +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??r&   NFr{  r|  r}  rp   r   rO   c           
         |j         \  }}}|j         d         }	|                     |          }|                     |          }|                     |          }|                    ||| j        | j                                      dd          }|                    ||	| j        | j                                      dd          }|                    ||	| j        | j                                      dd          }t          }
| j	        j
        dk    rt          | j	        j
                 }
 |
| ||||| j        | j        | j        sdn| j                  \  }}|                    |||                                          }|sd}||fS rz  )rc   rw  ru  rv  r   r   rr  rS   r   r9   r   r   r   rs  rw   rr   r   r|   )r1   r{  r|  r}  rp   r   r   q_seq_lengthrq  kv_seq_lengthr   r   r~   s                r'   rU   z"VJEPA2PoolerCrossAttention.forward.  s    /6m+
L)
1++g&&{{4  V$$,,z<WWaabcefggyy]DNDMRR\\]^`abbZVV``abdeff(?;+w66"9$+:Z"[$7$7nJ#}>CC$,	%
 	%
 	%
!\ "))*lINNYY[[  	 LL((r&   r~  r  r5   s   @r'   r  r    s        ii@| @ @ @ @ @ @0 26,1)) )))) l)) 	))
 !.)) $D>)) 
u|Xel33	4)) )) )) )) )) )) )) ))r&   r  c                   z     e Zd Zdef fdZ	 d
dej        dej        dee         de	ej        df         fd	Z
 xZS )VJEPA2PoolerSelfAttentionLayerr9   c                 :   t                                                       t          j        |j        |j                  | _        t          |          | _        t          j        |j        |j                  | _	        t          ||j                  | _        d S Nr  r^   )r.   r@   r   r  r:   r  layer_norm1rn  	self_attnlayer_norm2r   r  rJ  s     r'   r@   z'VJEPA2PoolerSelfAttentionLayer.__init__\  s}    <(:@UVVV26::<(:@UVVVV1CDDDr&   Fr   rp   r   rO   .c                     |}|                      |          }|                     |||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   rp   r   )r  r  r  r  )r1   r   rp   r   r  r~   r   s          r'   rU   z&VJEPA2PoolerSelfAttentionLayer.forwardc  s      !((77&*nn')/ '5 '
 '
#|
 !=0 ((77// =0 " 	'&Gr&   )Fr   r   r   r   r@   r!   rY   r   r   r$   rU   r4   r5   s   @r'   r  r  [  s        E| E E E E E E -2	# #|# # $D>	#
 
u|S 	!# # # # # # # #r&   r  c                        e Zd Zdef fdZ	 	 ddej        dej        deej                 ded	e	ej        d
f         f
dZ
 xZS )VJEPA2PoolerCrossAttentionLayerr9   c                 :   t                                                       t          j        |j        |j                  | _        t          |          | _        t          j        |j        |j                  | _	        t          ||j                  | _        d S r  )r.   r@   r   r  r:   r  r  r  
cross_attnr  r   r  rJ  s     r'   r@   z(VJEPA2PoolerCrossAttentionLayer.__init__  s}    <(:@UVVV4V<<<(:@UVVVV1CDDDr&   NFr{  r  rp   r   rO   .c                    |}|                      |          }|                     |||||          ^}}||z   }|}|                     |          }|                     |          }||z   }|f}|r|t	          |          z  }|S )N)rp   r   )r  r  r  r  r$   )r1   r{  r  rp   r   r  r~   r   s           r'   rU   z'VJEPA2PoolerCrossAttentionLayer.forward  s     ''55&*oo)/ '6 '
 '
#|  ,.  ''55xx--,./ 	+u\***Gr&   r~  r  r5   s   @r'   r  r    s        E| E E E E E E 26"'  l !.	
   
u|S 	!       r&   r  c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )VJEPA2AttentivePoolerzAttentive Poolerr9   c                 D   t                                                       t          j        t	          j        ddj                            | _        t                    | _	        t          j
        fdt          j                  D                       | _        d S )Nr   c                 .    g | ]}t                    S r%   )r  )r  r   r9   s     r'   r!  z2VJEPA2AttentivePooler.__init__.<locals>.<listcomp>  s"    ]]]+F33]]]r&   )r.   r@   r   rG  r!   rH  r:   query_tokensr  cross_attention_layerr'  r&  num_pooler_layersself_attention_layersrJ  s    `r'   r@   zVJEPA2AttentivePooler.__init__  s    LQ6;M)N)NOO%DV%L%L"%']]]]]U6C[=\=\]]]&
 &
"""r&   r  rO   c                     | j         D ]} ||d           d         }| j                            |j        d         dd          }|                     ||          d         }|                    d          S )N)rp   r   r   )r  r  re   rc   r  r   )r1   r  r(  r{  s       r'   rU   zVJEPA2AttentivePooler.forward  s    / 	G 	GE 5dCCCAFLL#**<+=a+@!QGG11'<HHK##A&&&r&   )
r   r   r   r    r   r@   r!   rY   rU   r4   r5   s   @r'   r  r    sp        
| 
 
 
 
 
 
'EL 'U\ ' ' ' ' ' ' ' 'r&   r  c                   <    e Zd ZU eed<   dZdZdZg dZdZ	dZ
d ZdS )VJEPA2PreTrainedModelr9   vjepa2rN   T)r  r  r  r?  c                    | j         j        }d }t          |t                    r ||j        |           t          |j        d          D ]E\  }}||dz  z  } ||j        j        j	        |            ||j
        j        j	        |           F|t          |j                  dz   dz  z  } ||j        j
        j        j	        |           dS t          |t                    r;|j        r |j        j                                         dS  ||j        |           dS t          |t&          j        t&          j        t&          j        f          r; ||j	        |           |j         |j        j                                         dS dS t          |t&          j                  r?|j        j                                         |j	        j                            d           dS dS )zInitialize the weightsc                     | j                             t          j                  }t          j                            |d|          }|                    | j                  | _         d S )Nrk   )meanstd)datarg   r!   r{   r   inittrunc_normal_rb   )rf   r  data_float_32	data_inits       r'   trunc_normal_f32_z>VJEPA2PreTrainedModel._init_weights.<locals>.trunc_normal_f32_  sL    "KNN5=99M--m#3-OOI#,,v|44FKKKr&   )r  r   g      ?Nr   )r9   initializer_ranger0   r  r  r/  r  r  rx  rf   r  r  rQ  r  r?  rE  rI  r  zero_r   r   Conv2drC   r   r  fill_)r1   rl   init_stdr  r   r(  r  s          r'   _init_weightsz#VJEPA2PreTrainedModel._init_weights  s    ;0	5 	5 	5
 f344 	*f1x@@@@%f&BAFF A A5!S&)!!%/":"AsKKKK!!%)-"6C@@@@@c&">??!CKKCf:>BIsSSSSSS 9:: 	*+ D"'--/////!!&"4(CCCCCCBIry ABB 	*fm::::{& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r&   N)r   r   r   r   r#   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attnr  r%   r&   r'   r  r    sa          +O&*#   N* * * * *r&   r  c                     | B|                      d                               d          } |                     |dddd          } ndg|z  } | S )z
    Inputs:
        - head_mask: bsz x seq_length x seq_length | None
    Returns
        - [num_hidden_layers x batch x num_heads x seq_length x seq_length] | [num_hidden_layers]
    Nr   r   r,   )r   r`  )r   r  s     r'   _convert_head_mask_to_5dr    s\     ''**44Q77	$$%6BBGG		F..	r&   c                   4    e Zd Zdef fdZdefdZee	 	 	 	 	 	 	 dde	j
        dee	j
                 d	eee	j
                          d
ee	j
                 deee	j
                          dedee         dee         defd                        Zde	j
        fdZ xZS )VJEPA2Modelr9   c                     t                                          |           || _        t          |          | _        t          |          | _        |                                  d S rH   )r.   r@   r9   r  encoderrX  	predictor	post_initrJ  s     r'   r@   zVJEPA2Model.__init__  sX       $V,,(00 	r&   rO   c                 $    | j         j        j        S rH   )r  rj   r_   r   s    r'   get_input_embeddingsz VJEPA2Model.get_input_embeddings  s    |&77r&   NFrN   context_head_maskrL  target_head_maskrM  skip_predictorr   r,  c	                 t   ||n| j         j        }||n| j         j        }|t          d          t	          || j         j                  }t	          || j         j                  }|                     ||||          }
|
j        }|||	                    d          }|	                    d          }t          j        ||j                                      d                              |df          g}t          j        ||j                                      d                              |df          g}|sL|                     ||||||          }t!          |j        t#          ||          |j        |j                  }nd}t)          |t#          ||          |
j        |
j        |	          }|S )
aL  
        context_head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
            The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard) for the context.
        context_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*):
            The mask position ids indicating which encoder output patches are going to be exposed to the predictor.
            By default, this mask is created as torch.arange(N).unsqueeze(0).repeat(B,1), indicating full context
            available to the predictor.
        target_head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
            The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard) for the target.
        target_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*):
            The mask position ids indicating which encoder output patches are going to be used as a prediction target
            for the predictor. By default, this mask is created as torch.arange(N).unsqueeze(0).repeat(B,1), indicating
            that the predictor should predict all encoder patches.
        skip_predictor (bool):
            flag to skip the predictor forward, useful if you just need the encoder outputs
        Nz'You have to specify pixel_values_videos)rN   r   r   r,  r   r   r   )rj  rL  rM  r   r   r,  )r   r   r   r   )r   r   r   r   r*   )r9   r   r,  r   r  r  r[  r  r   r   r!   r   r   r   re   r  r   r=  r   r   r)   )r1   rN   r  rL  r  rM  r  r   r,  r}   encoder_outputssequence_outputr   r   predictor_outputsr*   encoder_outputs                    r'   rU   zVJEPA2Model.forward  s   < 2C1N--TXT_Tq$8$D  $+Jj 	 &FGGG 55FHeff34DdkFhii+/<< 3'/!5	 ,8 ,
 ,
 *;K$7#((++A$$Q''A!L3F3MNNNXXYZ[[bbdeghcijjkL <2E2LMMMWWXYZZaacdfgbhiijK 	$15&5)'*"3%9 2@ 2 2  D"3"E$/$M$M/=,7	       $9- +O\ J J)7&1-
 
 
 r&   c                 >    |                      |d          }|j        S )NT)r  )rU   r   )r1   rN   r  s      r'   get_vision_featureszVJEPA2Model.get_vision_featuresg  s!    &9$OO//r&   )NNNNFNN)r   r   r   r   r@   r7   r  r   r   r!   rY   r   r-   r   r)   rU   r  r4   r5   s   @r'   r  r    s[       |      8&= 8 8 8 8  59593748$,0/3P P"\P $EL1P tEL12	P
 #5<0P d5<01P P $D>P 'tnP 
*P P P ^ Pd0%, 0 0 0 0 0 0 0 0r&   r  z}
    V-JEPA 2 Model transformer with a video classification head on top (a linear layer on top of the attentive pooler).
    c                        e Zd Zdef fdZee	 	 	 d
dej        de	ej                 de	e
         de	e
         deeef         f
d	                        Z xZS )VJEPA2ForVideoClassificationr9   c                 &   t                                          |           |j        | _        t          |          | _        t          |          | _        t          j        |j	        |j        d          | _
        |                                  d S r   )r.   r@   
num_labelsr  r  r  poolerr   r   r:   
classifierr  rJ  s     r'   r@   z%VJEPA2ForVideoClassification.__init__r  s|        +!&)) ,F33)F$68IPTUUU 	r&   NrN   labelsr   r,  rO   c                    |                      |d||          }|j        }|                     |          }|                     |          }d}	||                     ||| j                  }	t          |	||j        |j                  S )ag  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> import numpy as np
        >>> from transformers import AutoVideoProcessor, VJEPA2ForVideoClassification

        >>> device = "cuda"

        >>> video_processor = AutoVideoProcessor.from_pretrained("facebook/vjepa2-vitl-fpc16-256-ssv2")
        >>> model = VJEPA2ForVideoClassification.from_pretrained("facebook/vjepa2-vitl-fpc16-256-ssv2").to(device)

        >>> video = np.ones((64, 256, 256, 3))  # 64 frames, 256x256 RGB
        >>> inputs = video_processor(video, return_tensors="pt").to(device)

        >>> # For inference
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> logits = outputs.logits

        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])

        >>> # For training
        >>> labels = torch.ones(1, dtype=torch.long, device=device)
        >>> loss = model(**inputs, labels=labels).loss

        ```T)rN   r  r   r,  N)pooled_logitsr  r9   )losslogitsr   r   )	r  r   r  r  loss_functionr9   r   r   r   )
r1   rN   r  r   r,  r   r   pooler_outputr  r  s
             r'   rU   z$VJEPA2ForVideoClassification.forward  s    X ++ 3/!5	  
 
 $5$566//%%F6RVR]%^^D$!/)	
 
 
 	
r&   )NNN)r   r   r   r   r@   r   r   r!   rY   r   r   r   r$   r   rU   r4   r5   s   @r'   r  r  l  s        |        *.,0/3>
 >
"\>
 &>
 $D>	>

 'tn>
 
u++	,>
 >
 >
 ^ >
 >
 >
 >
 >
r&   r  )r  r  r  )rk   )rk   F):dataclassesr   typingr   r   r   r!   r   activationsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   utilsr   r   r   r   configuration_vjepa2r   
get_loggerr   loggerr   r)   Moduler7   r[   rY   r   r   r   r   r   r   r   r   r  r  r-   r=  r?  rX  rn  r  r  r  r  r  r  r  r  __all__r%   r&   r'   <module>r     s   " ! ! ! ! ! , , , , , , , , , ,        ! ! ! ! ! ! 9 9 9 9 9 9 F F F F F F F F F F F F F F F F K K K K K K K K K K K K . . . . . . 
	H	%	%   
< < < < <; < <  <       {    *         bi      F    ry   T % %I%<% 
% <	%
 U\*% % % % % %<) ) )6I I I I I") I I IZ U\ e T V[Vb    *% % % % %RY % % %    	    4 4 4 4 4, 4 4 4n;
 ;
 ;
 ;
 ;
BI ;
 ;
 ;
|0 0T%,-? 0EL 0 0 0 0"C! C! C! C! C!	 C! C! C!Lv
 v
 v
 v
 v
bi v
 v
 v
r>) >) >) >) >)	 >) >) >)BA) A) A) A) A) A) A) A)J+ + + + +%? + + +\% % % % %&@ % % %P' ' ' ' 'BI ' ' '& -* -* -* -* -*O -* -* -*`   d0 d0 d0 d0 d0' d0 d0 d0N   
N
 N
 N
 N
 N
#8 N
 N
 
N
b S
R
Rr&   