
     `iە                     x   d Z ddlZddlmZ ddlmZ ddlmZm	Z	 ddl
ZddlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+  e#j,        e-          Z.e e"d           G d de                                   Z/e e"d           G d de                                   Z0d Z1 G d dej2                  Z3 G d dej2                  Z4	 dDd!ej2        d"ej5        d#ej5        d$ej5        d%e	ej5                 d&e6d'e6fd(Z7 G d) d*ej2                  Z8 G d+ d,ej2                  Z9 G d- d.ej2                  Z: G d/ d0ej2                  Z; G d1 d2ej2                  Z< G d3 d4e          Z= G d5 d6ej2                  Z>e" G d7 d8e                      Z?e" G d9 d:e?                      Z@ G d; d<ej2                  ZA e"d=           G d> d?e?                      ZB e"d@           G dA dBe?                      ZCg dCZDdS )Ez,PyTorch VideoMAE (masked autoencoder) model.    N)deepcopy)	dataclass)CallableOptional)nn)MSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringlogging)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)can_return_tuplecheck_model_inputs   )VideoMAEConfigz[
    Class for VideoMAEDecoder's outputs, with potential hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZeej                 ed<   dZ	ee
ej                          ed<   dZee
ej                          ed<   dS )VideoMAEDecoderOutputz
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    Nlogitshidden_states
attentions)__name__
__module____qualname____doc__r    r   torchFloatTensor__annotations__r!   tupler"        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/videomae/modeling_videomae.pyr   r   *   sp          
 +/FHU&'...8<M8E%"345<<<59Ju01299999r,   r   zb
    Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.
    c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eeej                          ed<   dZeeej                          ed<   dS )VideoMAEForPreTrainingOutputz
    loss (`torch.FloatTensor` of shape `(1,)`):
        Pixel reconstruction loss.
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    Nlossr    r!   r"   )r#   r$   r%   r&   r0   r   r'   r(   r)   r    r!   r*   r"   r+   r,   r-   r/   r/   ;   s           )-D(5$
%,,,*.FHU&'...8<M8E%"345<<<59Ju01299999r,   r/   c                 d   fdt          j        fdt          |           D                       }t          j        |dddddf                   |dddddf<   t          j        |dddddf                   |dddddf<   t          j        |                              d          S )z Sinusoid position encoding tablec                 >      fdt                    D             S )Nc           	      R    g | ]#}t          j        d d|dz  z  z            z  $S )i'     )nppower).0hid_jd_hidpositions     r-   
<listcomp>zOget_sinusoid_encoding_table.<locals>.get_position_angle_vec.<locals>.<listcomp>V   s8    ___28E1
+;e+CDDD___r,   )range)r:   r9   s   `r-   get_position_angle_vecz;get_sinusoid_encoding_table.<locals>.get_position_angle_vecU   s+    _____RWX]R^R^____r,   c                 &    g | ]} |          S r+   r+   )r7   pos_ir=   s     r-   r;   z/get_sinusoid_encoding_table.<locals>.<listcomp>X   s%    \\\55e<<\\\r,   Nr   r4   r   )r5   arrayr<   sincosr'   r(   	unsqueeze)
n_positionr9   sinusoid_tabler=   s    ` @r-   get_sinusoid_encoding_tablerF   Q   s    ` ` ` ` ` X\\\\%PZJ[J[\\\]]N f^AAAqt!tG%<==N111add7 f^AAAqt!tG%<==N111add7^,,66q999r,   c                   (     e Zd ZdZ fdZd Z xZS )VideoMAEEmbeddingsz7
    Construct the patch and position embeddings.

    c                     t                                                       t          |          | _        | j        j        | _        t          | j        |j                  | _        || _        d S N)	super__init__VideoMAEPatchEmbeddingspatch_embeddingsnum_patchesrF   hidden_sizeposition_embeddingsconfigselfrR   	__class__s     r-   rL   zVideoMAEEmbeddings.__init__e   s[     7 ? ?0<#>t?OQWQc#d#d r,   c                    |                      |          }|| j                                                            |                              |j        d          z   }|+|j        \  }}}||          }|                    |d|          }|S )NTdevicecopy)rN   rQ   detachtype_astorX   shapereshape)rT   pixel_valuesbool_masked_pos
embeddings
batch_size_num_channelss          r-   forwardzVideoMAEEmbeddings.forwardn   s    **<88
  $":"A"A"C"C"K"KJ"W"W"Z"Z$4 #[ #
 #
 


 &*4*:'J<#_$45J#++JLIIJr,   r#   r$   r%   r&   rL   rf   __classcell__rU   s   @r-   rH   rH   _   sQ         
          r,   rH   c                   (     e Zd ZdZ fdZd Z xZS )rM   aw  
    Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
    height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
    patch_size).

    c           	      v   t                                                       |j        }|j        }|j        }|j        }|j        }|j        }t          |t          j
        j                  r|n||f}t          |t          j
        j                  r|n||f}|| _        || _        t          |          | _        |d         |d         z  |d         |d         z  z  || j        z  z  }|| _        || _        t          j        ||| j        |d         |d         f| j        |d         |d         f          | _        d S )Nr   r   )in_channelsout_channelskernel_sizestride)rK   rL   
image_size
patch_sizere   rP   
num_framestubelet_size
isinstancecollectionsabcIterableintrO   r   Conv3d
projection)
rT   rR   rp   rq   re   rP   rr   rs   rO   rU   s
            r-   rL   z VideoMAEPatchEmbeddings.__init__   sC   &
&
*(&
*#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
$$--]jm+
1A0NOS]aearSrs 	 )&)$$*JqM:a=I%z!}jmD	
 
 
r,   c                    |j         \  }}}}}|| j        k    rt          d          || j        d         k    s|| j        d         k    r2t          d| d| d| j        d          d| j        d          d	          |                    dddd	d
          }|                     |                              d                              dd          }|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r4   r	      )r^   re   
ValueErrorrp   permuterz   flatten	transpose)rT   r`   rc   rr   re   heightwidthrb   s           r-   rf   zVideoMAEPatchEmbeddings.forward   s    >J>P;
Jfe4,,,w   T_Q'''5DOA4F+F+FwVwwewwDO\]L^wwaeapqraswww   $++Aq!Q::__\22::1==GG1MM
r,   rg   ri   s   @r-   rM   rM      sQ         
 
 
 
 
6      r,   rM           modulequerykeyvalueattention_maskscalingdropoutc                    t          j        ||                    dd                    |z  }t          j                            |dt           j                                      |j                  }t          j        	                    ||| j
                  }|||z  }t          j        ||          }	|	                    dd                                          }	|	|fS )NrZ   )dimdtype)ptrainingr   r4   )r'   matmulr   r   
functionalsoftmaxfloat32r]   r   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r-   eager_attention_forwardr      s     <s}}R'<'<==GL =((2U](SSVVW\WbccL =((6?([[L !#n4,|U33K''1--88::K$$r,   c                   t     e Zd Zdeddf fdZddeej                 deej        ej        f         fdZ	 xZ
S )VideoMAESelfAttentionrR   returnNc                 t   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          || _        |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        |j
        | _        | j        dz  | _        d| _        t          j        |j        | j	        d          | _        t          j        |j        | j	        d          | _        t          j        |j        | j	        d          | _        |j        rbt          j        t+          j        | j	                            | _        t          j        t+          j        | j	                            | _        d S d | _        d | _        d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      Fbias)rK   rL   rP   num_attention_headshasattrr~   rR   rx   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearr   r   r   qkv_bias	Parameterr'   zerosq_biasv_biasrS   s     r-   rL   zVideoMAESelfAttention.__init__   s    ::a??PVXhHiHi?76#5 7 737 7 7   #)#= #&v'9F<V'V#W#W !58PP"?/5Yv143EERRR
9V/1C%PPPYv143EERRR
? 	,u{43E'F'FGGDK,u{43E'F'FGGDKKKDKDKKKr,   	head_maskc           
         |j         \  }}}| j        t          j        | j        d          nd }t
          j                            || j        j	        |          }t
          j                            || j
        j	        | j                  }t
          j                            || j        j	        | j                  }	|                    |d| j        | j                                      dd          }
|                    |d| j        | j                                      dd          }|	                    |d| j        | j                                      dd          }t           }| j        j        dk    rt&          | j        j                 } || ||
||| j        | j        | j        sdn| j        	          \  }}|                                d d
         | j        fz   }|                    |          }||fS )NF)requires_grad)inputweightr   rZ   r   r4   eagerr   )r   r   r   r   )r^   r   r'   
zeros_liker   r   r   linearr   r   r   r   viewr   r   r   r   rR   _attn_implementationr   r   r   r   r   sizer   r_   )rT   r!   r   rc   
seq_lengthrd   k_biaskeysvaluesqueries	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes                    r-   rf   zVideoMAESelfAttention.forward   s   $1$7!
JGK{G^!$+UCCCCdh}##-V\#]]%%M$*BSZ^Ze%ff-&&]4:CT[_[f&ggIIj"d.FH`aakklmopqq	kk*b$2JDLdeeoopqstuull:r43KTMeffppqrtuvv(?;+w66"9$+:Z"[)<)<nL#}CCC$2C	*
 	*
 	*
& #0"4"4"6"6ss";t?Q>S"S%--.EFFo--r,   rJ   )r#   r$   r%   r   rL   r   r'   Tensorr*   rf   rh   ri   s   @r-   r   r      s        ~ $      4. .0F .RWX]XdfkfrXrRs . . . . . . . .r,   r   c                   Z     e Zd ZdZdef fdZdej        dej        dej        fdZ xZ	S )VideoMAESelfOutputz
    The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rR   c                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S rJ   )	rK   rL   r   r   rP   denseDropouthidden_dropout_probr   rS   s     r-   rL   zVideoMAESelfOutput.__init__  sJ    Yv163EFF
z&"<==r,   r!   input_tensorr   c                 Z    |                      |          }|                     |          }|S rJ   r   r   rT   r!   r   s      r-   rf   zVideoMAESelfOutput.forward  s*    

=11]33r,   )
r#   r$   r%   r&   r   rL   r'   r   rf   rh   ri   s   @r-   r   r     s         
>~ > > > > > >
U\  RWR^        r,   r   c                   |     e Zd Zdef fdZdee         fdZd
dej	        de
ej	                 dej	        fd	Z xZS )VideoMAEAttentionrR   c                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S rJ   )rK   rL   r   	attentionr   outputsetpruned_headsrS   s     r-   rL   zVideoMAEAttention.__init__$  sI    .v66(00EEr,   headsc                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)rT   r   indexs      r-   prune_headszVideoMAEAttention.prune_heads*  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r,   Nr!   r   r   c                 d    |                      ||          \  }}|                     ||          }|S rJ   )r   r   )rT   r!   r   self_attn_outputrd   r   s         r-   rf   zVideoMAEAttention.forward<  s4    "nn]IFF!-}==r,   rJ   )r#   r$   r%   r   rL   r   rx   r   r'   r   r   rf   rh   ri   s   @r-   r   r   #  s        "~ " " " " " ";S ; ; ; ;$ U\ hu|>T `e`l        r,   r   c                   H     e Zd Zdef fdZdej        dej        fdZ xZS )VideoMAEIntermediaterR   c                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S rJ   )rK   rL   r   r   rP   intermediate_sizer   rt   
hidden_actstrr
   intermediate_act_fnrS   s     r-   rL   zVideoMAEIntermediate.__init__D  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r,   r!   r   c                 Z    |                      |          }|                     |          }|S rJ   )r   r   )rT   r!   s     r-   rf   zVideoMAEIntermediate.forwardL  s,    

=1100??r,   	r#   r$   r%   r   rL   r'   r   rf   rh   ri   s   @r-   r   r   C  sj        9~ 9 9 9 9 9 9U\ el        r,   r   c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZS )VideoMAEOutputrR   c                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S rJ   )
rK   rL   r   r   r   rP   r   r   r   r   rS   s     r-   rL   zVideoMAEOutput.__init__T  sJ    Yv79KLL
z&"<==r,   r!   r   r   c                 d    |                      |          }|                     |          }||z   }|S rJ   r   r   s      r-   rf   zVideoMAEOutput.forwardY  s4    

=11]33%4r,   r   ri   s   @r-   r   r   S  su        >~ > > > > > >
U\  RWR^        r,   r   c                   h     e Zd ZdZdef fdZd	dej        deej                 dej        fdZ	 xZ
S )
VideoMAELayerz?This corresponds to the Block class in the timm implementation.rR   c                 z   t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |          | _	        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )Nr   eps)rK   rL   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormrP   layer_norm_epslayernorm_beforelayernorm_afterrS   s     r-   rL   zVideoMAELayer.__init__d  s    '-'E$*622088$V,, "V-?VEZ [ [ [!|F,>FDYZZZr,   Nr!   r   r   c                     |                      |          }|                     ||          }||z   }|                     |          }|                     |          }|                     ||          }|S rJ   )r   r   r   r   r   )rT   r!   r   hidden_states_normattention_outputlayer_outputs         r-   rf   zVideoMAELayer.forwardn  sz    !22=AA>>*<iHH )=8 ++M::((66 {{<??r,   rJ   )r#   r$   r%   r&   r   rL   r'   r   r   rf   rh   ri   s   @r-   r   r   a  s        II[~ [ [ [ [ [ [ U\ hu|>T `e`l        r,   r   c                   Z     e Zd Zdef fdZddej        deej                 defdZ	 xZ
S )	VideoMAEEncoderrR   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r+   r   )r7   rd   rR   s     r-   r;   z,VideoMAEEncoder.__init__.<locals>.<listcomp>  s!    #c#c#caM&$9$9#c#c#cr,   F)	rK   rL   rR   r   
ModuleListr<   num_hidden_layerslayergradient_checkpointingrS   s    `r-   rL   zVideoMAEEncoder.__init__  s`    ]#c#c#c#c5IaCbCb#c#c#cdd
&+###r,   Nr!   r   r   c                     t          | j                  D ]\  }}|||         nd } |||          }t          |          S )Nlast_hidden_state)	enumerater  r   )rT   r!   r   ilayer_modulelayer_head_masks         r-   rf   zVideoMAEEncoder.forward  sW    (44 	I 	IOA|.7.CillO(LHHMM????r,   rJ   )r#   r$   r%   r   rL   r'   r   r   r   rf   rh   ri   s   @r-   r   r     s        ,~ , , , , , ,@ @U\ @hu|>T @`o @ @ @ @ @ @ @ @r,   r   c                   N    e Zd ZU eed<   dZdZdZddgZdZ	dZ
dZdZeedZd Zd	S )
VideoMAEPreTrainedModelrR   videomaer`   TrH   r   )r!   r"   c                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS dS )zInitialize the weightsr   )meanstdNg      ?)rt   r   r   ry   r   datanormal_rR   initializer_ranger   zero_r   fill_)rT   r   s     r-   _init_weightsz%VideoMAEPreTrainedModel._init_weights  s    fry")455 	* M&&CT[5R&SSS{& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r,   N)r#   r$   r%   r   r)   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr  r+   r,   r-   r  r    sw         "$O&*#-?N"&&+ 

* 
* 
* 
* 
*r,   r  c                        e Zd Z fdZd Zd Z ed          e	 	 ddej	        de
ej                 d	e
ej                 d
ee         def
d                        Z xZS )VideoMAEModelc                 8   t                                          |           || _        t          |          | _        t          |          | _        |j        rd | _        n%t          j
        |j        |j                  | _        |                                  d S )Nr   )rK   rL   rR   rH   rb   r   encoderuse_mean_pooling	layernormr   r   rP   r   	post_initrS   s     r-   rL   zVideoMAEModel.__init__  s       ,V44&v.." 	Y!DNN\&*<&BWXXXDN 	r,   c                     | j         j        S rJ   )rb   rN   )rT   s    r-   get_input_embeddingsz"VideoMAEModel.get_input_embeddings  s    //r,   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr'  r  r   r   )rT   heads_to_pruner  r   s       r-   _prune_headszVideoMAEModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr,   F)tie_last_hidden_statesNr`   ra   r   r   r   c                    |                      || j        j                  }|                     ||          }|                     ||          }|j        }| j        |                     |          }t          |          S )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. If `None`, then all patches are considered. Sequence
            length is `(num_frames // tubelet_size) * (image_size // patch_size) ** 2`.

        Examples:

        ```python
        >>> import av
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEModel
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

        >>> # prepare video for the model
        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 1568, 768]
        ```r   Nr	  )get_head_maskrR   r  rb   r'  r
  r)  r   )rT   r`   ra   r   r   embedding_outputencoder_outputssequence_outputs           r-   rf   zVideoMAEModel.forward  s|    | &&y$+2OPP	??<II+/<<8HT]<+^+^);>%"nn_==OAAAAr,   )NN)r#   r$   r%   rL   r,  r0  r   r   r'   r(   r   
BoolTensorr   r   r   r   rf   rh   ri   s   @r-   r%  r%    s           0 0 0C C C u555 7;,0	eB eB'eB "%"23eB EL)	eB
 +,eB 
eB eB eB ^ 65eB eB eB eB eBr,   r%  c                   >     e Zd Zdef fdZdej        defdZ xZ	S )VideoMAEDecoderrR   c                 2   t                                                       |j        |j        z  |j        dz  z  }t          |          |j        _        |j        _	        |j
        _        |j        _        t          j        fdt!          |j                  D                       | _        t          j        |j                  | _        |dk    rt          j        |j        |          nt          j                    | _        d| _        | _        d S )Nr4   c                 .    g | ]}t                    S r+   r  )r7   rd   decoder_configs     r-   r;   z,VideoMAEDecoder.__init__.<locals>.<listcomp>>  s!    \\\q]>**\\\r,   r   F)rK   rL   re   rs   rq   r   decoder_hidden_sizerP   decoder_num_hidden_layersr  decoder_num_attention_headsr   decoder_intermediate_sizer   r   r  r<   decoder_layersr   normr   Identityheadr  rR   )rT   rR   decoder_num_labelsr=  rU   s      @r-   rL   zVideoMAEDecoder.__init__3  s   #063FFIZ\]I]]!&))%+%?"+1+K(-3-O*+1+K( m\\\\E&:Z4[4[\\\
 
 L!;<<	I[^_I_I_BIf02DEEEegeperer 		 ',#$r,   r!   return_token_numc                     | j         D ]} ||d           }|dk    r|d d | d f         }|                     |          }|                     |          }t          |          S )Nr3  r   )r    )rB  rC  rE  r   )rT   r!   rG  r  r    s        r-   rf   zVideoMAEDecoder.forwardI  s     / 	H 	HL(L$GGGMMa)!!!.>->-?-?*?@M 		-00=))$F3333r,   )
r#   r$   r%   r   rL   r'   r   rx   rf   rh   ri   s   @r-   r:  r:  2  sh        %~ % % % % % %,4U\ 4S 4 4 4 4 4 4 4 4r,   r:  zb
    The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.
    c                        e Zd Z fdZee	 d	dej        dej        de	ej
                 dee         def
d                        Z xZS )
VideoMAEForPreTrainingc                    t                                          |           || _        t          |          | _        t          j        |j        |j        d          | _	        t          j
        t          j        dd|j                            | _        t          | j        j        j        |j                  | _        t%          |          | _        |                                  d S )NFr   r   )rK   rL   rR   r%  r  r   r   rP   r>  encoder_to_decoderr   r'   r   
mask_tokenrF   rb   rO   rQ   r:  decoderr*  rS   s     r-   rL   zVideoMAEForPreTraining.__init__^  s       %f--"$)F,>@Zaf"g"g"g,u{1a9S'T'TUU#>M$0&2L$
 $
  'v.. 	r,   Nr`   ra   r   r   r   c                 D    | j         |f||d|}|j        }|                     |          }|j        \  }}}	|t	          d          | j                            |dd                              |          }
|
                                	                    |j
        d          }
|
|                              |d|	          }|
|                             |d|	          }t          j        ||z   | j        |z   gd          }|                     ||j        d                   }|j        }d}t          j                    5  | j        j        d	k    r|}n|j
        }|j        }t          j        t,                    	                    ||
          ddddddf         }t          j        t.                    	                    ||
          ddddddf         }||z  |z   }|j        \  }}}	}}| j        j        | j        j        }}| j        j        r|                    |||z  ||	||z  |||z  |          }|                    dddddddd	                                          }|                    |||z  |z  |z  |z  |z  ||z  |z  |	          }||                    dd          z
  |                    ddd                                           dz   z  }|                    |||z  |z  |z  |z  |z  ||z  |z  |	z            }n| j        j        d	k    rt	          d          |                    |||z  ||	||z  |||z  |          }|                    dddddddd	                                          }|                    |||z  |z  |z  |z  |z  ||z  |z  |	z            }|j        \  }}}	||                             |d|	          }ddd           n# 1 swxY w Y   tC                      } |||          }tE          |||j#        |j$                  S )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. Sequence length is `(num_frames // tubelet_size) *
            (image_size // patch_size) ** 2`.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, VideoMAEForPreTraining
        >>> import numpy as np
        >>> import torch

        >>> num_frames = 16
        >>> video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")

        >>> pixel_values = image_processor(video, return_tensors="pt").pixel_values

        >>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
        >>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
        >>> bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss = outputs.loss
        ```)ra   r   Nz!One must provided a boolean mask rZ   TrW   r   r   r	   )rX   r   r   r}      r4         r   )r   keepdim)r   unbiasedrS  gư>zQCan't unnormalize non-RGB images. Consider setting config.norm_pix_loss to False.r0   r    r!   r"   )%r  r
  rL  r^   r~   rQ   expandr\   r[   r]   rX   r_   r'   catrM  rN  r    no_gradrR   re   r   	as_tensorr   r   rs   rq   norm_pix_lossr   r   r   r  varsqrtr   r/   r!   r"   )rT   r`   ra   r   r   outputsr7  rc   rd   re   expanded_position_embeddingspos_emb_visiblepos_emb_maskx_fulldecoder_outputsr    r0   framesrX   r   r  r  timer   r   rs   rq   frames_normvideos_patchlabelsloss_fcts                                  r-   rf   zVideoMAEForPreTraining.forwardo  sN   H $14=$
*9Y$
 $
RX$
 $
 "311/BB '6&;#
A| "@AAA'+'?'F'FzSUWY'Z'Z'b'bco'p'p$'C'J'J'L'L'O'OWcWjqu'O'v'v$67GHPPQ[]_amnn3ODLLZY[]ijj Oo=tQ]?]^defff 26flFXYZF[1\1\ ']__ H	Y H	Y{'1,,% &,$*'<==@@V[@\\]acgijijijlprv]vwo&:;;>>fTY>ZZ[_aeghghghjnpt[tu%+d2<BL9JlFE'+{'?AW*L{( 6L(  j(Z'	 	  1aAq!Q??JJLLL(61Z?%G:U :-
: 	   &D(I(IIJJ2dJCCHHJJTQ  +//L(61Z?%G:U :-
:\I    ;+q00$k    L(  j(Z'	 	  1aAq!Q??JJLL%{{L(61Z?%G:U :-
:\I    +7*<'J<!/2:::r<XXFQH	Y H	Y H	Y H	Y H	Y H	Y H	Y H	Y H	Y H	Y H	Y H	Y H	Y H	Y H	YT 99x''+!/)	
 
 
 	
s   JO  O$'O$rJ   )r#   r$   r%   rL   r   r   r'   r(   r8  r   r   r   r   r/   rf   rh   ri   s   @r-   rJ  rJ  X  s            " 
 -1	O
 O
'O
 )O
 EL)	O

 +,O
 
&O
 O
 O
 ^ O
 O
 O
 O
 O
r,   rJ  z
    VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
    states of all tokens) e.g. for ImageNet.
    c                        e Zd Z fdZee	 	 	 d	deej                 deej                 deej                 de	e
         def
d                        Z xZS )
VideoMAEForVideoClassificationc                    t                                          |           |j        | _        t          |          | _        |j        rt          j        |j                  nd | _	        |j        dk    rt          j
        |j        |j                  nt          j                    | _        |                                  d S )Nr   )rK   rL   
num_labelsr%  r  r(  r   r   rP   fc_normr   rD  
classifierr*  rS   s     r-   rL   z'VideoMAEForVideoClassification.__init__
  s        +%f-- <B;R\r|F$6777X\NTN_bcNcNc")F$68IJJJikitiviv 	r,   Nr`   r   rg  r   r   c                 B    | j         |fd|i|}|j        }| j        +|                    d          }|                     |          }n|dddf         }|                     |          }d}	| | j        ||| j        fi |}	t          |	||j        |j	                  S )a!  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import av
        >>> import torch
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEForVideoClassification
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
        >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ...     logits = outputs.logits

        >>> # model predicts one of the 400 Kinetics-400 classes
        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])
        eating spaghetti
        ```r   Nr   r   rU  )
r  r
  rm  r  rn  loss_functionrR   r   r!   r"   )
rT   r`   r   rg  r   r]  r7  r   r    r0   s
             r-   rf   z&VideoMAEForVideoClassification.forward  s    x $14=#]#]#]V\#]#]!3<#$))!,,F\\&))FF$QQQT*F((%4%ffdkLLVLLD$!/)	
 
 
 	
r,   )NNN)r#   r$   r%   rL   r   r   r   r'   r   r   r   r   rf   rh   ri   s   @r-   rj  rj    s              04,0)-	n
 n
u|,n
 EL)n
 &	n

 +,n
 
n
 n
 n
 ^ n
 n
 n
 n
 n
r,   rj  )rJ  r%  r  rj  )r   )Er&   collections.abcru   rY   r   dataclassesr   typingr   r   numpyr5   r'   r   torch.nnr   activationsr
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.constantsr   r   utils.genericr   r   configuration_videomaer   
get_loggerr#   loggerr   r/   rF   ModulerH   rM   r   floatr   r   r   r   r   r   r   r   r  r%  r:  rJ  rj  __all__r+   r,   r-   <module>r     s   3 2           ! ! ! ! ! ! % % % % % % % %                  ! ! ! ! ! ! 9 9 9 9 9 9 F F F F F F F F F F F F F F F F & & & & & & Q Q Q Q Q Q Q Q M M M M M M M M M M M M J J J J J J J J A A A A A A A A 2 2 2 2 2 2 
	H	%	%   
: : : : :K : :  :   
: : : : :; : :  : : : :       B2 2 2 2 2bi 2 2 2z % %I%<% 
% <	%
 U\*% % % % % %<9. 9. 9. 9. 9.BI 9. 9. 9.z       $    	   @    29    
 
 
 
 
RY 
 
 
    .   >@ @ @ @ @bi @ @ @ * * * * *o * * *8 BB BB BB BB BB+ BB BB BBJ#4 #4 #4 #4 #4bi #4 #4 #4L   
c
 c
 c
 c
 c
4 c
 c
 
c
L   ~
 ~
 ~
 ~
 ~
%< ~
 ~
 ~
B s
r
rr,   