
    Pi*                     v    d dl mZmZ d dlZd dlmZ  G d dej                  Z G d dej                  ZdS )    )AnyOptionalN)nnc            	            e Zd ZdZ	 	 ddedededdf fd	Zd
 ZddeddfdZdddej	        de
ej	                 dej	        fdZ xZS )RotaryPositionalEmbeddingsa,  
    This class implements Rotary Positional Embeddings (RoPE)
    proposed in https://arxiv.org/abs/2104.09864.

    Reference implementation (used for correctness verfication)
    can be found here:
    https://github.com/meta-llama/llama/blob/main/llama/model.py#L80

    In this implementation we cache the embeddings for each position upto
    ``max_seq_len`` by computing this during init.

    Args:
        dim (int): Embedding dimension. This is usually set to the dim of each
            head in the attention module computed as ``embed_dim // num_heads``
        max_seq_len (int): Maximum expected sequence length for the
            model, if exceeded the cached freqs will be recomputed
        base (int): The base for the geometric progression used to compute
            the rotation angles
       '  dimmax_seq_lenbasereturnNc                     t                                                       || _        || _        || _        |                                  d S N)super__init__r
   r   r   	rope_init)selfr
   r   r   	__class__s       y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/modules/position_embeddings.pyr   z#RotaryPositionalEmbeddings.__init__"   sG     		&    c                 
   d| j         t          j        d| j        d          d | j        dz                                           | j        z  z  z  }|                     d|d           |                     | j                   d S Ng      ?r      thetaF
persistent)r   torcharanger
   floatregister_bufferbuild_rope_cacher   r   r   s     r   r   z$RotaryPositionalEmbeddings.rope_init.   s    IQ!,,->A->?EEGG$(RT
 	We>>>d./////r   c                 ^   t          j        || j        j        | j        j                  }t          j        d|| j                                                  }t          j        t          j        |          t          j	        |          gd          }| 
                    d|d           d S )Ndtypedevice
i, j -> ijr
   cacheFr   )r   r   r   r%   r&   einsumr   stackcossinr    )r   r   seq_idx	idx_thetar*   s        r   r!   z+RotaryPositionalEmbeddings.build_rope_cache6   s    ,tz/
8I
 
 
 Lw
CCIIKK	 UYy1159Y3G3GHbQQQWe>>>>>r   )	input_posxr1   c                L   |                     d          }|| j        d|         n| j        |         } |                                j        g |j        dd         ddR  }|                    d|                     d          d|                     d          d          }t          j        |d         |d         z  |d         |d         z  z
  |d         |d         z  |d         |d         z  z   gd          }|                    d          }|	                    |          S )a#  
        Args:
            x (torch.Tensor): input tensor with shape
                ``[b, s, n_h, h_d]``
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token. During training, this is used to indicate the positions
                of each token relative to its sample when packed, shape [b, s].
                During inference, this indicates the position of the current token.
                If none, assume the index of the token is its position id. Default is None.

        Returns:
            torch.Tensor: output tensor with shape ``[b, s, n_h, h_d]``

        Notation used for tensor shapes:
            - b: batch size
            - s: sequence length
            - n_h: num heads
            - h_d: head dim
           Nr(   r      .r   .r4   )
sizer*   r   reshapeshapeviewr   r,   flattentype_as)r   r2   r1   seq_len
rope_cachexshapedx_outs          r   forwardz"RotaryPositionalEmbeddings.forwardE   s.   . &&)) %.$5DJxx  4:i;P 	 $!''))#9QWSbS\929q999
  __Ra!W\\!__aPP
 *V"44&/Jv$667*V"44&/Jv$667 
 
 a  }}Qr   )r   r	   )r   )__name__
__module____qualname____doc__intr   r   r!   r   Tensorr   rB   __classcell__r   s   @r   r   r      s         .  	
 

 
 	

 

 
 
 
 
 
0 0 0? ?C ?4 ? ? ? ?  GK5  5  5 5 -5el-C5 	5  5  5  5  5  5  5  5 r   r   c                   |     e Zd ZdZ	 	 ddededededed	ed
df fdZd ZddZde	j
        ded
e	j
        fdZ xZS ) VisionRotaryPositionalEmbeddingsa  
    This class implements two-dimensional Rotary Positional Embeddings (RoPE) for images
    based on the axial frequency 2D RoPE described in https://arxiv.org/pdf/2403.13298.

    The position embedding is simply applied to the x-axis and y-axis separately, encoding
    the x and y position of each patch within every tile.. The embedding is applied to each
    tile identically.

    Note: This module assumes the CLS token embedding is appended at the end of the sequence.

    Args:
        patch_size (int): The size of each patch. Used to divide the tiles into patches.
            E.g. for ``patch_size=40``, a tile of shape (400, 400) will have 10x10 grid of patches.
        tile_size (int): The size of your image tiles, if the image was tile-cropped in advance. Otherwise,
            the size of the full input image. In this case, the function will consider your image as a single tile.
        max_num_tiles (int): The maximum number of tiles in the image. This is used to unfold the input sequence
            length into sequence length per tile so RoPE can be applied to each tile separately.
        dim (int): Embedding dimension. Unlike :class:`~torchtune.modules.RotaryPositionalEmbeddings`, this is
            usually set to the dim of each head in the attention module divided by 2, computed as
            ``embed_dim // num_heads // 2``. The divide by 2 accounts for x and y positions.
        base (int): The base for the geometric progression used to compute
            the rotation angles
        append_cls_token (bool): Set to True if CLS token embedding is at the end of the sequence in the vision transformer,
            False if is in the beginning of the sequence. RoPE is zeroed out for the CLS token. Default is True.
    r	   T
patch_size	tile_sizemax_num_tilesr
   r   append_cls_tokenr   Nc                     t                                                       ||z  | _        || _        || _        || _        || _        |                                  d S r   )r   r   patch_grid_sizerO   r
   r   rP   r   )r   rM   rN   rO   r
   r   rP   r   s          r   r   z)VisionRotaryPositionalEmbeddings.__init__   s\     	(J6*	 0r   c                     d| j         t          j        d| j        d          d | j        dz                                           | j        z  z  z  }|                     d|d           |                                  d S r   )r   r   r   r
   r   r    r!   r"   s     r   r   z*VisionRotaryPositionalEmbeddings.rope_init   s    IQ!,,->A->?EEGG$(RT
 	We>>>r   c           	      z   | j         dz  }t          j        || j        j        | j        j                  }| j        r9t          j        |dt          j        d|j        |j                  z  g          }n8t          j        dt          j        d|j        |j                  z  |g          }|| j         z  }|| j         z  }t          j	        d|dz   | j                  
                                }t          j	        d|dz   | j                  
                                }t          j        ||gd          }|                    |                    d          dk     d          }t          j        t          j        |          t          j        |          gd          }|                     d|d	
           d S )Nr   r$   r(   r4   r'   r)   r   r*   Fr   )rR   r   r   r   r%   r&   rP   catonesr+   r   masked_fill	unsqueezer,   r-   r.   r    )	r   patches_per_tile	patch_idxpatch_x_pospatch_y_posx_thetay_thetafreqsr*   s	            r   r!   z1VisionRotaryPositionalEmbeddings.build_rope_cache   s   /2LDJ$4TZ=N
 
 
	   		AY_YEUVVVV II 	AY_YEUVVVV I  $"664#77 ,|[1_djIIOOQQ,|[1_djIIOOQQ 	7G,"555!!)"5"5b"9"9A"=qAA UYu--uy/?/?@bIIIWe>>>>>r   r2   kwargsc                    |j         \  }}}}|                                                    || j        d||dz  d          }|                    d          }|| j        j         d         k    r&t          d| d| j        j         d          d          | j                            dd|d|dz  d          }	t          j	        |d         |	d         z  |d	         |	d	         z  z
  |d	         |	d         z  |d         |	d	         z  z   gd          }
|
                    || j        |z  ||          }
|

                    |          S )
a  
        Args:
            x (torch.Tensor): input tensor with shape ``[b, s, n_h, h_d]``
            **kwargs (Any): additional keyword arguments. This is kept to match the forward signature of
                :class:`~torchtune.modules.RotaryPositionalEmbeddings`.

        Returns:
            torch.Tensor: output tensor with shape ``[b, s, n_h, h_d]``

        Raises:
            ValueError: if sequence length of input tensor does not match the 2D RoPE cache's sequence length

        Notation used for tensor shapes:
            - b: batch size
            - s: sequence length
            - n_h: num heads
            - h_d: head dim
        r(   r   r   zInput sequence length z. does not match 2D RoPE cache sequence length .r4   r6   r7   )r:   r   r9   rO   r8   r*   
ValueErrorr;   r   r,   r=   )r   r2   r`   bsz_n_hh_dr@   r>   r?   rA   s              r   rB   z(VisionRotaryPositionalEmbeddings.forward   s]   . 7QS ''))##C);RcQhPQRR,,q//dj&q)))vvv`d`j`pqr`svvv  
 Z__Q7AsaxCC
 *V"44&/Jv$667*V"44&/Jv$667 
 
 c4#5#?cJJ}}Qr   )r	   T)r   N)rC   rD   rE   rF   rG   boolr   r   r!   r   rH   r   rB   rI   rJ   s   @r   rL   rL   }   s         @ !%   	
    
     "     &? &? &? &?P5 <5  5  
	5  5  5  5  5  5  5  5 r   rL   )typingr   r   r   r   Moduler   rL    r   r   <module>rl      s    !                     m  m  m  m  m  m  m  m `Q  Q  Q  Q  Q ry Q  Q  Q  Q  Q r   