
    Pi4                         d dl Z d dlmZ d dlZd dlmZ d dlmZmZ d dlm	Z	  e j
        e          Z G d dej                  ZdS )    N)Optional)nn)	_MaskType_sdpa_or_flex_attention)KVCachec            !           e Zd ZdZdddddddddeded	ed
edej        dej        dej        dej        deej                 deej                 deej                 dee         dede	de
ddf  fdZdedej        deddfdZd Z	 d"ddddej        deej                 dee         d eej                 dej        f
d!Z xZS )#MultiHeadAttentionu/  Multi-headed attention layer with support for grouped query
    attention (GQA) introduced in https://arxiv.org/abs/2305.13245v1.

    GQA is a version of multiheaded attention (MHA) which uses fewer
    key/value heads than query heads by grouping n query heads for each
    key and value head. Multi-Query Attention is an extreme
    version where we have a single key and value head shared by all
    query heads.

    Following is an example of MHA, GQA and MQA with num_heads = 4

    (credit for the documentation:
    `litgpt.Config <https://github.com/Lightning-AI/litgpt/blob/eda1aaaf391fd689664f95487ab03dc137e213fd/litgpt/config.py>`_).


    ::

        ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
        │ v ││ v ││ v ││ v │     │ v │    │ v │             │ v │
        └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
        │    │    │    │         │        │                 │
        ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
        │ k ││ k ││ k ││ k │     │ k │    │ k │             │ k │
        └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
        │    │    │    │      ┌──┴──┐  ┌──┴──┐      ┌────┬──┴─┬────┐
        ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐
        │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │
        └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘
        ◀──────────────────▶  ◀──────────────────▶  ◀──────────────────▶
                MHA                    GQA                   MQA
        n_kv_heads =4          n_kv_heads=2           n_kv_heads=1

    Args:
        embed_dim (int): embedding dimension for the model
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        num_kv_heads (int): number of key and value heads. User should ensure
            ``num_heads % num_kv_heads == 0``. For standard MHA set ``num_kv_heads == num_heads``,
            for GQA ``num_kv_heads < num_heads``, and for MQA set ``num_kv_heads == 1``.
        head_dim (int): dimension of each head, calculated by ``embed_dim // num_heads``.
        q_proj (nn.Module): projection layer for query.
        k_proj (nn.Module): projection layer for key.
        v_proj (nn.Module): projection layer for value.
        output_proj (nn.Module): projection layer for output.
        pos_embeddings (Optional[nn.Module]): positional embeddings layer, e.g. RotaryPositionalEmbeddings.
        q_norm (Optional[nn.Module]): normalization layer for query, e.g. RMSNorm. For decoding, this is applied
            before updating from kv_cache. This means it will only support token wide normalization and not
            batch or sequence wide normalization.
        k_norm (Optional[nn.Module]): normalization layer for key, must be set if q_norm is.
        kv_cache (Optional[KVCache]): KVCache object used to cache key and value
        max_seq_len (int): maximum sequence length supported by the model.
            This is needed to compute the RoPE Cache. Default: 4096.
        is_causal (bool): sets the default mask to causal when no mask is provided
        attn_dropout (float): dropout value passed onto the scaled_dot_product_attention function.
            Default value is 0.0.

    Raises:
        ValueError:
            If ``num_heads % num_kv_heads != 0``, **or**
            if ``embed_dim % num_heads != 0``, **or**
            if ``attn_dropout < 0`` or ``attn_dropout > 1``, **or**
            if q_norm is defined without k_norm or vice versa
    Ni   T        )pos_embeddingsq_normk_normkv_cachemax_seq_len	is_causalattn_dropout	embed_dim	num_headsnum_kv_headshead_dimq_projk_projv_projoutput_projr   r   r   r   r   r   r   returnc                d   t                                                       ||z  dk    rt          d| d| d          ||z  dk    rt          d| d| d          |dk     s|dk    rt          d| d	          t          |
          t          |          z  rt          d
          || _        || _        || _        || _        || _        || _	        || _
        || _        || _        || _        || _        || _        |
| _        || _        |	| _        t'                      | _        d| _        d S )Nr   znum_heads (z%) must be divisible by num_kv_heads ()zembed_dim (z") must be divisible by num_heads (   zattn_dropout (z) must be between 0.0 and 1.0z!q and k norm must be set togetherF)super__init__
ValueErrorboolr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _attention_callcache_enabled)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	__class__s                   o/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/modules/attention.pyr   zMultiHeadAttention.__init__S   s   & 	|#q((1i 1 1!-1 1 1  
 y A%%+i + +'+ + +  
 !|a//ViVVVWWW<<$v,,& 	B@AAA #("( &" !&,  788
 #    
batch_sizedtypec                     | j         t                              d           dS t          ||| j        | j        |          | _         d| _        dS )aQ  Setup key value caches for attention calculation. If called
        after kv_cache is already setup, this will be skipped.

        Args:
            batch_size (int): batch size for the caches.
            dtype (torch.dtype): dtype for the caches.
            max_seq_len (int): maximum sequence length model will be run with.
        NzWKey value caches are already setup. You cannot call ``setup_caches()`` twice. Skipping.)r(   r   r   r   r)   T)r   loggerwarningr   r   r   r#   )r$   r(   r)   r   s       r&   setup_cachezMultiHeadAttention.setup_cache   sk     =$NNi     $%'!.  DM "&Dr'   c                 d    | j         t          d          | j                                          dS )zReset the key value caches.Nz>Key value caches are not setup. Call ``setup_caches()`` first.)r   RuntimeErrorreset)r$   s    r&   reset_cachezMultiHeadAttention.reset_cache   s;    = P   	r'   )mask	input_posxyr2   r3   c          	         |j         \  }}}||j         d         nd}|                     |          }	| j        | j        z  }
|	                    ||| j        |
z  | j                  }	| j        |                     |	|          }	|	                    dd          }	| j        |                     |	          }	|6| j	        | j
        st          d          | j	        j        }| j	        j        }n|                     |          }|                     |          }|                    ||d| j                  }|                    ||d| j                  }| j        |                     ||          }|                    dd          }|                    dd          }| j        |                     |          }| j	        %| j
        r| j	                            ||          \  }}| j        | j        k    r|| j        |
d| j        f}|                    d                              |                              dd          }|                    d                              |                              dd          }|                     |	|||| j        r| j        nd| j	        du o
|du o| j        	          }|                    dd                                                              ||d          }|                     |          S )
a  
        Args:
            x (torch.Tensor): input tensor with shape [b x s_x x d] for the query
            y (Optional[torch.Tensor]): second input tensor with shape [b x s_y x d], is the input
                for k and v. For self attention, x=y. Optional only with kv_cache enabled.
            mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication
                and before the softmax. Either:

                A boolean tensor with shape ``[b x s x s]``, ``[b x s x self.encoder_max_cache_seq_len]``,
                or ``[b x s x self.decoder_max_cache_seq_len]`` if using KV-cacheing with encoder/decoder layers.
                A value of True in row ``i`` and column ``j`` means token ``i`` attends to token ``j``. A value of False means
                token ``i`` does not attend to token ``j``. If no mask is specified, a causal mask
                is used by default.

                A :class:`~torch.nn.attention.flex_attention.BlockMask` for document masking in a packed sequence
                created via `create_block_mask <https://pytorch.org/blog/flexattention/#mask-mods>`_. We  use
                :func:`~torch.nn.attention.flex_attention.flex_attention` when computing attention with block masks.
                Default is None.
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token. During training, this is used to indicate the positions
                of each token relative to its sample when packed, shape [b x s].
                During inference, this indicates the position of the current token.
                If none, assume the index of the token is its position id. Default is None.

        Raises:
            ValueError: If no ``y`` input and ``kv_cache`` is not enabled.

        Returns:
            torch.Tensor: output tensor with attention applied

        Notation used for tensor shapes:
            - b: batch size
            - s_x: sequence length for x
            - s_y: sequence length for y
            - n_h: num heads
            - n_kv: num kv heads
            - d: embed dim
            - h_d: head dim
        Nr   r   )r3      zAMust provide y input or use kv_cache to enable streaming decodingr
   )r2   	dropout_pr   )shaper   r   r   viewr   r   	transposer   r   r#   r    k_cachev_cacher   r   r   update	unsqueezeexpandflattenr"   trainingr   r   
contiguousr   )r$   r4   r5   r2   r3   bs_x_s_yqq_per_kvkvexpand_shapeoutputs                  r&   forwardzMultiHeadAttention.forward   s   b G	3Magajjq KKNN >T%66FF1c4,x7GG *##A#;;A KK1 ;"AA9}$D,>$ W   %A%AA
 AAAA q#r4=11Aq#r4=11A".''Y'?? Aq!!AAq!!A {&KKNN }(T-?(}++Aq111
 >T...t0(BNLA%%l33;;AqAAAA%%l33;;AqAAA%%+/=Ad''cmt+OO & 
 
 !!!Q''224499!S"EE'''r'   )N)__name__
__module____qualname____doc__intr   Moduler   r   r!   floatr   torchr)   r-   r1   Tensorr   rO   __classcell__)r%   s   @r&   r	   r	      s       > >V /3&*&*&*!#?# ?# ?# ?# 	?#
 ?# ?# 	?# 	?# 	?# Y?# !+?# #?# #?# 7#?# ?#  !?#" #?#$ 
%?# ?# ?# ?# ?# ?#B&&&+k&@C&	& & & &4   %)z(
 %),0z( z( z(<z( EL!z(
 y!z( EL)z( 
z( z( z( z( z( z( z( z(r'   r	   )loggingtypingr   rW   r   !torchtune.modules.attention_utilsr   r   torchtune.modules.kv_cacher   	getLoggerrP   r+   rU   r	    r'   r&   <module>r`      s                  P P P P P P P P . . . . . .		8	$	$^( ^( ^( ^( ^( ^( ^( ^( ^( ^(r'   