
    PiE                     0   d dl Z d dlmZ d dlZd dlmc mZ d dlmZ d dl	m
Z d dlmZmZ d dlm
Z
  e j        e          Z G d dej                  Z G d d	ej                  Zd
ej        j        ddfdZd
ej        j        dej        j        fdZdS )    N)Optional)nn)KVCache)	_MaskType_sdpa_or_flex_attentionc            !       x    e Zd ZdZdddddddddeded	ed
edej        dej        dej        dej        deej                 deej                 deej                 dee         dede	de
ddf  fdZdedej        deddfdZd Zddddej        dej        dee         d eej                 dej        f
d!Z xZS )"MultiHeadAttentionuN  
    NOTE: torch.export.export() friendly MultiHeadAttention, modified from
    torchtune.modules.attention.MultiHeadAttention
    Major differences:
    - Rewrite `if y is None` to torch.cond().
      - Logic becomes `if all values of y are NaN`, to make torch.compile() happy.
      - No input mutations in both false and true branches, so we need to copy kv
        values back into kv cache after torch.cond().
    - Added a SDPA module
      - SDPA module includes transpose and expanding kv dimensions.
      - Makes it easy to swap with custom SDPAs that are needed by the users of exported
        program.
    - Uses new kv cache
      - This potentially can be merged with torchtune.modules.kv_cache.
      - Changed += to .add_ to avoid mutating module attributes.
      - Added clone() method.

    Multi-headed attention layer with support for grouped query
    attention (GQA) introduced in https://arxiv.org/abs/2305.13245v1.

    GQA is a version of multiheaded attention (MHA) which uses fewer
    key/value heads than query heads by grouping n query heads for each
    key and value head. Multi-Query Attention is an extreme
    version where we have a single key and value head shared by all
    query heads.

    Following is an example of MHA, GQA and MQA with num_heads = 4

    (credit for the documentation:
    `litgpt.Config <https://github.com/Lightning-AI/litgpt/blob/eda1aaaf391fd689664f95487ab03dc137e213fd/litgpt/config.py>`_).


    ::

        ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
        │ v ││ v ││ v ││ v │     │ v │    │ v │             │ v │
        └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
        │    │    │    │         │        │                 │
        ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
        │ k ││ k ││ k ││ k │     │ k │    │ k │             │ k │
        └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
        │    │    │    │      ┌──┴──┐  ┌──┴──┐      ┌────┬──┴─┬────┐
        ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐
        │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │
        └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘
        ◀──────────────────▶  ◀──────────────────▶  ◀──────────────────▶
                MHA                    GQA                   MQA
        n_kv_heads =4          n_kv_heads=2           n_kv_heads=1

    Args:
        embed_dim (int): embedding dimension for the model
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        num_kv_heads (int): number of key and value heads. User should ensure
            ``num_heads % num_kv_heads == 0``. For standard MHA set ``num_kv_heads == num_heads``,
            for GQA ``num_kv_heads < num_heads``, and for MQA set ``num_kv_heads == 1``.
        head_dim (int): dimension of each head, calculated by ``embed_dim // num_heads``.
        q_proj (nn.Module): projection layer for query.
        k_proj (nn.Module): projection layer for key.
        v_proj (nn.Module): projection layer for value.
        output_proj (nn.Module): projection layer for output.
        pos_embeddings (Optional[nn.Module]): positional embeddings layer, e.g. RotaryPositionalEmbeddings.
        q_norm (Optional[nn.Module]): normalization layer for query, e.g. RMSNorm. For decoding, this is applied
            before updating from kv_cache. This means it will only support token wide normalization and not
            batch or sequence wide normalization.
        k_norm (Optional[nn.Module]): normalization layer for key, must be set if q_norm is.
        kv_cache (Optional[KVCache]): KVCache object used to cache key and value
        max_seq_len (int): maximum sequence length supported by the model.
            This is needed to compute the RoPE Cache. Default: 4096.
        is_causal (bool): sets the default mask to causal when no mask is provided
        attn_dropout (float): dropout value passed onto the scaled_dot_product_attention function.
            Default value is 0.0.

    Raises:
        ValueError: If ``num_heads % num_kv_heads != 0``
        ValueError: If ``embed_dim % num_heads != 0``
        ValueError: If ``attn_dropout < 0`` or ``attn_dropout > 1``
        ValueError: if q_norm is defined without k_norm or vice versa
    Ni   T        )pos_embeddingsq_normk_normkv_cachemax_seq_len	is_causalattn_dropout	embed_dim	num_headsnum_kv_headshead_dimq_projk_projv_projoutput_projr   r   r   r   r   r   r   returnc          	         t                                                       ||z  dk    rt          d| d| d          ||z  dk    rt          d| d| d          |dk     s|dk    rt          d| d	          t          |
          t          |          z  rt          d
          || _        || _        || _        || _        || _        || _	        || _
        || _        || _        || _        || _        || _        |
| _        || _        |	| _        t'                      | _        t+          | j        | j        | j        | j        r| j        nd| j
        | j        | j                  | _        d| _        d S )Nr   znum_heads (z%) must be divisible by num_kv_heads ()zembed_dim (z") must be divisible by num_heads (   zattn_dropout (z) must be between 0.0 and 1.0z!q and k norm must be set togetherr
   )r   r   r   r   r   attention_fnr   F)super__init__
ValueErrorboolr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _attention_callSDPAtraining_sdpacache_enabled)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	__class__s                   w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/modules/_export/attention.pyr    zMultiHeadAttention.__init__f   s   & 	|#q((1i 1 1!-1 1 1  
 y A%%+i + +'+ + +  
 !|a//ViVVVWWW<<$v,,& 	B@AAA #("( &" !&,  788*n].2mD**n-]
 
 

 #    
batch_sizedtypec                     | j         t                              d           dS t          ||| j        | j        |d          | _         | j         | j        _         d| _        dS )aQ  Setup key value caches for attention calculation. If called
        after kv_cache is already setup, this will be skipped.

        Args:
            batch_size (int): batch size for the caches.
            dtype (torch.dtype): dtype for the caches.
            max_seq_len (int): maximum sequence length model will be run with.
        NzWKey value caches are already setup. You cannot call ``setup_caches()`` twice. Skipping.F)r,   r   r   r   r-   transpose_cacheT)r   loggerwarningInferenceKVCacher   r   r&   r'   )r(   r,   r-   r   s       r*   setup_cachezMultiHeadAttention.setup_cache   sz     =$NNi     -%'!. %  DM #'-DJ!%Dr+   c                 d    | j         t          d          | j                                          dS )zReset the key value caches.Nz>Key value caches are not setup. Call ``setup_caches()`` first.)r   RuntimeErrorreset)r(   s    r*   reset_cachezMultiHeadAttention.reset_cache   s;    = P   	r+   )mask	input_posxyr8   r9   c                L    |j         \  }}                     |          } j         j        z  }|                    | j        |z   j                  } j                             |          } j                             |          } fd fd}	 fd}
 j        |
J d             |          \  }}nt          j
        t          j        |                                                                          |	|
|f          \  }}} j        j                            |            j        j                            |            j        j                            |                                |||||          }                     |          S )a*  
        Args:
            x (torch.Tensor): input tensor with shape [b x s_x x d] for the query
            y (torch.Tensor): second input tensor with shape [b x s_y x d], is the input
                for k and v. For self attention, x=y. If all values are NaN, we read from kv cache.
            mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication
                and before the softmax. Either:

                A boolean tensor with shape ``[b x s x s]``, ``[b x s x self.encoder_max_cache_seq_len]``,
                or ``[b x s x self.encoder_max_cache_seq_len]`` if using KV-cacheing with encoder/decoder layers.
                A value of True in row ``i`` and column ``j`` means token ``i`` attends to token ``j``. A value of False means
                token ``i`` does not attend to token ``j``. If no mask is specified, a causal mask
                is used by default.

                A :class:`~torch.nn.attention.flex_attention.BlockMask` for document masking in a packed sequence
                created via `create_block_mask <https://pytorch.org/blog/flexattention/#mask-mods>`_. We  use
                :func:`~torch.nn.attention.flex_attention.flex_attention` when computing attention with block masks.
                Default is None.
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token. During training, this is used to indicate the positions
                of each token relative to its sample when packed, shape [b x s].
                During inference, this indicates the position of the current token.
                If none, assume the index of the token is its position id. Default is None.

        Returns:
            torch.Tensor: output tensor with attention applied

        Notation used for tensor shapes:
            - b: batch size
            - s_x: sequence length for x
            - s_y: sequence length for y
            - n_h: num heads
            - n_kv: num kv heads
            - d: embed dim
            - h_d: head dim
        Nr9   c                 b   | j         d         }                    |           }                    |           }|                    |dj                  }|                    |dj                  }j                            |          }j                            |          }||fS )Nr   r=   )shaper   r   viewr   r   r   )r;   s_ykvbr9   r(   s       r*   calculate_kvz0MultiHeadAttention.forward.<locals>.calculate_kv  s    '!*C AAAA q#r4=11Aq#r4=11A".''Y'?? {&KKNNa4Kr+   c                 ^    j                                         }|j        |j        |j        fS N)r   clonek_cachev_cache	cache_pos)r;   r   r(   s     r*   true_fnz+MultiHeadAttention.forward.<locals>.true_fn'  s,    }**,,H#X%5x7IIIr+   c                      |           \  }}j                                         }|                    ||           |j        |j        |j        fS rH   )r   rI   updaterJ   rK   rL   )r;   rC   rD   r   rF   r(   s       r*   false_fnz,MultiHeadAttention.forward.<locals>.false_fn+  sQ    <??DAq}**,,HOOAq!!!#X%5x7IIIr+   zAMust provide y input or use kv_cache to enable streaming decoding)r8   )r@   r   r   r   rA   r   r   r   r   torchcondisnanallitemrJ   copy_rK   rL   r&   r   )r(   r:   r;   r8   r9   s_x_qq_per_kvrM   rP   rC   rD   rL   outputrE   rF   s   `   `          @@r*   forwardzMultiHeadAttention.forward   s   \ G	3 KKNN >T%66FF1c4,x7GG *##A#;;A ;"AA	 	 	 	 	 	 	(	J 	J 	J 	J 	J	J 	J 	J 	J 	J 	J = R <??DAqq
 $jA""$$))++Wh OAq) M!''***M!''***M#)))444Aq!Q$77'''r+   )__name__
__module____qualname____doc__intr   Moduler   r   r"   floatr    rQ   r-   r3   r7   Tensorr   r\   __classcell__r)   s   @r*   r	   r	      s       N Nv /3&*&*&*!#H# H# H# H# 	H#
 H# H# 	H# 	H# 	H# YH# !+H# #H# #H# 7#H# H#  !H#" #H#$ 
%H# H# H# H# H# H#T&&&+k&@C&	& & & &8   %),0p( p( p(<p( <p(
 y!p( EL)p( 
p( p( p( p( p( p( p( p(r+   r	   c                        e Zd ZdZdedededededdf fd	Z	 dd
ej	        dej	        dej	        dedede
e         dej	        fdZ xZS )r$   zr
    TorchTune's SDPA which can be optimized and can be swapped
    out for a more efficient implementations.
    r   r   r   r   r   r   Nc                     t                                                       || _        || _        || _        | j        | j        z  | _        || _        || _        || _        || _	        d S rH   )
r   r    r   r   r   rZ   r   r   _attention_fnr   )	r(   r   r   r   r   r   r   r   r)   s	           r*   r    zSDPA.__init__M  sd     	(" $*;;(") r+   rY   rC   rD   bszseq_lenr8   c           	         |                     dd          }|                     dd          }|                     dd          }| j        | j        k    rdd| j        ddf}|                    d                              |                              dd          }|                    d                              |                              dd          }|                     ||||| j        | j	        d u o
|d u o| j
                  }|                     dd                                                              ||d          S )Nr      r?   )r8   	dropout_pr   )	transposer   r   rZ   	unsqueezeexpandflattenri   r   r   r   
contiguousrA   )	r(   rY   rC   rD   rj   rk   r8   expand_shaper[   s	            r*   r\   zSDPA.forwarda  s7    KK1KK1KK1 >T...DM2r:LA%%l33;;AqAAAA%%l33;;AqAAA##'mt+OO $ 
 
 1%%002277WbIIIr+   rH   )r]   r^   r_   r`   ra   rc   r"   r    rQ   rd   r   r   r\   re   rf   s   @r*   r$   r$   G  s         
!! ! 	!
 ! ! 
! ! ! ! ! !6 %)!J !J<!J <!J <	!J
 !J !J y!!J 
!J !J !J !J !J !J !J !Jr+   r$   moduler   c                 ~   |                                  D ]\  }}t          |t          j                  ryt	          | |t          |j        |j        |j        |j        |j	        |j
        |j        |j        |j        |j        |j        |j        |j        |j        |j                             t)          |           d S )N)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )named_children
isinstanceTorchTuneAttentionr	   setattrr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   replace_mha_with_inference_mha)ru   namechilds      r*   _replace_mha_with_inference_mhar~     s    ,,.. 2 2ee/BCC 	2"#o#o!&!3"^ < < < % 1#(#7 < <"^ % 1#o!&!3     , +5111112 2r+   c                 $    t          |            | S )z
    Replace TorchTune's MHA with an inference friendly version of MHA that
    separates out the inference-related parts for further optimization.
    )r~   )ru   s    r*   r{   r{     s    
 $F+++Mr+   )loggingtypingr   rQ   torchtune.modules.attentionmodules	attentionry   r   "torchtune.modules._export.kv_cacher   r2   !torchtune.modules.attention_utilsr   r   torchtune.modules.kv_cache	getLoggerr]   r0   rb   r	   r$   r~   r{    r+   r*   <module>r      sj           8 8 8 8 8 8 8 8 8       J J J J J J P P P P P P P P . . . . . .		8	$	$o( o( o( o( o( o( o( o(d	;J ;J ;J ;J ;J29 ;J ;J ;J|2EHO 2 2 2 2 2858? ux      r+   