
    Pi38                         d dl Z d dlmZ d dlZd dlmc mZ d dlmZ d dlm	Z	 d dl
mZ  e j        e          Z G d dej                  ZdS )    N)Optional)nn)	_MaskType)KVCachec            '           e Zd ZdZddddddddddd
ded	ed
ededej        dej        dej        dej        deej                 deej                 deej                 dee         dede	de
dee         dee
         dee         ddf& fdZdedej        deddfdZd Z	 d&ddd d!ej        d"eej                 d#ee         d$eej                 dej        f
d%Z xZS )'Gemma2Attentionaz
  
    Adapated from official Google Pytorch Implementation:
    https://github.com/google/gemma_pytorch/blob/80881c2e6e797ef1913a4a705d4b40394791cc58/gemma/model.py#L213
    to match torchtune style.
    A new attention had to be added since nn.functional.scaled_dot_product_attention does allow soft capping
    Args:
        embed_dim (int): embedding dimension for the model
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        num_kv_heads (int): number of key and value heads. User should ensure
            ``num_heads % num_kv_heads == 0``. For standard MHA set ``num_kv_heads == num_heads``,
            for GQA ``num_kv_heads < num_heads``, and for MQA set ``num_kv_heads == 1``.
        head_dim (int): dimension of each head, calculated by ``embed_dim // num_heads``.
        q_proj (nn.Module): projection layer for query.
        k_proj (nn.Module): projection layer for key.
        v_proj (nn.Module): projection layer for value.
        output_proj (nn.Module): projection layer for output.
        pos_embeddings (Optional[nn.Module]): positional embeddings layer, e.g. RotaryPositionalEmbeddings.
        q_norm (Optional[nn.Module]): normalization layer for query, e.g. RMSNorm. For decoding, this is applied
            before updating from kv_cache. This means it will only support token wide normalization and not
            batch or sequence wide normalization.
        k_norm (Optional[nn.Module]): normalization layer for key, must be set if q_norm is.
        kv_cache (Optional[KVCache]): KVCache object used to cache key and value
        max_seq_len (int): maximum sequence length supported by the model.
            This is needed to compute the RoPE Cache. Default: 4096.
        is_causal (bool): sets the default mask to causal when no mask is provided
        attn_dropout (float): dropout value passed onto the
            scaled_dot_product_attention function. This argument is ignored if the
            self.training is False. Default value is 0.0.
        sliding_window_size (Optional[int]): size of the sliding window if None no sliding window is applied
        softcapping (Optional[float]): capping value used for soft caping, if None no capping is performed
        query_pre_attn_scalar (Optional[int]): value used for pre attention normalisation, if None head_dim is used instead
    Raises:
        ValueError:
            If ``num_heads % num_kv_heads != 0``, **or**
            if ``embed_dim % num_heads != 0``, **or**
            if ``attn_dropout < 0`` or ``attn_dropout > 1``, **or**
            if ``q_norm`` is defined without k_norm or vice versa
    Ni   Tg        g      I@)
pos_embeddingsq_normk_normkv_cachemax_seq_len	is_causalattn_dropoutsliding_window_sizesoftcappingquery_pre_attn_scalar	embed_dim	num_headsnum_kv_headshead_dimq_projk_projv_projoutput_projr	   r
   r   r   r   r   r   r   r   r   returnc                   t                                                       ||z  dk    rt          d| d| d          ||z  dk    rt          d| d| d          |dk     s|dk    rt          d| d	          t          |
          t          |          z  rt          d
          || _        || _        || _        || _        || _        || _	        || _
        || _        || _        || _        || _        || _        |
| _        || _        |	| _        || _        || _        ||dz  | _        n| j        dz  | _        d| _        d S )Nr   znum_heads (z%) must be divisible by num_kv_heads ()zembed_dim (z") must be divisible by num_heads (   zattn_dropout (z) must be between 0.0 and 1.0z!q and k norm must be set togetherg      F)super__init__
ValueErrorboolr   r   r   r   r   r   r   r   r   r   r   r   r
   r   r	   r   r   scalingcache_enabled)selfr   r   r   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   	__class__s                      v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/gemma2/_attention.pyr    zGemma2Attention.__init__<   s   , 	|#q((1i 1 1!-1 1 1  
 y A%%+i + +'+ + +  
 !|a//ViVVVWWW<<$v,,& 	B@AAA #("( &" !&, $7 & ,0$6DLL=$.DL
 #    
batch_sizedtypec                     | j         t                              d           dS t          ||| j        | j        |          | _         d| _        dS )aQ  Setup key value caches for attention calculation. If called
        after kv_cache is already setup, this will be skipped.

        Args:
            batch_size (int): batch size for the caches.
            dtype (torch.dtype): dtype for the caches.
            max_seq_len (int): maximum sequence length model will be run with.
        NzWKey value caches are already setup. You cannot call ``setup_caches()`` twice. Skipping.)r)   r   r   r   r*   T)r   loggerwarningr   r   r   r$   )r%   r)   r*   r   s       r'   setup_cachezGemma2Attention.setup_cache   sj     =$NNi     $%'!^  DM "&Dr(   c                 d    | j         t          d          | j                                          dS )zzReset the key value caches.

        Raises:
            RuntimeError: if key value caches are not already setup.
        Nz>Key value caches are not setup. Call ``setup_caches()`` first.)r   RuntimeErrorreset)r%   s    r'   reset_cachezGemma2Attention.reset_cache   s=     = P   	r(   )mask	input_posxyr3   r4   c                
   |)t          |t          j                  st          d          |j        \  }}}||j        d         nd}|                     |          }	| j        | j        z  }
|	                    ||| j        |
z  | j	                  }	| j
        | 
                    |	|          }	|	                    dd          }	| j        |                     |	          }	|7| j        | j        st          d          | j        j        }| j        j        }n|                     |          }|                     |          }|                    ||d| j	                  }| j
        | 
                    ||          }|                    ||| j        d| j	                  }|                    ||| j        d| j	                  }| j        | j        k    rF|                    ||| j        |
| j	                  }|                    ||| j        |
| j	                  }|                    ||d| j	                  }|                    ||d| j	                  }|                    dd          }|                    dd          }| j        |                     |          }| j        %| j        r| j                            ||          \  }}|	                    | j                   t          j        |	|                    dd	                    }|Lt          j        t          j        ||ft          j        
                              |j                            }|j         t          j        k    r(t          j!        |"                                dd          }| j#        lt          j$        |          }t          j%        |d| j#        z  dz             t          j        || j#        dz
            z  }t          j!        |dk    |d          }|&                                d	k    r|'                    d          }| j(        (|| j(        z  }t          j)        |          }|| j(        z  }||z   }tU          j+        |,                                d          -                    |	          }t          j        ||          }|                    dd          .                                                    ||d          }| /                    |          S )a  
        Args:
            x (torch.Tensor): input tensor with shape [b x s_x x d] for the query
            y (Optional[torch.Tensor]): second input tensor with shape [b x s_y x d], is the input
                for k and v. For self attention, x=y. Optional only with kv_cache enabled.
            mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication
                and before the softmax. Either:

                A boolean tensor with shape ``[b x s x s]``, ``[b x s x self.encoder_max_cache_seq_len]``,
                or ``[b x s x self.encoder_max_cache_seq_len]`` if using KV-cacheing with encoder/decoder layers.
                A value of True in row ``i`` and column ``j`` means token ``i`` attends to token ``j``. A value of False means
                token ``i`` does not attend to token ``j``. If no mask is specified, a causal mask
                is used by default.

                A :class:`~torch.nn.attention.flex_attention.BlockMask` for document masking in a packed sequence
                created via `create_block_mask <https://pytorch.org/blog/flexattention/#mask-mods>`_. We  use
                :func:`~torch.nn.attention.flex_attention.flex_attention` when computing attention with block masks.
                Default is None.
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token. During training, this is used to indicate the positions
                of each token relative to its sample when packed, shape [b x s].
                During inference, this indicates the position of the current token.
                If none, assume the index of the token is its position id. Default is None.

        Raises:
            NotImplementedError: If ``mask`` is provided, but mask is not an instance of ``torch.Tensor``.
            ValueError: If no ``y`` input and ``kv_cache`` is not enabled.

        Returns:
            torch.Tensor: output tensor with attention applied

        Notation used for tensor shapes:
            - b: batch size
            - s_x: sequence length for x
            - s_y: sequence length for y
            - n_h: num heads
            - n_kv: num kv heads
            - d: embed dim
            - h_d: head dim
        Nz5Block masks are not implemeted yet, use packed=False.r   r   )r4      zAMust provide y input or use kv_cache to enable streaming decoding   )sizer*   g<ff)dim)0
isinstancetorchTensorNotImplementedErrorshaper   r   r   viewr   r	   	transposer
   r   r$   r!   k_cachev_cacher   r   expandreshaper   updatemul_r#   matmultrilonesr"   todevicer*   wherelogical_notr   	ones_liketriur<   	unsqueezer   tanhFsoftmaxfloattype_as
contiguousr   )r%   r5   r6   r3   r4   bs_x_s_yqq_per_kvkvoutputall_onessliding_masks                   r'   forwardzGemma2Attention.forward   s   b Zel%C%C%G   G	3Magajjq KKNN >T%66FF1c4,x7GG *##A#;;A KK1 ;"AA9}$D,>$ W   %A%AA AAAA q#r4=11A".''Y'?? q#t0!T]CCAq#t0!T]CCA ~!222HHQT%6$-PPHHQT%6$-PP 		!S"dm44A		!S"dm44A Aq!!AAq!!A {&KKNN }(T-?(}++Aq111	t|q{{1a  
 

 <:
s*   "QX,,	 D :##;t//11=!DDD#/t,,H :"t77!; 
8T%=%ABBCL ;|q0$FFD88::?? >>!$$D'd..FZ''Fd..F$6<<>>r222::1== fa(( !!!Q''224499!S"EE'''r(   )N)__name__
__module____qualname____doc__intr   Moduler   r   r"   rW   r    r>   r*   r.   r2   r?   r   re   __classcell__)r&   s   @r'   r   r      sF       & &f /3&*&*&*!-1'+/3)G# G# G# G# 	G#
 G# G# 	G# 	G# 	G# YG# !+G# #G# #G# 7#G# G#  !G#" #G#$ &c]%G#& e_'G#(  (})G#* 
+G# G# G# G# G# G#R&&&+k&@C&	& & & &4
 
 
 %)n(
 %),0n( n( n(<n( EL!n(
 y!n( EL)n( 
n( n( n( n( n( n( n( n(r(   r   )loggingtypingr   r>   torch.nn.functionalr   
functionalrU   !torchtune.modules.attention_utilsr   torchtune.modules.kv_cacher   	getLoggerrf   r,   rk   r    r(   r'   <module>ru      s                           7 7 7 7 7 7 . . . . . .		8	$	$F( F( F( F( F(bi F( F( F( F( F(r(   