
    Piyn                         d dl Z d dlmZmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ  G d dej                  Z G d d	ej                  Zd
ej        dedej        fdZ G d dej                  ZdS )    N)CallableDictListOptionalUnion)nn)MultiHeadAttention)	_MaskTypec                   L    e Zd ZdZddddddedej        deej                 deej                 deej                 d	eej                 d
df fdZde	de
j        de	de	d
df
dZd
efdZd
efdZd Zdddde
j        dee         dee
j                 ded
e
j        f
dZ xZS )TransformerSelfAttentionLayeraF  
    Transformer layer derived from the Llama2 model. Normalization is applied before the attention **and** FF layer.

    Args:
        attn (MultiHeadAttention): Attention module.
        mlp (nn.Module): Feed-forward module.
        sa_norm (Optional[nn.Module]): Normalization to be applied before self-attention.
        mlp_norm (Optional[nn.Module]): Normalization to be applied before the feed-forward layer.
        sa_scale (Optional[nn.Module]): Module to scale self-attention output.
        mlp_scale (Optional[nn.Module]): Module to scale the feed-forward output.
    N)sa_normmlp_normsa_scale	mlp_scaleattnmlpr   r   r   r   returnc                4   t                                                       || _        || _        |pt	          j                    | _        |pt	          j                    | _        |pt	          j                    | _        |pt	          j                    | _	        d S N)
super__init__r   r   r   Identityr   r   r   r   )selfr   r   r   r   r   r   	__class__s          q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/modules/transformer.pyr   z&TransformerSelfAttentionLayer.__init__   su     		/"+-- 1BKMM 1BKMM"3bkmm    
batch_sizedtypeencoder_max_seq_lendecoder_max_seq_lenc                @    | j                             |||           dS )aO  Setup key value caches for attention calculation.

        Args:
            batch_size (int): batch size for the caches.
            dtype (torch.dtype): dtype for the caches.
            encoder_max_seq_len (int): this parameter is ignored in this layer.
            decoder_max_seq_len (int): maximum cache sequence length.
        )max_seq_lenNr   setup_cacher   r   r   r   r    s        r   setup_cachesz*TransformerSelfAttentionLayer.setup_caches.   s(      		j%=PQQQQQr   c                     | j         j        duS z
        Check if the key value caches are setup on ``self.attn``.
        See :func:~torchtune.modules.TransformerDecoder.caches_are_setup`.
        Nr   kv_cacher   s    r   caches_are_setupz.TransformerSelfAttentionLayer.caches_are_setup@       
 y!--r   c                     | j         j        S z
        Checks if the key value caches on ``self.attn`` are enabled.
        See :func:~torchtune.modules.TransformerDecoder.caches_are_enabled`.
        r   cache_enabledr+   s    r   caches_are_enabledz0TransformerSelfAttentionLayer.caches_are_enabledG       
 y&&r   c                 8    | j                                          dS zReset the key value caches.Nr   reset_cacher+   s    r   r7   z)TransformerSelfAttentionLayer.reset_cacheN       	r   mask	input_posxr:   r;   kwargsc                   |                      |          }|                     ||||          }|                     |          |z   }|                     |                     |                    }||                     |          z   }|S )a  
        Args:
            x (torch.Tensor): input tensor with shape
                [batch_size x seq_length x embed_dim]
            mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication
                and before the softmax. Either:

                A boolean tensor with shape ``[b x s x s]``, ``[b x s x self.encoder_max_cache_seq_len]``,
                or ``[b x s x self.encoder_max_cache_seq_len]`` if using KV-cacheing with encoder/decoder layers.
                A value of True in row ``i`` and column ``j`` means token ``i`` attends to token ``j``. A value of False means
                token ``i`` does not attend to token ``j``. If no mask is specified, a causal mask
                is used by default.

                A :class:`~torch.nn.attention.flex_attention.BlockMask` for document masking in a packed sequence
                created via `create_block_mask <https://pytorch.org/blog/flexattention/#mask-mods>`_. We  use
                :func:`~torch.nn.attention.flex_attention.flex_attention` when computing attention with block masks.
                Default is None.
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token. During training, this is used to indicate the positions
                of each token relative to its sample when packed, shape [b x s].
                During inference, this indicates the position of the current token.
                If none, assume the index of the token is its position id. Default is None.
            **kwargs (Dict): transformer layer inputs not relevant to self attention.

        Returns:
            torch.Tensor: output tensor with same shape as input
                [batch_size x seq_length x embed_dim]
        r9   )r   r   r   r   r   r   )	r   r<   r:   r;   r=   hattn_outmlp_outouts	            r   forwardz%TransformerSelfAttentionLayer.forwardR   s    N LLOO99Q	9BB MM(##a' ((4==++,, $..)))
r   )__name__
__module____qualname____doc__r	   r   Moduler   r   inttorchr   r&   boolr,   r2   r7   Tensorr
   r   rC   __classcell__r   s   @r   r   r      s       
 
" (,(,(,)-4 4 4 4 Y4
 ")$4 29%4 29%4 BI&4 
4 4 4 4 4 4$RR {R
 !R !R 
R R R R$.$ . . . .'D ' ' ' '      %),02 2 2<2 y!	2
 EL)2 2 
2 2 2 2 2 2 2 2r   r   c                       e Zd ZdZddddddedej        deej                 deej                 deej                 d	eej                 d
df fdZde	de
j        de	de	d
df
dZd
efdZd
efdZd Zdee
j                 d
ee
j                 fdZdddde
j        dee
j                 dee
j                 ded
e
j        f
dZ xZS )TransformerCrossAttentionLayera  
    Cross attention Transformer layer following the same conventions as the TransformerSelfAttentionLayer.
    Normalization is applied before the attention **and** FF layer.

    Args:
        attn (MultiHeadAttention): Attention module.
        mlp (nn.Module): Feed-forward module.
        ca_norm (Optional[nn.Module]): Normalization to be applied before cross-attention.
        mlp_norm (Optional[nn.Module]): Normalization to be applied before the feed-forward layer.
        ca_scale (Optional[nn.Module]): Module to scale cross-attention output.
        mlp_scale (Optional[nn.Module]): Module to scale the feed-forward output.

    Raises:
        AssertionError: if attn.pos_embeddings is set.
    N)ca_normr   ca_scaler   r   r   rQ   r   rR   r   r   c                `   t                                                       |j        t          d          || _        || _        |pt          j                    | _        |pt          j                    | _	        |pt          j                    | _
        |pt          j                    | _        d S )NzsDoesn't support positional embeddings for cross attention,                 because q and k are different sequences.)r   r   pos_embeddingsAssertionErrorr   r   r   r   rQ   r   rR   r   )r   r   r   rQ   r   rR   r   r   s          r   r   z'TransformerCrossAttentionLayer.__init__   s     	* :   	/"+-- 1BKMM 1BKMM"3bkmmr   r   r   r   r    c                >    | j                             |||           dS )aO  Setup key value caches for attention calculation.

        Args:
            batch_size (int): batch size for the caches.
            dtype (torch.dtype): dtype for the caches.
            encoder_max_seq_len (int): maximum cache sequence length.
            decoder_max_seq_len (int): this parameter is ignored in this layer.
        Nr#   r%   s        r   r&   z+TransformerCrossAttentionLayer.setup_caches   s%      		j%1DEEEEEr   c                     | j         j        duS r(   r)   r+   s    r   r,   z/TransformerCrossAttentionLayer.caches_are_setup   r-   r   c                     | j         j        S r/   r0   r+   s    r   r2   z1TransformerCrossAttentionLayer.caches_are_enabled   r3   r   c                 8    | j                                          dS r5   r6   r+   s    r   r7   z*TransformerCrossAttentionLayer.reset_cache   r8   r   r:   c                     |dS |j         t          j        k    r| }nt          j        |          }t          j        |dd          }|S )a0  Some tokens in x may not attend to any encoder inputs
        due to the cross attention mask (encoder_mask). This results in
        a full row of the attention matrix being masked out.

        In the example below, the word "the" is masked from every embedding.
        The False value means a token can't attend to an embedding.

        .. code-block:: text

            |emb||emb||emb|
        |The| F    F    F
        |red| T    F    T
        |car| F    T    T

        This results in no inputs into the softmax layer which causes a NaN.
        The skip mask is used to mask the outputs of attention and
        mlp resulting in the token being skipped.

        The above example would result in a skip mask of: [[True], [False], [False]]
        which specifies which tokens to fully mask out.

        NT)dimkeepdim)r   rJ   rK   isneginfall)r   r:   s     r   
_skip_maskz)TransformerCrossAttentionLayer._skip_mask   sP    0 <4:##5DD>$''Dy2t444r   )encoder_inputencoder_maskr<   ra   rb   r=   c                &   |                                   p| j        j        j        dk    }||r|S |                     |          }||                    |d          }|                     |                     |          ||          }||                    |d          }|                     |          |z   }|                     | 	                    |                    }	||	                    |d          }	|| 
                    |	          z   }
|
S )aZ  
        Args:
            x (torch.Tensor): input tensor with shape
                [batch_size x seq_length x embed_dim]
            encoder_input (Optional[torch.Tensor]): Optional input embeds from the encoder. Shape
                [batch_size x token_sequence x embed_dim]
            encoder_mask (Optional[torch.Tensor]):  Boolean tensor defining a relational matrix between
                tokens and encoder embeddings. A True value at position i,j means token i can attend
                to embedding j in the decoder. Mask has shape [batch_size x token_sequence x embed_sequence].
                Default is None.
            **kwargs (Dict): transformer layer inputs not relevant to self attention.

        Returns:
            torch.Tensor: output tensor with same shape as input
                [batch_size x seq_length x embed_dim]
        r   NT)r:   )r2   r   r*   sizer`   masked_fillrQ   rR   r   r   r   )r   r<   ra   rb   r=   empty_cache	skip_maskr@   r?   rA   rB   s              r   rC   z&TransformerCrossAttentionLayer.forward   s   4 11333Sty7I7NRS7S  [ H OOL11	# (33ItDDL 99T\\!__m,9OO ++Iq99H MM(##a' ((4==++,, )))Q77G $..)))
r   )rD   rE   rF   rG   r	   r   rH   r   r   rI   rJ   r   r&   rK   r,   r2   r7   rL   r`   r   rC   rM   rN   s   @r   rP   rP      s        * (,(,(,)-4 4 4 4 Y4
 ")$4 29%4 29%4 BI&4 
4 4 4 4 4 4.FF {F
 !F !F 
F F F F$.$ . . . .'D ' ' ' '     !x5 !(5<:P ! ! ! !N 15/3: : :<:  -	:
 u|,: : 
: : : : : : : :r   rP   modulenr   c                 ^     t          j         fdt          |          D                       S )z
    Return a list of ``n`` identical layers.

    Args:
        module (nn.Module): module to be cloned
        n (int): number of clones

    Returns:
        nn.ModuleList: list of ``n`` identical layers
    c                 8    g | ]}t          j                  S  )copydeepcopy).0irh   s     r   
<listcomp>z_get_clones.<locals>.<listcomp>?  s#    BBBA$-//BBBr   )r   
ModuleListrange)rh   ri   s   ` r   _get_clonesrt   3  s0     =BBBBqBBBCCCr   c                       e Zd ZdZddddej        deej        eej                 ej	        f         de
de
de
d	ej        d
eej        ef         dee
         deee
                  ddf fdZde
ddfdZdddde
dej        dee
         dee
         fdZdefdZdefdZd Zej        j        dej        deej                 fd            Z	 	 	 	 d&de
deej                 deej                 deej                 d eej                 f
d!Zddddd"d#ej        dee         deej                 deej                 d eej                 deej        eej                 f         fd$Zd% Z xZS )'TransformerDecodera  
    Transformer Decoder derived from the Llama2 architecture.

    Args:
        tok_embeddings (nn.Embedding): PyTorch embedding layer, to be used to move
            tokens to an embedding space.
        layers (Union[nn.Module, List[nn.Module], nn.ModuleList]): A single transformer Decoder layer, an
            nn.ModuleList of layers or a list of layers. It is recommended to use an nn.ModuleList.
        max_seq_len (int): maximum sequence length the model will be run with, as used
            by :func:`~torchtune.modules.KVCache`
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value. This is used to setup the
            :func:`~torchtune.modules.KVCache`
        head_dim (int): embedding dimension for each head in self-attention. This is used
            to setup the :func:`~torchtune.modules.KVCache`
        norm (nn.Module): Callable that applies normalization to the output of the decoder,
            before final MLP.
        output (Union[nn.Linear, Callable]): Callable that applies a linear transformation to the output of
            the decoder.
        num_layers (Optional[int]): Number of Transformer Decoder layers, only define when
            layers is not a list.
        output_hidden_states (Optional[List[int]]): List of layers (indices) to include in the output

    Raises:
        AssertionError:
            If ``num_layers`` is set and layer is a list, **or**
            ``num_layers`` is not set and layer is an ``nn.Module``.

    Note:
        Arg values are checked for correctness (eg: ``attn_dropout`` belongs to [0,1])
        in the module where they are used. This helps reduces the number of raise
        statements in code and improves readability.
    N)
num_layersoutput_hidden_statestok_embeddingslayersr"   	num_headshead_dimnormoutputrw   rx   r   c       	            t                                                       t          |t          j                  rntt          |t
                    rt          j        |          }nJt          |t          j                  st          d          |t          d          t          ||          }|| _	        || _
        || _        || _        |	pg | _        || _        || _        || _        d | _        d| _        d | _        d | _        d S )Nz.num_layers is defined, layers must be a modulez0num_layers is not defined, layers must be a listr   )r   r   
isinstancer   rr   listrH   rU   rt   ry   rz   r}   r~   rx   r"   r{   r|   causal_masknum_output_chunksencoder_max_cache_seq_lendecoder_max_cache_seq_len)r   ry   rz   r"   r{   r|   r}   r~   rw   rx   r   s             r   r   zTransformerDecoder.__init__e  s    	fbm,, 		5%% 	5]6**FFfbi00 W$%UVVV!$%WXXX 44F,	$8$>B!&" !" *.&)-&&&r   r   c                     || _         dS )zUsed to save memory in combination with :class:`~torchtune.modules.loss.CEWithChunkedOutputLoss`.
        This should be called before the first forward pass, in the recipe.N)r   )r   r   s     r   set_num_output_chunksz(TransformerDecoder.set_num_output_chunks  s     "3r   r   r    r   r   r   r    c                l   t          d |                                 D                       }t          d |                                 D                       }|r||| _        n| j        | _        |r||| _        n| j        | _        | j        D ]%}|                    ||| j        | j                   &dS )a  
        Sets up key-value attention caches for inference. For each layer in ``self.layers``:
            - :class:`~torchtune.modules.TransformerSelfAttentionLayer` will use ``decoder_max_seq_len``.
            - :class:`~torchtune.modules.TransformerCrossAttentionLayer` will use ``encoder_max_seq_len``.
            - :class:`~torchtune.modules.model_fusion.FusionLayer` will use ``decoder_max_seq_len`` and ``encoder_max_seq_len``.

        Args:
            batch_size (int): batch size for the caches.
            dtype (torch.dtype): dtype for the caches.
            encoder_max_seq_len (Optional[int]): maximum encoder cache sequence length.
            decoder_max_seq_len (Optional[int]): maximum decoder cache sequence length.
        c              3   @   K   | ]}t          |t                    V  d S r   )r   rP   ro   ms     r   	<genexpr>z2TransformerDecoder.setup_caches.<locals>.<genexpr>  s>       !
 !
>?Jq899!
 !
 !
 !
 !
 !
r   c              3   @   K   | ]}t          |t                    V  d S r   )r   r   r   s     r   r   z2TransformerDecoder.setup_caches.<locals>.<genexpr>  s>       !
 !
=>Jq788!
 !
 !
 !
 !
 !
r   Nr   )anymodulesr   r"   r   rz   r&   )r   r   r   r   r    has_encoder_layershas_decoder_layerslayers           r   r&   zTransformerDecoder.setup_caches  s	   ( ! !
 !
CG<<>>!
 !
 !
 
 
 ! !
 !
BF,,..!
 !
 !
 
 
  	B".1D..151A. 	B".1D..151A.[ 	 	E$($B$($B	     	 	r   c                 @    | j         d                                         S )z
        Check if the key value caches are setup. This means ``setup_caches`` has been called, and
        the relevant attention modules in the model have created their ``KVCache``.
        r   )rz   r,   r+   s    r   r,   z#TransformerDecoder.caches_are_setup  s    
 {1~..000r   c                 @    | j         d                                         S )a  
        Checks if the key value caches are enabled. Once KV-caches have been setup, the relevant
        attention modules will be "enabled" and all forward passes will update the caches. This behaviour
        can be disabled without altering the state of the KV-caches by "disabling" the KV-caches
        using :func:`torchtune.modules.common_utils.disable_kv_cache`, upon which ``caches_are_enabled`` would return False.
        r   )rz   r2   r+   s    r   r2   z%TransformerDecoder.caches_are_enabled  s     {1~00222r   c                     |                                  st          d          | j        D ]}|                                 dS )aX  
        Resets KV-cache buffers on relevant attention modules to zero, and reset cache positions to zero,
        without deleting or reallocating cache tensors.

        Raises:
            RuntimeError: if KV-caches are not setup. Use :func:`~torchtune.modules.TransformerDecoder.setup_caches` to
                setup caches first.
        z>Key value caches are not setup. Call model.setup_caches first.N)r2   RuntimeErrorrz   r7   )r   r   s     r   reset_cacheszTransformerDecoder.reset_caches  s_     &&(( 	P   [ 	  	 E	  	 r   last_hidden_statec                 T      fd|                      j        d          D             S )a  
        Apply output projection in chunks. This should be applied in conjunction with
        :class:`~torchtune.modules.loss.CEWithChunkedOutputLoss` as upcasting to fp32 is done there.

        To use this method, you should first call
        :func:`~torchtune.modules.TransformerDecoder.set_num_output_chunks`.

        Args:
            last_hidden_state (torch.Tensor): last hidden state of the decoder, having shape
                [b, seq_len, embed_dim].

        Returns:
            List[torch.Tensor]: List of num_chunks output tensors, each with shape
                [b, seq_len/num_chunks, out_dim], where out_dim is usually the vocab size.
        c                 :    g | ]}                     |          S rl   )r~   )ro   chunkr   s     r   rq   z5TransformerDecoder.chunked_output.<locals>.<listcomp>  s5     
 
 
 KK
 
 
r      )r\   )r   r   )r   r   s   ` r   chunked_outputz!TransformerDecoder.chunked_output  sD    "
 
 
 
*001GQ0OO
 
 
 	
r   seq_lenr:   ra   rb   r;   c                     || j         k    rt          d| d| j          d          |                                 r5|t          d          ||t          d          |t          d          dS dS )a  
        Validates inputs for ``forward``.
        Args:
            seq_len (int): Input tensor sequence length.
            mask (Optional[torch.Tensor]): Attention mask used for inference and for sequence packing.
            encoder_input (Optional[torch.Tensor]): Encoder input for cross-attention.
            encoder_mask (Optional[torch.Tensor]): Encoder attention mask for cross-embedding attention.
            input_pos (Optional[torch.Tensor]): Input tensor position IDs.

        Raises:
            ValueError:
                If seq_len of x is bigger than max_seq_len, **or**
                if the model has caches which have been setup with self-attention layers and ``mask`` is not provided, **or**
                if the model has caches which have been setup with encoder layers and ``encoder_mask`` is not provided, **or**
                if the model has caches which have been setup ``input_pos`` is not provided.
        z	seq_len (z6) of input tensor should be smaller than max_seq_len ()NzKV-caches for self-attention layers are setup for inference mode, causal masks must be provided! Use the `mask` arg to provide a causal mask.zKV-caches for cross-attention/fusion layers are setup for inference mode and you seem to be using encoder_input, causal masks must be provided! Use the `encoder_mask` arg to provide a causal mask.zIKV-caches are setup for inference mode, input positions must be provided!)r"   
ValueErrorr2   )r   r   r:   ra   rb   r;   s         r   _validate_inputsz#TransformerDecoder._validate_inputs  s    2 T%%%9G 9 9%)%59 9 9  
 ""$$ 	| D  
 (\-A z  
   _  	 	 ! r   r:   ra   rb   r;   tokensc                P   |j         d         }|                     |||||           |                     |          }g }t          | j                  D ]3\  }	}
|	| j        v r|                    |            |
|||||          }4|                     |          }|s|ng ||}|S )a  
        Args:
            tokens (torch.Tensor): input tensor with shape ``[b x s]``
            mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication
                and before the softmax. This parameter is required during inference if caches have been setup.
                Either:

                A boolean tensor with shape ``[b x s x s]``, ``[b x s x self.encoder_max_cache_seq_len]``,
                or ``[b x s x self.encoder_max_cache_seq_len]`` if using KV-cacheing with encoder/decoder layers.
                A value of True in row ``i`` and column ``j`` means token ``i`` attends to token ``j``. A value of False means
                token ``i`` does not attend to token ``j``. If no mask is specified, a causal mask
                is used by default.

                A :class:`~torch.nn.attention.flex_attention.BlockMask` for document masking in a packed sequence
                created via `create_block_mask <https://pytorch.org/blog/flexattention/#mask-mods>`_. We  use
                :func:`~torch.nn.attention.flex_attention.flex_attention` when computing attention with block masks.
                Default is None.
            encoder_input (Optional[torch.Tensor]): Optional input embeds from the encoder. Shape ``[b x s_e x d_e]``
            encoder_mask (Optional[torch.Tensor]):  Boolean tensor defining a relational matrix between
                tokens and encoder embeddings. A True value at position ``i,j`` means token ``i`` can attend
                to embedding ``j`` in the decoder. Mask has shape ``[b x s x s_e]``. Default is None,
                but this is required during inference if the model has been setup with any layers
                which use encoder embeddings and caches have been setup.
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token. During training, this is used to indicate the positions
                of each token relative to its sample when packed, shape ``[b x s]``.
                During inference, this indicates the position of the current token.
                This parameter is required during inference if caches have been setup. Default is None.

        Returns:
            Union[torch.Tensor, List[torch.Tensor]]: output tensor with shape ``[b x s x v]`` or a list of layer
                output tensors defined by ``output_hidden_states`` with the
                final output tensor appended to the list.

        Note:
            At the very first step of inference, when the model is provided with a prompt,
            ``input_pos`` should contain the positions of all of the tokens in the prompt.
            For a single-batch prompt, or a batch of prompts with identical lengths, this
            will be ``torch.arange(prompt_length)``. For a batch of varying-length prompts,
            shorter prompts are left-padded and position ids are correspondingly right-shifted,
            thus positional ids should be of shape ``[b, padded_prompt_length]``.
            This is because we will need to retrieve the positional embeddings for each input id.
            In the subsequent steps, if the model has been setup with KV-caches, ``input_pos`` will contain
            the position(s) of the current token(s) ``torch.tensor([padded_prompt_length])``. Otherwise,
            ``input_pos`` will contain all the position ids up to the current token.

        Shape notation:
            - b: batch size
            - s: token sequence length
            - s_e: encoder sequence length
            - v: vocab size
            - d: token embed dim
            - d_e: encoder embed dim
            - m_s: max seq len
        r   r   )shaper   ry   	enumeraterz   rx   appendunembed)r   r   r:   ra   rb   r;   r   r?   hiddenrp   r   r~   s               r   rC   zTransformerDecoder.forward(  s    B ,q/'% 	 	
 	
 	
 ''!$+.. 
	 
	HAuD---a   +)#  AA a  &<+<V+<V+<r   c                     |                      |          }| j        dk    r|                     |          }n'|                     |                                          }|S )Nr   )r}   r   r   r~   float)r   r?   r~   s      r   r   zTransformerDecoder.unembed  sV    IIaLL!A%%((++FF [[^^))++Fr   )NNNN) rD   rE   rF   rG   r   	Embeddingr   rH   r   rr   rI   Linearr   r   r   r   rJ   r   r&   rK   r,   r2   r   compilerdisablerL   r   r   r
   rC   r   rM   rN   s   @r   rv   rv   B  s          X %)48&. &. &. &. bibi"-?@	&.
 &. &. &. i&. bi)*&. SM&. 'tCy1&. 
&. &. &. &. &. &.P3s 3t 3 3 3 3 .2-1, , ,, {,
 &c], &c], , , ,\1$ 1 1 1 13D 3 3 3 3     " ^
 
elAS 
 
 
 
0 (,04/3,0/ // u|$/  -	/
 u|,/ EL)/ / / /j %)04/3,0a a aa y!	a
  -a u|,a EL)a 
u|T%,//	0a a a aF
 
 
 
 
 
 
r   rv   )rm   typingr   r   r   r   r   rJ   r   torchtune.modulesr	   !torchtune.modules.attention_utilsr
   rH   r   rP   rI   rr   rt   rv   rl   r   r   <module>r      sW    8 8 8 8 8 8 8 8 8 8 8 8 8 8        0 0 0 0 0 0 7 7 7 7 7 7u u u u uBI u u upi i i i iRY i i iXD	 Dc Dbm D D D DS S S S S S S S S Sr   