
    PiKB              (          d dl mZmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlmZ d dlmZmZmZ 	 	 	 	 	 d*dededededededededee         dedefdZd+dedededefdZ	 	 d,d	dd
ddddddee         dededededededededee         dededed ed!ed"ed#ededef&d$Zdddddd%d&ee         dedededededed ed!ed"eded#ede
fd'Zdddd(deded ed!ed"eded#edefd)Zd	S )-    )ListOptional)nn)scale_hidden_dim_for_mlp)FeedForwardFrozenNF4LinearMultiHeadAttentionRMSNormRotaryPositionalEmbeddingsTransformerDecoderTransformerSelfAttentionLayer)(_register_reparametrize_state_dict_hooks)
DoRALinearLORA_ATTN_MODULES
LoRALinear          Nh㈵>
vocab_size
num_layers	num_headsnum_kv_heads	embed_dimmax_seq_lenattn_dropout	rope_baseintermediate_dimnorm_epsreturnc
                    ||z  }
|r|n|}|r|nt          |          }t          |
||          }t          j                    }t	          |          D ]}t          ||||
t          j        |||
z  d          t          j        |||
z  d          t          j        |||
z  d          t          j        ||d          |||          }t          ||          }t          ||t          ||	          t          ||	                    }|
                    |           t          j        | |          }t          j        || d          }t          |||||
t          ||	          |	          S )
a  
    Build the decoder associated with the Llama3 model. This includes:
    - Token embeddings
    - num_layers number of TransformerSelfAttentionLayer blocks
    - RMS Norm layer applied to the output of the transformer
    - Final projection into token space

    Args:
        vocab_size (int): number of tokens in vocabulary.
        num_layers (int): number of layers in the transformer decoder.
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        num_kv_heads (int): number of key and value heads. User should ensure
            `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`,
            for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1.
        embed_dim (int): embedding dimension for self-attention
        max_seq_len (int): maximum sequence length the model will be run with, as used
            by :func:`~torchtune.modules.KVCache`
        attn_dropout (float): dropout value passed onto scaled_dot_product_attention.
            Default: 0.0
        rope_base (int): base for the rotary positional embeddings. Default: 500_000
        intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified,
            this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp`
        norm_eps (float): epsilon in RMS norms.

    Returns:
        TransformerDecoder: Instantiation of Llama3 model.
    dimr   baseFbiasr   r   r   head_dimq_projk_projv_projoutput_projpos_embeddingsr   r   )r"   
hidden_dimr"   epsattnmlpsa_normmlp_normr/   tok_embeddingslayersr   r   r'   normoutput)r   r   r   
ModuleListranger	   Linear
llama3_mlpr   r
   append	Embeddingr   )r   r   r   r   r   r   r   r   r   r   r'   r-   roper8   _	self_attnr2   layerr7   r+   s                       /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/llama3/_component_builders.pyllama3rF   +   s   P I%H#/><<YL,U2J92U2U  &+I  D ]__F:  &%9Y	H(<5III9Yx(?eLLL9Yx(?eLLL	)YUCCC#%
 
 
	 Y:>>>-	x888999	
 
 
 	e\*i88N)Iz>>>K%YH---       Fr"   r-   quantize_basec                 (   |st          j        | |d          nt          | |d          }|st          j        || d          nt          || d          }|st          j        | |d          nt          | |d          }t          |||          S )z>
    Build the MLP layer associated with the Llama model.
    Fr$   	gate_proj	down_projup_proj)r   r=   r   r   )r"   r-   rH   rK   rL   rM   s         rE   r>   r>      s     	:	#z....S*5999  	:	*c....Z5999  	:	#z....S*5999 
 iQQQQrG   )r   r   r   r   lora_dropoutuse_dorarH   lora_attn_modulesapply_lora_to_mlpapply_lora_to_output	lora_rank
lora_alpharN   rO   c                   |	r|	nt          |          }t          j                    }t          |          D ]}t	          | |||||
||||||          }|rt          |||||||          }nt          |||          }t          ||t          ||          t          ||                    }|	                    |           t          j
        ||          }|rt          nt          }|r ||||||          nt          j        ||d          }t          ||||||z  t          ||	          |
          }|rt          |           |S )a	  
    Return a version of Llama3 (an instance of :func:`~torchtune.modules.TransformerDecoder`)
    with LoRA applied based on the passed in configuration.

    Args:
        lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers
            LoRA should be applied to in each self-attention block. Options are
            ``{"q_proj", "k_proj", "v_proj", "output_proj"}``.
        apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer.
            Default: False
        apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection.
            Default: False
        vocab_size (int): number of tokens in vocabulary.
        num_layers (int): number of layers in the transformer decoder.
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        num_kv_heads (int): number of key and value heads. User should ensure
            `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`,
            for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1.
        embed_dim (int): embedding dimension for self-attention
        max_seq_len (int): maximum sequence length the model will be run with, as used
            by :func:`~torchtune.modules.KVCache`
        attn_dropout (float): dropout value passed onto scaled_dot_product_attention.
            Default: 0.0
        intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified,
            this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp`
        norm_eps (float): epsilon in RMS norms.
        lora_rank (int): rank of each low-rank approximation
        lora_alpha (float): scaling factor for the low-rank approximation
        lora_dropout (float): LoRA dropout probability. Default: 0.0
        use_dora (bool): Decompose the LoRA weight into magnitude and direction, as
            introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353).
        quantize_base: (bool): Whether to quantize base model weights or not. Only applied to base
            weights within linear layers LoRA is applied to. The final output linear projection is not
            supported for quantization currently.

    Returns:
        TransformerDecoder: Instantiation of Llama3 model with LoRA applied to
        a subset of the attention projections in each layer.

    )lora_modulesr   r   r   r   r   r   rS   rT   rN   rH   rO   )r"   r-   rS   rT   rH   rN   rO   )r"   r-   rH   r.   r0   )rankalphadropoutFr$   r5   r6   )r   r   r;   r<   lora_llama3_self_attentionlora_llama3_mlpr>   r   r
   r?   r@   r   r   r=   r   r   )rP   rQ   rR   r   r   r   r   r   r   r   r   r   r   rS   rT   rN   rO   rH   r-   r8   rB   rC   r2   rD   r7   adapter_clsr+   models                               rE   lora_llama3r^      s   D -U2J92U2U  ]__F: % %.*%#%!%'
 
 
	  	!%#%+)!  CC *M  C .	x888999	
 
 
 	e\*i88N !)8**jK  	: 	
 	
 	
 	
 Yy*5999  %y(YH---  E  8 	1777LrG   )r   r   rN   rH   rO   rV   c                   | st          dt           d          ||z  }|r|n|}|rt          nt          }d| v r ||||z  |||	|
          n0|
st	          j        |||z  d          nt          |||z  d          }d| v r ||||z  |||	|
          n0|
st	          j        |||z  d          nt          |||z  d          }d| v r ||||z  |||	|
          n0|
st	          j        |||z  d          nt          |||z  d          }d	| v r ||||||	|
          n*|
st	          j        ||d          nt          ||d          }t          |||
          }t          |||||||||||          }|S )a  
    Return an instance of :func:`~torchtune.modules.MultiHeadAttention` with LoRA
    applied to a subset of its linear layers

    Args:
        lora_modules (List[LORA_ATTN_MODULES]): list of which linear layers
            LoRA should be applied to. Options are ``{"q_proj", "k_proj", "v_proj",
            "output_proj"}``.
        embed_dim (int): embedding dimension for self-attention
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        num_kv_heads (int): number of key and value heads. User should ensure
            `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`,
            for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1.
        max_seq_len (int): maximum sequence length the model will be run with, as used
            by :func:`~torchtune.modules.KVCache`
        attn_dropout (float): dropout value passed onto scaled_dot_product_attention.
            Default: 0.0
        lora_rank (int): rank of each low-rank approximation
        lora_alpha (float): scaling factor for the low-rank approximation
        lora_dropout (float): LoRA dropout probability. Default: 0.0
        quantize_base (bool): Whether to quantize base model parameters for linear layers
            LoRA is being applied to. Default is ``False``.
        use_dora (bool): Decompose the LoRA weight into magnitude and direction, as
            introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353).

    Returns:
        MultiHeadAttention: instantiation of self-attention module with LoRA
        applied to a subset of Q, K, V, output projections.

    Raises:
        ValueError: If lora_modules arg is an empty list
    zMust pass one or more of z as lora_modulesr(   )rW   rX   rY   rH   Fr$   r)   r*   r+   r!   r&   )	
ValueErrorr   r   r   r   r=   r   r   r	   )rV   r   r   r   r   r   r   rS   rT   rN   rH   rO   r'   r\   r(   r)   r*   r+   rA   rC   s                       rE   rZ   rZ   (  s   d  
K(9KKK
 
 	
 I%H#/><<YL (8**jK |## 	  '	
 	
 	
 	
 !NBIiX!5EBBBB I,@uMMM 2 |## 	8# '	
 	
 	
 	
 !QBIi!8uEEEE L8,C%PPP 2 |## 	8# '	
 	
 	
 	
 !QBIi!8uEEEE L8,C%PPP 2 L(( 	 '	
 	
 	
 	
 !CBIi7777 IEBBB   &+I  D #!!  I rG   )rN   rH   rO   c                     |rt           nt          } || |||||          } ||| ||||          }	 || |||||          }
t          ||	|
          S )N)in_dimout_dimrW   rX   rY   rH   rJ   )r   r   r   )r"   r-   rS   rT   rN   rH   rO   r\   rK   rL   rM   s              rE   r[   r[     s     !)8**jK#  I #  I k#  G    rG   )r   r   Nr   )F)FF)typingr   r   torchr   $torchtune.models.llama3._model_utilsr   torchtune.modulesr   r   r	   r
   r   r   r   torchtune.modules.common_utilsr   torchtune.modules.peftr   r   r   intfloatrF   boolr>   r^   rZ   r[    rG   rE   <module>rn      s(   " ! ! ! ! ! ! !       I I I I I I                  T S S S S S L L L L L L L L L L
. &*S SSS S 	S
 S S S S smS S S S S SlR RC RS R R+ R R R R6 $!&K '+ -K K K-.KK K K K K K K K smK K K K" #K$ %K& 'K( )K, -K. /K K K Kl  J J J()J 	J
 J J J J J J J J J J  !J J J Jf ' ' '	' ' 	'
 ' ' ' ' ' ' ' ' ' 'rG   