
    Pi9              0       z   d dl Z d dl mZ d dlmZ d dlmZmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZmZ d dlmZmZ 	  G d dej                  Z G d dej                  Z	 	 	 	 	 	 	 d5dedededededededededed ed!ed"ed#ed$ee         d%ef d&Z	 d6ddddddddd'd'd(
d)ee         d*e dedededededededededed ed!ed"ed#ed$ee         d+ed,ed-ed.e d/e d%ef.d0Z!dddddd'd'd1d2ee         dedededededed ed#ee         d3ee         d$ee         d+ed,ed-ed.e d/e d%ef"d4Z"dS )7    N)nn)(_register_reparametrize_state_dict_hooks)ListOptional)FrozenNF4LinearRotaryPositionalEmbeddingsTransformerSelfAttentionLayer)Gemma2Attention)GemmaRMSNorm)TransformerDecoder
TiedLinear)GemmaNormEmbeddings)
DoRALinearLORA_ATTN_MODULES
LoRALinear)	gemma_mlplora_gemma_mlpc                   .     e Zd Zdeddf fdZd Z xZS )TanhSoftCappingcapping_valuereturnNc                 V    t                                                       || _        d S N)super__init__r   )selfr   	__class__s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/gemma2/_component_builders.pyr   zTanhSoftCapping.__init__&   s)     	*    c                 V    || j         z  }t          j        |          }|| j         z  }|S r   )r   torchtanh)r   attn_weightss     r   forwardzTanhSoftCapping.forward-   s1    #d&88z,//#d&88r   )__name__
__module____qualname__floatr   r$   __classcell__r   s   @r   r   r   %   s^        ++ 
+ + + + + +      r   r   c                   :     e Zd ZdZdedededdf fdZd Z xZS )	Gemma2FinalNormz*
    Combines RMSNorm and SoftCapping
    r   	embed_dimepsr   Nc                     t                                                       || _        t          ||          | _        t          |          | _        d S )Nr.   )r   r   r   r   rms_normr   logit_capping)r   r   r-   r.   r   s       r   r   zGemma2FinalNorm.__init__7   sM     	*$YC888,];;r   c                 Z    |                      |          }|                     |          }|S r   )r1   r2   )r   xs     r   r$   zGemma2FinalNorm.forwardB   s+    MM!q!!r   )	r%   r&   r'   __doc__r(   intr   r$   r)   r*   s   @r   r,   r,   3   s|         	<	< 	< 		<
 
	< 	< 	< 	< 	< 	<      r   r,           ư>'        I@      >@   
vocab_size
num_layers	num_headshead_dimnum_kv_headsr-   intermediate_dimmax_seq_lenattn_dropoutnorm_eps	rope_basehidden_capping_valuefinal_capping_valuesliding_window_sizequery_pre_attn_scalarr   c                    t          |||
          }t          j                                        }t	          |          D ]}t          ||          }t          ||||t          j        |||z  d          t          j        |||z  d          t          j        |||z  d          t          j        ||z  |d          |d|||dz  dk    r|nd||          }t          ||t          ||		          t          ||		          t          ||		          t          ||		          
          }|
                    |           t          | |          }t          |          }t          ||||||t          |||		                    }|S )a  
    Build the decoder associated with the gemma2 model. This includes:
    - Token embeddings
    - num_layers number of TransformerSelfAttentionLayer blocks
    - RMS Norm layer applied to the output of the transformer
    - Final projection into token space


    Args:
        vocab_size (int): number of tokens in vocabulary.
        num_layers (int): number of layers in the transformer decoder.
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        head_dim (int): dimension of head
        num_kv_heads (int): number of key and value heads.
        embed_dim (int): embedding dimension for self-attention
        intermediate_dim (int): intermediate dimension for MLP
        max_seq_len (int): maximum sequence length the model will be run with,
        attn_dropout (float): dropout value passed onto scaled_dot_product_attention.
            Default: 0.0
        norm_eps (float): epsilon in RMS norms Default: 1e-6
        rope_base (int): base for the rotary positional embeddings. Default: 10_000

    Returns:
        TransformerDecoder: Instantiation of gemma model.
    dimrC   base)rM   
hidden_dimFbiasN   r   r-   r?   rA   r@   q_projk_projv_projoutput_projpos_embeddingskv_cacherC   rD   rI   softcappingrJ   r0   attnmlpsa_normmlp_normsa_scale	mlp_scaletok_embeddingslayersrC   r?   outputr@   norm)r   r!   r   
ModuleListranger   r
   Linearr	   r   appendr   r   r   r,   )r=   r>   r?   r@   rA   r-   rB   rC   rD   rE   rF   rG   rH   rI   rJ   roperd   	layer_idxr]   self_attlayerrc   rW   models                           r   gemma2rp   H   s   V &(R[\\\DX  ""F:&&  	I2BCCC"%9Y	H(<5III9Yx(?eLLL9Yx(?eLLL	)h"6	NNN#%8AA7I7I 3 3t,"7!
 
 
& . 999!):::!):::"9(;;;
 
 
 	e(Y??N^,,K%0)JJJ  E Lr   F)
rD   rE   rF   rG   rH   rI   rJ   lora_dropoutuse_doraquantize_baselora_attn_modulesapply_lora_to_mlp	lora_rank
lora_alpharq   rr   rs   c                   t          ||          }t          |          }t          j                    }t	          |          D ]}|rt          |||||||          }nt          |||          }t          di d| d|d|d|d|d|d	|	d
|
d|dz  dk    r|ndd|d|d|d|d|d|d|}t          ||t          ||          t          ||          t          ||          t          ||                    }|
                    |           t          |||	|||t          |||                    }|rt          ||j        j                   |S )ad  
    Return a version of Gemma with LoRA applied based on the passed in configuration.
    Note: output projection lora is not supported because it is tied to token embeddings

    Args:
        lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers
            LoRA should be applied to in each self-attention block. Options are
            ``{"q_proj", "k_proj", "v_proj", "output_proj"}``.
        apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer.
            Default: False
        vocab_size (int): number of tokens in vocabulary.
        num_layers (int): number of layers in the transformer decoder.
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        head_dim (int): dimension of head
        num_kv_heads (int): number of key and value heads.
        embed_dim (int): embedding dimension for self-attention
        intermediate_dim (int): intermediate dimension for MLP
        max_seq_len (int): maximum sequence length the model will be run with,
        attn_dropout (float): dropout value passed onto scaled_dot_product_attention.
            Default: 0.0
        norm_eps (float): epsilon in RMS norms Default: 1e-6
        rope_base (int): base for the rotary positional embeddings. Default: 10_000
        lora_rank (int): rank of each low-rank approximation
        lora_alpha (float): scaling factor for the low-rank approximation
        lora_dropout (float): LoRA dropout probability. Default: 0.0
        use_dora (bool): Decompose the LoRA weight into magnitude and direction, as
            introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353).
        quantize_base: (bool): Whether to quantize base model weights or not. Only applied to base
            weights within linear layers LoRA is applied to. The final output linear projection is not
            supported for quantization currently.

    Returns:
        TransformerDecoder: Instantiation of Gemma model with LoRA applied to
        a subset of the attention projections in each layer.
    )rM   rO   rv   rw   rq   rr   rs   )rM   rO   rs   lora_modulesr-   r?   rA   r@   rF   rC   rD   rI   rR   r   NrZ   rJ   rv   rw   rq   rr   rs   r0   r[   rb   )dtype )r   r   r   rg   rh   r   r   lora_gemma2_self_attentionr	   r   rj   r   r,   r   weightrz   )rt   ru   r=   r>   r?   r@   rA   r-   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rv   rw   rq   rr   rs   rc   rW   rd   rl   r]   rm   rn   ro   s                                 r   lora_gemma2r~      sM   @ )Y??N^,,K]__F:&& ) )	 	e +#%)!+  CC 	6FVcdddC- 
 
 
**
i
  i
 &	

 X
  i
 $
 &
 9BA7I7I 3 3t
 -,
 #8"7
  i
 "z
 &
   x!
" *M#
( . 999!):::!):::"9(;;;
 
 
 	e%0)JJJ  E  [
 	1n>S>YZZZZLr   )rD   rF   rI   rZ   rq   rr   rs   ry   rZ   c                   | st          dt           d          |r|n|}|rt          nt          }d| v r ||||z  ||||          n0|st	          j        |||z  d          nt          |||z  d          }d| v r ||||z  ||||          n0|st	          j        |||z  d          nt          |||z  d          }d| v r ||||z  ||||          n0|st	          j        |||z  d          nt          |||z  d          }d	| v r |||z  |||||          n0|st	          j        ||z  |d          nt          ||z  |d          }t          |||
          }t          |||||||||d ||||	|
          }|S )NzMust pass one or more of z as lora_modulesrT   )rankalphadropoutrs   FrP   rU   rV   rW   rL   rS   )	
ValueErrorr   r   r   r   ri   r   r   r
   )ry   r-   r?   r@   rA   rC   rD   rF   rI   rZ   rJ   rv   rw   rq   rr   rs   adapter_clsrT   rU   rV   rW   rk   rm   s                          r   r|   r|   )  s   ,  
K(9KKK
 
 	
 $0><<YL (8**jK |## 	  '	
 	
 	
 	
 !NBIiX!5EBBBB I,@uMMM 2 |## 	8# '	
 	
 	
 	
 !QBIi!8uEEEE L8,C%PPP 2 |## 	8# '	
 	
 	
 	
 !QBIi!8uEEEE L8,C%PPP 2 L(( 	  '	
 	
 	
 	
 !NBIi(*IEBBBB X!5yuMMM " &(R[\\\D%##% 3#"7
 
 
H" Or   )r7   r8   r9   r:   r;   r<   N)F)#r!   r   torchtune.modules.common_utilsr   typingr   r   torchtune.modulesr   r   r	   "torchtune.models.gemma2._attentionr
   torchtune.models.gemma.rms_normr   r   r   +torchtune.models.gemma.gemma_norm_embeddingr   torchtune.modules.peftr   r   r   *torchtune.models.gemma._component_buildersr   r   Moduler   r,   r6   r(   rp   boolr~   r|   r{   r   r   <module>r      s          S S S S S S ! ! ! ! ! ! ! !          ? > > > > > 8 8 8 8 8 8 < < < < < < < < K K K K K K L L L L L L L L L L P P P P P P P P
    bi       bi   < "%!$#,0Z ZZZ Z 	Z
 Z Z Z Z Z Z Z  Z Z Z %SMZ  !Z Z Z Z@ $@ "%!$#,0 3@ @ @-.@@
 @ @ @ @ @ @ @ @ @ @ @   !@" #@$ %@& %SM'@* +@, -@. /@0 1@2 3@4 5@ @ @ @X )-#&
 'r r r()r 	r
 r r r r r r "#r %r $C=r r  !r" #r$ %r& 'r* +r r r r r rr   