
    PiW              *          d dl mZ d dlmZmZmZ d dlmZ d dlm	Z	m
Z
mZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZmZmZ d d	lmZmZ ej        d
ddddddddf
de de de de de dede de!de!deee                   de!de de de!de!def d Z"	 	 	 d?de de de d$e d%e d&e#fd'Z$	 d@d(e d)e d*e dej%        d+e!defd,Z&	 d@ej        d
dddddd-d.d/ddd0d1ee         d2e!de de de de de dede de!deee                   de!de de d3e d4e#d5e#d6e!d+e!def(d7Z'd/dd/ddd8d1ee         de d9e de d:e d;e#de!d3e d4e#d5e#d6e!d+e!defd<Z(d/ddd=d(e d)e d*e dej%        d3e d4e#d5e#d6e!d+e!defd>Z)dS )A    )partial)CallableListOptional)nn)TiledTokenPositionalEmbeddingTilePositionalEmbeddingTokenPositionalEmbedding)CLIPTextEncoder	QuickGELU)FeedForwardFp32LayerNormFrozenNF4LinearMultiHeadAttentionTransformerSelfAttentionLayer VisionRotaryPositionalEmbeddings)+reparametrize_as_dtype_state_dict_post_hook)
DoRALinearLORA_ATTN_MODULES
LoRALinear)CLSProjectionVisionTransformeri   TFN      	tile_size
patch_size	embed_dim
num_layers	num_heads
activationcls_output_dim	attn_biasuse_ropeout_indicesoutput_cls_projectionmax_num_tilesin_channelsappend_cls_tokenuse_tile_pos_embedreturnc                    ||z  dk    rt          d| d|           ||z  }|
rt          ||          nd}|rt          || ||dz  d|          nd}t          ||||t	          j        |||	          t	          j        |||	          t	          j        |||	          t	          j        |||	          |d
d          }t          |d|z  | |                      }t          ||t          |d          t          |d          dd          }|r<|dk    r6t          ||          }t          ||          }t          ||||           }nd}d}t          |||           }t          |||||||	| ||||          S )a1
  
    Builds the vision encoder associated with the clip model. This includes:

    - TransformerEncoderLayer
    - positional embeddings
    - CLS projection (optional)

    For details, please check the documentation of
    :class:`torchtune.modules.vision_transformer.VisionTransformer`.

    Args:
        tile_size (int): The size of your image tiles, if the image was tile-cropped in advance. Otherwise,
            the size of the input image. In this case, the function will consider your image as a single tile.
        patch_size (int): The size of each patch. Used to divide the tiles into patches.
            E.g. for ``patch_size=40``, a tile of shape (400, 400) will have 10x10 grid of patches
            with shape (40, 40) each.
        embed_dim (int): The dimensionality of each patch embedding (token).
        num_layers (int): The number of transformer layers.
        num_heads (int): The number of attention heads in each transformer layer.
        activation (Callable): The activation function to use in the MLP layer.
        cls_output_dim (int): The dimensionality of the output tensor from the CLS projection module.
        attn_bias (bool): Boolean for if to use bias in the attention module. Default True.
        use_rope (bool): If True, include 2D rope in attention in each transformer layer. Default: False
        out_indices (Optional[List[int]]): The indices of hidden layers to return.
            If provided, it will return the intermediate results of the transformer layers
            before they go through a next layer. For example, ``out_indices=[0,3]`` will
            return the tokens before they go through the first and fourth layers.
        output_cls_projection (bool): If True, only the CLS token projection will be outputted,
            instead of all tokens. Defaults to False.
        max_num_tiles (int): The maximum number of tiles that can be processed. This is used to
            determine the size of the positional embeddings.
        in_channels (int): The number of image input channels.
        append_cls_token (bool): If True, adds CLS token embedding to the end of the sequence in the vision transformer.
            Default is False, which adds CLS token to the beginning of the sequence.
        use_tile_pos_embed (bool): If True, use pre-tile, post-tile, and tiled token positional embeddings, if max_num_tiles > 1.
            If False, only use standard token positional embeddings.

    Returns:
        A `VisionTransformer` object.

    Raises:
        AssertionError: If ``embed_dim`` is not divisible by ``num_heads``.
    r   z.embed_dim must be divisible by num_heads, got z and r   r!   N   i'  )r   r   r&   dimbaser(   bias        F)r   r   num_kv_headshead_dimq_projk_projv_projoutput_projpos_embeddingsattn_dropout	is_causalr   )in_dim
hidden_dimout_dimr    h㈵>epsattnmlpsa_normmlp_normsa_scale	mlp_scale   r&   r   r&   r   r   r   r   r   r   )r   layertoken_pos_embeddingpre_tile_pos_embedpost_tile_pos_embedcls_projectionr$   r   r   r   r'   r(   )
ValueErrorr   r   r   r   Linearclip_mlpr   r   r	   r   r
   r   )r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r4   rQ   rope	self_attnrD   transformer_layerrO   rP   rN   s                           }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/clip/_component_builders.pyclip_vision_encoderrY      sl   x 9!!XYXXYXX
 
 	
 I%H !		.IIII  		(!'A-	
 	
 	
 	
  	 #yII>>>yII>>>yII>>>IiCCC  I y=:<<	  C 6iT222yd333    
ma//4'9
 
 
 6'9
 
 
 <'!	
 
 
 ""6J)
 
 
 /-/%)          M   r?   
vocab_sizemax_seq_lennorm_epsc                    t          | ||| |z  t          j        | |           t          j        | |           t          j        | |           t          j        | |                     }t          | | | dz  t	                                }t          ||t          j        | |          t          j        | |                    }t          j        | |          }	t          ||	||| |          S )ap  
    Text encoder for CLIP.

    CLIP is a model that encodes text and images into a shared vector space.
    Blog post: https://openai.com/index/clip/
    Paper: https://arxiv.org/abs/2103.00020

    Args:
        embed_dim (int): embedding/model dimension size
        num_heads (int): number of attention heads
        num_layers (int): number of transformer layers
        vocab_size (int): size of the vocabulary, default 49408
        max_seq_len (int): context size, default 77
        norm_eps (float): small value added to denominator for numerical stability, default 1e-5

    Returns:
        CLIPTextEncoder
    )r   r   r3   r4   r5   r6   r7   r8   r   )r<   r>   r=   r    r@   )rC   rD   rE   rF   )rM   
final_normr]   r^   r   r   )r   r   rS   rT   r   r   	LayerNormr   )
r   r   r   r]   r^   r_   rC   rD   encoder_layerra   s
             rX   clip_text_encoderrd      s   4 i'yI..yI..yI..Ii33	 	 	D q=;;	  C 2YH555iX666	  M iX666J   rZ   r<   r>   r=   quantize_basec                     |st          j        | |          nt          | |fddi|}|st          j        ||          nt          ||fddi|}t          ||d|          S )z=
    Build the MLP layer associated with the clip model.
    r1   TN	gate_proj	down_projup_projr    )r   rS   r   r   )r<   r>   r=   r    re   quantization_kwargsrh   ri   s           rX   rT   rT      s     	S	&*%%%VZRRdR>QRR  	T	*g&&&ZSStS?RSS 
 y$:   rZ         r2   )r    r!   r"   r$   r%   r&   r'   	lora_rank
lora_alphalora_dropoutuse_dorare   lora_modulesapply_lora_to_mlprn   ro   rp   rq   c                   ||z  dk    s
J d            |rt          ||          nd}t          d| |||||z  d||||||	d|}|r"t          d|d|z  | |            |||||d	|}nt          d|d|z  | |            |d	|}t	          ||t          |d
          t          |d
          dd          }|dk    rd}d}t          |||          }n5t          ||          }t          ||          }t          ||||          }t          |||||||
||||          }|r)|
                    t          t          d                     |S )a
  
    Build a LoRA implementation of the CLIP vision encoder.

    Args:
        lora_modules (List[LORA_ATTN_MODULES]): list of which linear layers
            LoRA should be applied to in each self-attention block. Options are
            ``{"q_proj", "k_proj", "v_proj", "output_proj"}``.
        apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer.
            Default: False
        tile_size (int): The size of your image tiles, if the image was tile-cropped in advance. Otherwise,
            the size of the input image. In this case, the function will consider your image as a single tile.
        patch_size (int): The size of each patch. Used to divide the tiles into patches.
            E.g. for ``patch_size=40``, a tile of shape (400, 400) will have 10x10 grid of patches
            with shape (40, 40) each.
        embed_dim (int): The dimensionality of each patch embedding (token).
        num_layers (int): The number of transformer layers.
        num_heads (int): The number of attention heads in each transformer layer.
        activation (Callable): The activation function to use in the MLP layer.
        cls_output_dim (int): The dimensionality of the output tensor from the CLS projection module.
        attn_bias (bool): Boolean for if to use bias in the attention module. Default False.
        out_indices (Optional[List[int]]): The indices of hidden layers to return.
            If provided, it will return the intermediate results of the transformer layers
            before they go through a next layer. For example, ``out_indices=[0,3]`` will
            return the tokens before they go through the first and fourth layers.
        output_cls_projection (bool): If True, only the CLS token projection will be outputted,
            instead of all tokens. Defaults to False.
        max_num_tiles (int): The maximum number of tiles that can be processed. This is used to
            determine the size of the positional embeddings.
        in_channels (int): The number of image input channels.
        lora_rank (int): rank of each low-rank approximation
        lora_alpha (float): scaling factor for the low-rank approximation
        lora_dropout (float): LoRA dropout probability. Default: 0.0
        use_dora (bool): Whether to use DoRA layers instead of LoRA layers. Default is ``False``.
        quantize_base: (bool): Whether to quantize base model weights or not. Only applied to base
            weights within linear layers LoRA is applied to. The final output linear projection is not
            supported for quantization currently.


    Returns:
        VisionTransformer: Instantiation of VisionTransformer model.
    r   z(embed_dim must be divisible by num_headsr,   Nr2   )rr   r   r   r3   r4   r:   rn   ro   rp   rq   re   r"   r   )	r<   r=   r>   r    rn   ro   re   rp   rq   )r<   r=   r>   r    re   r?   r@   rB   rI   rL   rJ   rK   )r   rM   rN   rO   rP   rQ   r$   r   r   r   r'   T)offload_to_cpu )r   lora_clip_attentionlora_clip_mlprT   r   r   r
   r	   r   r   _register_state_dict_hookr   r   )rr   rs   r   r   r   r   r   r    r!   r"   r$   r%   r&   r'   rn   ro   rp   rq   re   rk   rQ   rV   rD   rW   rO   rP   rN   models                               rX   lora_clip_vision_encoderr{     sr   D y A%%%'Q%%%
 !		.IIII  $ !i'!#   I  
 
9}!z||!'%
 
 "
 
  
9}!z||'
 
 "
 
 6iT222yd333   !"6J)
 
 
 5'9
 
 
 6'9
 
 
 <'!	
 
 
 /-/%  E  
 	''?PTUUU	
 	
 	
 LrZ   )r:   r"   rp   rq   re   r4   r3   r:   c                   | st          dt           d          |
rt          nt          }d| v r ||||z  f|||	|d|n.|st	          j        |||z  |          nt          |||z  fd|i|}d| v r ||||z  f|||	|d|n.|st	          j        |||z  |          nt          |||z  fd|i|}d| v r ||||z  f|||	|d|n.|st	          j        |||z  |          nt          |||z  fd|i|}d	| v r |||f|||	|d|n(|st	          j        |||          nt          ||fd|i|}t          ||||||||d
|
  
        }|S )a  
    Return an instance of :func:`~torchtune.modules.MultiHeadAttention` with LoRA
    applied to a subset of its linear layers

    Args:
        lora_modules (List[LORA_ATTN_MODULES]): list of which linear layers
            LoRA should be applied to. Options are ``{"q_proj", "k_proj", "v_proj",
            "output_proj"}``.
        embed_dim (int): embedding dimension for self-attention
        head_dim (int): dimension of each head in the multihead attention. Usually
            computed as ``embed_dim // num_heads``.
        num_heads (int): number of query heads. For MHA this is also the
            number of heads for key and value
        num_kv_heads (int): number of key and value heads. User should ensure
            `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`,
            for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1.
        attn_dropout (float): dropout value passed onto scaled_dot_product_attention.
            Default: 0.0
        lora_rank (int): rank of each low-rank approximation
        lora_alpha (float): scaling factor for the low-rank approximation
        lora_dropout (float): LoRA dropout probability. Default: 0.0
        use_dora (bool): Whether to use DoRA layers instead of LoRA layers. Default is ``False``.
        quantize_base (bool): Whether to quantize base model parameters for linear layers
            LoRA is being applied to. Default is ``False``.

    Returns:
        MultiHeadAttention: instantiation of self-attention module with LoRA
        applied to a subset of Q, K, V, output projections.

    Raises:
        ValueError: If lora_modules arg is an empty list
    zMust pass one or more of z as lora_modulesr5   )rankalphadropoutre   r0   r1   r6   r7   r8   N)
r   r   r3   r4   r5   r6   r7   r8   r9   r:   )rR   r   r   r   r   rS   r   r   )rr   r   r4   r   r3   r:   r"   rn   ro   rp   rq   re   rk   adapter_clsr5   r6   r7   r8   rV   s                      rX   rw   rw     s   d  
K(9KKK
 
 	
 !)8**jK |## 	 	
  '	
 	
 "	
 	
 	
 !BIiX!5IFFFF 9x/ 6?CV  : |## 	8#	
  '	
 	
 "	
 	
 	
 !BIi!8yIIII x'   &	  @ |## 	8#	
  '	
 	
 "	
 	
 	
 !BIi!8yIIII x'   &	  @ L(( 		
  '	
 	
 "	
 	
 	
 !BIi;;;; 9 +48K  ( #!!  I rZ   )rp   rq   re   c        	             |rt           nt          }
 |
d| |||||dd|	} |
d||||||dd|	}t          ||d|          S )zQ
    Build the MLP layer with LoRA applied to the gate and down projections.
    T)r<   r>   r}   r~   r   re   use_biasNrg   rv   )r   r   r   )r<   r>   r=   r    rn   ro   rp   rq   re   rk   r   rh   ri   s                rX   rx   rx   O  s      !)8**jK 	#	 	 	 	I  	#	 	 	 	I y$:   rZ   )r[   r\   r?   )F)*	functoolsr   typingr   r   r   torchr   *torchtune.models.clip._position_embeddingsr   r	   r
   #torchtune.models.clip._text_encoderr   r   torchtune.modulesr   r   r   r   r   r   torchtune.modules.common_utilsr   torchtune.modules.peftr   r   r   $torchtune.modules.vision_transformerr   r   SiLUintboolrY   floatrd   ModulerT   r{   rw   rx   rv   rZ   rX   <module>r      s}         + + + + + + + + + +               
 K J J J J J J J                W V V V V V L L L L L L L L L L Q Q Q Q Q Q Q Q 7'+"'"#T TTT T 	T
 T T T T T $s)$T  T T T T T  !T T T Tv 8 888 8 	8
 8 8 8 8 8@     		
     > $d 7'+"'-d d d()dd
 d d d d d d d d $s)$d  d d  !d$ %d& 'd( )d* +d, -d0 1d d d d^  W W W()W 	W
 W W W W W W W W W W" #W W W WD ' ' '' ' 	'
 	' ' ' ' ' ' ' ' ' ' ' 'rZ   