
    Pi                         d dl Z d dlmZ d dlZd dlmZmZ d dlmZ  G d dej                  Z	 G d dej                  Z
dS )	    N)Optional)nnTensor)	_MaskTypec                        e Zd ZdZdddddddej        d	ej        d
ededededef fdZdddede	e
         defdZ xZS )CLIPTextEncodera  
    Text encoder for CLIP.

    CLIP is a model that encodes text and images into a shared vector space.
    Blog post: https://openai.com/index/clip/
    Paper: https://arxiv.org/abs/2103.00020

    Args:
        layer (nn.Module): A single encoder layer.
        final_norm (nn.Module): Callable that applies normalization to the output of the encoder
        vocab_size (int): size of the vocabulary, default 49408
        max_seq_len (int): context size, default 77
        embed_dim (int): embedding/model dimension size, default 768
        num_layers (int): number of transformer layers, default 12
        eot_token (int): the id of the end-of-text token (for selecting the final output)
    i   M   i      i  )
vocab_sizemax_seq_len	embed_dim
num_layers	eot_tokenlayer
final_normr   r   r   r   r   c                d   t                                                       t          j        fdt	          |          D                       | _        || _        || _        || _        t          j	        ||          | _
        t          j        t          j        ||                    | _        d S )Nc                 8    g | ]}t          j                  S  )copydeepcopy).0ir   s     w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/clip/_text_encoder.py
<listcomp>z,CLIPTextEncoder.__init__.<locals>.<listcomp>.   s#    $U$U$UaT]5%9%9$U$U$U    )super__init__r   
ModuleListrangelayersr   r   r   	Embeddingtoken_embedding	Parametertorchemptyposition_embedding)	selfr   r   r   r   r   r   r   	__class__s	    `      r   r   zCLIPTextEncoder.__init__"   s     	m$U$U$U$U5CTCT$U$U$UVV$&"!|J	BB"$,u{;	/R/R"S"Sr   Nmasktokensr*   returnc                   |j         \  }}|| j        k    rt          d| d| j         d          |                     |          | j        z   }| j        D ]} |||          }|                     |          }|| j        k                                    	                    d          }|
                    |                    ddd          d                              d          }|S )a!  
        Args:
            tokens (Tensor): input tensor with shape ``[b x s]``
            mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication
                and before the softmax.
                Default is None.

        Returns:
            Tensor: output tensor with shape [b x d]

        Raises:
            ValueError: if seq_len of tokens is bigger than max_seq_len

        Shape notation:
            - b: batch size
            - s: token sequence length
            - d: token embed dim
        z	seq_len (z6) of input tensor should be smaller than max_seq_len ()r)   )dim   )shaper   
ValueErrorr"   r&   r    r   r   intargmaxtake_along_dimviewsqueeze)r'   r+   r*   bszseq_lenxr   eos_token_positionss           r   forwardzCLIPTextEncoder.forward6   s$   2 |WT%%%9G 9 9%)%59 9 9     ((4+BB [ 	 	E  AA OOA  &7<<>>EE"EMM055b!Q??QGGOOTUOVVr   )__name__
__module____qualname____doc__r   Moduler4   r   r   r   r   r=   __classcell__)r(   s   @r   r   r      s         ,  T T T yT I	T
 T T T T T T T T T T0 %)	0 0 00 y!	0
 
0 0 0 0 0 0 0 0r   r   c                   6    e Zd ZdZdej        dej        fdZdS )	QuickGELUz%
    Fast approximation of GELU.
    r;   r,   c                 6    |t          j        d|z            z  S )NgZd;?)r$   sigmoid)r'   r;   s     r   r=   zQuickGELU.forwardn   s    5=++++r   N)r>   r?   r@   rA   r$   r   r=   r   r   r   rE   rE   i   sD         , ,%, , , , , , ,r   rE   )r   typingr   r$   r   r   !torchtune.modules.attention_utilsr   rB   r   rE   r   r   r   <module>rJ      s                    7 7 7 7 7 7V V V V Vbi V V Vr, , , , ,	 , , , , ,r   