
    Pid/                        d dl Z d dlmZmZmZ d dlZd dlmc mZ	 d dlmZm
Z
 d dlmZ d dlmZ  G d dej                  Z G d d	ej                  Z G d
 dej                  Z G d dej                  Zdededede
fdZdS )    N)ListOptionalUnion)nnTensor)MultiHeadAttention)_get_clonesc                        e Zd ZdZdddej        deej        eej                 ej	        f         dej        de
de
d	e
d
e
dee
         ddf fdZdedefdZ xZS )	T5Encodera  
    The T5 encoder module.

    T5 paper: https://arxiv.org/abs/1910.10683

    Args:
        token_embedding (nn.Embedding): PyTorch embedding layer to place tokens in an embedding space.
        layers (Union[nn.Module, List[nn.Module], nn.ModuleList]): A single encoder layer.
        final_norm (nn.Module): Module that applies normalization to the output of the encoder
        num_heads (int): The number of attention heads.
        rel_pos_num_buckets (int): Number of discrete buckets to divide the relative positions into.
            See: :class:`~torchtune.models.t5._encoder.T5EncoderRelativePositionBias`
        rel_pos_max_dist (int): Maximum distance for relative positions.
            Distances beyond this are grouped into the last bucket.
            See: :class:`~torchtune.models.t5._encoder.T5EncoderRelativePositionBias`
        max_seq_len (int): The maximum sequence length (context length) of the model.
        num_layers (Optional[int]): Number of encoder layers, only define when layers is not a list.

    Raises:
        AssertionError:
            If ``num_layers`` is set and layer is a list, **or**
            ``num_layers`` is not set and layer is an ``nn.Module``.

    N)
num_layerstoken_embeddinglayers
final_norm	num_headsrel_pos_num_bucketsrel_pos_max_distmax_seq_lenr   returnc                   t                                                       || _        || _        || _        t          ||||          | _        d | _        t          |t          j
                  r	|| _        d S t          |t                    rt          j
        |          | _        d S t          |t          j                  st          d          |t          d          t          ||          | _        d S )N)num_bucketsmax_distr   r   z.num_layers is defined, layers must be a modulez0num_layers is not defined, layers must be a list)super__init__r   r   r   T5EncoderRelativePositionBiasrelative_position_biasr   
isinstancer   
ModuleListlistModuleAssertionErrorr	   )
selfr   r   r   r   r   r   r   r   	__class__s
            p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/t5/_encoder.pyr   zT5Encoder.__init__+   s     	.$&&C+%#	'
 '
 '
# fbm,, 		: DKKK%% 	:-//DKKKfbi00 W$%UVVV!$%WXXX%fj99DKKK    tokensc                 
   |j         \  }}|| j        k    rt          d| d| j         d          |                     |          }|                                 }| j        D ]} |||          }|                     |          S )a  
        Args:
            tokens (Tensor): input tensor with shape ``[bsz, max_seq_len]``

        Returns:
            Tensor: output tensor with shape [bsz, max_seq_len, embed_dim]

        Raises:
            ValueError: if seq_len of tokens is bigger than max_seq_len
        z	seq_len (z6) of input tensor should be smaller than max_seq_len ())shaper   
ValueErrorr   r   r   r   )r!   r%   bszseq_lenxrel_pos_biaslayers          r#   forwardzT5Encoder.forwardN   s     |WT%%%9G 9 9%)%59 9 9     (( 2244 [ 	' 	'Ea&&AAq!!!r$   )__name__
__module____qualname____doc__r   	Embeddingr   r   r   r   intr   r   r   r/   __classcell__r"   s   @r#   r   r      s         F %)!: !: !: !: bibi"-?@	!:
 I!: !: !!: !: !: SM!: 
!: !: !: !: !: !:F"f " " " " " " " " "r$   r   c            
       j     e Zd ZdZdedej        dej        dej        ddf
 fdZd	ed
edefdZ	 xZ
S )T5EncoderLayeray  
    Single layer of the T5 encoder (standard transformer layer with relative position bias).

    Args:
        attn (MultiHeadAttention): Attention module.
        mlp (nn.Module): Feed-forward module.
        sa_norm (nn.Module): Normalization to be applied before self-attention.
        mlp_norm (nn.Module): Normalization to be applied before the feed-forward layer.
    attnmlpsa_normmlp_normr   Nc                     t                                                       || _        || _        || _        || _        d S )N)r   r   r:   r;   r<   r=   )r!   r:   r;   r<   r=   r"   s        r#   r   zT5EncoderLayer.__init__y   s;     		 r$   r,   r-   c                     ||                      |                     |          |          z   }||                     |                     |                    z   }|S )  
        Args:
            x (Tensor): input tensor with shape [bsz, seq_len, embed_dim]
            rel_pos_bias (Tensor): relative position bias with shape [1, num_heads, max_seq_len, max_seq_len]
                See: :class:`~torchtune.models.t5._encoder.T5EncoderRelativePositionBias`

        Returns:
            Tensor: output tensor with shape [bsz, seq_len, embed_dim]
        )r:   r<   r;   r=   )r!   r,   r-   s      r#   r/   zT5EncoderLayer.forward   sM     		$,,q//<888q))***r$   )r0   r1   r2   r3   r   r   r   r   r   r/   r6   r7   s   @r#   r9   r9   n   s         ! ! Y! 	!
 )! 
! ! ! ! ! ! v &        r$   r9   c                   |     e Zd ZdZdedededej        dej        dej        dej        f fd	Zd
ededefdZ	 xZ
S )T5EncoderSelfAttentionaR  
    Self-attention for the T5 encoder.

    Standard self-attention with two differences:
        - No scaling factor
        - Add "relative position bias" to the attention scores.
            (See: :class:`~torchtune.models.t5._encoder.T5EncoderRelativePositionBias`)

    Args:
        embed_dim (int): The model dimension.
        num_heads (int): Number of attention heads.
        head_dim (int): Dimension of the attention heads (should equal `embed_dim // num_heads`)
        q_proj (nn.Module): Projection layer for query.
        k_proj (nn.Module): Projection layer for key.
        v_proj (nn.Module): Projection layer for value.
        output_proj (nn.Module): Projection layer for output.

    Raises:
        ValueError:
            If ``embed_dim % num_heads != 0``, **or**
            if ``embed_dim // num_heads != head_dim``
    	embed_dimr   head_dimq_projk_projv_projoutput_projc                    t                                                       ||z  dk    rt          d| d| d          ||z  |k    rt          d| d          || _        || _        || _        || _        || _        || _        d S )Nr   zembed_dim (z") must be divisible by num_heads (r'   z
head_dim (z)) must be equal to embed_dim // num_heads)	r   r   r)   r   rD   rE   rF   rG   rH   )	r!   rC   r   rD   rE   rF   rG   rH   r"   s	           r#   r   zT5EncoderSelfAttention.__init__   s     	y A%%+i + +'+ + +   	!X--PXPPP   # &r$   r,   r-   r   c                 `   |j         \  }}}|                     |          }|                     |          }|                     |          }|                    ||| j        | j                                      dd          }|                    ||| j        | j                                      dd          }|                    ||| j        | j                                      dd          }t          j	        ||                    dd                    }	|	|z  }	t          j        |	                                d                              |	j                  }
t          j	        |
|          }|                    dd                              |||          }|                     |          S )r@         )dim)r(   rE   rF   rG   viewr   rD   	transposetorchmatmulFsoftmaxfloattodtypereshaperH   )r!   r,   r-   r*   r+   rC   qkv
attn_scoreattn_weightattn_outs               r#   r/   zT5EncoderSelfAttention.forward   sn    #$'Wi KKNNKKNNKKNN FF3??II!QOOFF3??II!QOOFF3??II!QOO \!Q[[R%8%899
l"
i
 0 0 2 2;;;>>z?OPP<Q// %%a++33C)LL)))r$   )r0   r1   r2   r3   r5   r   r   r   r   r/   r6   r7   s   @r#   rB   rB      s         .'' ' 	'
 	' 	' 	' Y' ' ' ' ' ':* *v *& * * * * * * * *r$   rB   c                   @     e Zd ZdZdedededef fdZdefdZ xZS )	r   a-  
    Computes binned birectional relative position bias for the T5 encoder.

    It places relative positions into buckets and for each bucket, learns bias values for each attention head.

    Args:
        num_buckets (int): Number of discrete buckets to divide the relative positions into.
        max_dist (int): Maximum distance for relative positions (distances beyond this are grouped into the last bucket)
        num_heads (int): Number of attention heads in the transformer.
        max_seq_len (int): Maximum sequence length (context length).
    r   r   r   r   c                     t                                                       || _        t          j        ||          | _        |                     dt          |||          d           d S )Nrelative_position_to_bucketF)
persistent)r   r   r   r   r4   	embeddingregister_buffer#_calc_birectional_rel_pos_to_bucket)r!   r   r   r   r   r"   s        r#   r   z&T5EncoderRelativePositionBias.__init__   sv     	& k9== 	)/X{SS 	 	
 	
 	
 	
 	
r$   r   c                     |                      | j                  }|                    g d                              d          S )z
        Returns:
            torch.Tensor: relative position bias tensor with shape [1, num_heads, max_seq_len, max_seq_len]
        )rL   r   rK   r   )rd   rb   permute	unsqueeze)r!   r,   s     r#   r/   z%T5EncoderRelativePositionBias.forward	  s>     NN4;<< yy##--a000r$   )	r0   r1   r2   r3   r5   r   r   r/   r6   r7   s   @r#   r   r      s        
 


*-
:=
LO
 
 
 
 
 
 	1 	1 	1 	1 	1 	1 	1 	1 	1r$   r   r   r   r   r   c                    t          j        |t           j                  dddf         }t          j        |t           j                  dddf         }||z
  }t          j        |          }| dz  }|dz  }||k     }	|t          j        |                                |z            t          j        ||z            z  ||z
  z                      t           j                  z   }
t          j        |
t          j	        |
|dz
                      }
|dk                        t           j                  |z  t          j
        |	||
          z   }|S )a  
    Calculate the mapping from relative positions to bucket indices.

    NOTE: This is for the T5 encoder (birectional), not the decoder (unidirectional).

    Args:
        num_buckets (int): Number of discrete buckets to divide the relative positions into.
        max_dist (int): Maximum distance for relative positions (distances beyond this are grouped into the last bucket)
        max_seq_len (int): Maximum sequence length (context length).

    Returns:
        Tensor: shape=[max_seq_len, max_seq_len], range=[0, num_buckets]
    )rX   NrL   rK   r   )rR   arangelongabslogrV   mathrW   min	full_likewhere)r   r   r   query_positionskey_positionsrelative_positionsabs_relative_positionshalf_num_buckets	max_exactis_exactrelative_position_if_not_exactrb   s               r#   rf   rf     sb     l;ejAAA!!!T'JOLEJ???aaaHM&8"Y'9::
 #a' A%I%	1H &/	(..009<==
(8i'
(
(	)i'	) 	bnn	&"
 &+Y&68H18LMM& &" $6#9"="=
# ## ;(*H # '&r$   )ro   typingr   r   r   rR   torch.nn.functionalr   
functionalrT   r   torchtune.modulesr   torchtune.modules.transformerr	   r   r   r9   rB   r   r5   rf    r$   r#   <module>r      s    ( ( ( ( ( ( ( ( ( (                   0 0 0 0 0 0 5 5 5 5 5 5Z" Z" Z" Z" Z"	 Z" Z" Z"z$ $ $ $ $RY $ $ $NT* T* T* T* T*RY T* T* T*n&1 &1 &1 &1 &1BI &1 &1 &1R.'.' #.'25.'.' .' .' .' .' .'r$   