
    Pi                     B    d dl mZmZmZ d dlmZ  G d de          ZdS )    )AnyDictList)SentencePieceBaseTokenizerc            	            e Zd ZdZddededef fdZded	ee         f fd
Z		 dde
eef         ded	e
eef         fdZ xZS )T5Tokenizerz
    Text tokenizer for T5.

    Args:
        path (str): the path to the T5 sentencepiece tokenizer file
        max_seq_len (int): the context length
        truncate (bool): whether to truncate the token sequence when longer than max_seq_len
       Tpathmax_seq_lentruncatec                 f    t                                          |           || _        || _        d S )N)super__init__r   r   )selfr
   r   r   	__class__s       r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/t5/_tokenizer.pyr   zT5Tokenizer.__init__   s.    &     textreturnc                     t                                          |dddd          }t          |          | j        k    r*| j        s
J d            |d| j                 }| j        |d<   |S )z
        Given a string, return the encoded list of token ids.

        Args:
            text (str): The text to encode.

        Returns:
            List[int]: The encoded list of token ids.
        FTN)add_bosadd_eostrim_leading_whitespaceprefixzWTokenized text is larger than the maximum sequence length but truncate is set to False.)r   encodelenr   r   eos_id)r   r   tokensr   s      r   r   zT5Tokenizer.encode   s     $)   
 
 v;;)))=  , = .d../FF2Jr   Fsample	inferencec                 `    |                     d          }|                     |          |d<   |S )aW  
        Tokenize the "text" field in the sample.

        Args:
            sample (Dict[str, Any]): A sample with a "text" field containing a string to tokenize
            inference (bool): Unused by this tokenizer

        Returns:
            Dict[str, Any]: The sample with added "tokens" field and the "messages" field removed.
        r   r   )popr   )r   r    r!   r   s       r   __call__zT5Tokenizer.__call__6   s0     zz&!!;;t,,xr   )r	   T)F)__name__
__module____qualname____doc__strintboolr   r   r   r   r   r$   __classcell__)r   s   @r   r   r      s         ! !S !s !D ! ! ! ! ! !
3 49      6 9> 38n15	c3h       r   r   N)typingr   r   r   6torchtune.modules.transforms.tokenizers._sentencepiecer   r    r   r   <module>r0      sx    # " " " " " " " " "     
8 8 8 8 8, 8 8 8 8 8r   