
    Pi                         d dl mZmZmZmZmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZmZmZ g dZ G d dee          Zd	S )
    )AnyListMappingOptionalTuple)MessagePromptTemplate)Llama2ChatTemplate)	Transform)ModelTokenizerSentencePieceBaseTokenizer#tokenize_messages_no_special_tokens) 
	c                      e Zd ZdZd e            fdedee         dee         fdZ	e
d             Ze
d             Ze
d	             Ze
d
             Z	 	 	 ddededededee         f
dZdee         defdZddddee         dededeee         ee         f         fdZ	 ddeeef         dedeeef         fdZdS )Llama2Tokenizera  
    Llama2's implementation of the SentencePiece tokenizer. Llama2Tokenizer does
    not include any additional special tokens. The prompt template described in
    https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-2/ describes
    [INST][/INST] and <<SYS>><</SYS>> as special tokens but these are not registered
    as unique ids and are tokenized as normal text. When using this tokenizer on the
    pre-trained model for inference, the prompt template
    :class:`~torchtune.models.llama2.Llama2ChatTemplate` is by default applied to your data
    before tokenization to add the [INST] and <<SYS>> tags for optimal performance.
    For more details, see https://pytorch.org/torchtune/main/tutorials/chat.html#tokenizing-prompt-templates-special-tokens.

    Args:
        path (str): Path to pretrained SentencePiece tokenizer file.
        max_seq_len (Optional[int]): A max sequence length to truncate tokens to.
            Default: None
        prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used
            to add structured text around the actual messages. The structured text is used in three scenarios:

            - Task-specific templates to gear models for a particular task that it will expect after training
            - Model-specific templates that are required whenever the model is prompted, such as the [INST]
              tags in Llama2 and in Mistral
            - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`

            The extra text will still get tokenized as normal text, not as special tokens.
            Default is :class:`~torchtune.models.llama2.Llama2ChatTemplate`.

    Examples:
        >>> tokenizer = Llama2Tokenizer("/path/to/spm_model")
        >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
        >>> print(tokenized_text)
        [1, 31587, 29644, 102, 2]
    Npathmax_seq_lenprompt_templatec                 |    t          |          | _        d| j        _        | j        g| _        || _        || _        d S )Nr   )r   
_spm_modelpad_ideos_idstop_tokensr   r   )selfr   r   r   s       v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/llama2/_tokenizer.py__init__zLlama2Tokenizer.__init__7   sC     5T:: "# !K=&.    c                     | j         j        S N)r   r   r   s    r   r   zLlama2Tokenizer.eos_idI       %%r!   c                     | j         j        S r#   )r   bos_idr$   s    r   r'   zLlama2Tokenizer.bos_idM   r%   r!   c                     | j         j        S r#   )r   r   r$   s    r   r   zLlama2Tokenizer.pad_idQ   r%   r!   c                     | j         j        S r#   )r   
vocab_sizer$   s    r   r*   zLlama2Tokenizer.vocab_sizeU   s    ))r!   TFtextadd_bosadd_eostrim_leading_whitespacereturnc                 >    | j                             ||||          S )N)r,   r-   r.   )r   encode)r   r+   r,   r-   r.   s        r   r1   zLlama2Tokenizer.encodeY   s/     %%$;	 & 
 
 	
r!   	token_idsc                 6    | j                             |          S r#   )r   decode)r   r2   s     r   r4   zLlama2Tokenizer.decodeg   s     %%i000r!   )add_start_tokensadd_end_tokensmessagesr5   r6   c                    | j         |                      |          n|}t          | ||r| j        nd|r| j        nd          S )a  Tokenize a list of messages one at a time then concatenate them,
        returning a list of tokens and a list of masks.

        Note:
            sentencepiece has problems where in general
            encode(s1 + s2) != encode(s1) + encode(s2) due to whitespace handling.
            We can get around this by prepending s2 with a known token and slicing the
            beginning off the tokenized s2.

        Example:
            >>> tokenizer = Llama2Tokenizer(tokenizer_path, max_seq_len)
            >>> messages = [
                Message(role="system", content="system message\n", masked=True),
                Message(role="user", content="user prompt\n", masked=True),
                Message(role="assistant", content="assistant response\n"),
            ]

            >>> # tokenize_messages encodes messages separately and concats
            >>> tokenizer.tokenize_messages(messages)[0]
            [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]

            >>> # Same result as encoding the full string in one go
            >>> tokenizer.encode(''.join([message.content for message in messages]))
            [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]


        Args:
            messages (List[Message]): A list of messages, each containing role, content,
                and masked attributes.
            add_start_tokens (bool): Whether to add BOS token to the beginning of the first message.
                Default True.
            add_end_tokens (bool): Whether to add EOS token to the end of the last message. Default True.

        Returns:
            Tuple[List[int], List[bool]]: The tokenized messages
        N)	tokenizerr7   r'   r   )r   r   r'   r   )r   r7   r5   r6   templated_messagess        r   tokenize_messagesz!Llama2Tokenizer.tokenize_messagesm   sg    Z #/   *** 	
 3'"2<4;;"0:4;;d	
 
 
 	
r!   sample	inferencec                 z    |                     d          }|                     ||           \  }}||d<   ||d<   |S )a  
        Apply ``tokenize_messages`` to the "messages" field in the sample.

        Args:
            sample (Mapping[str, Any]): A sample with a "messages" field containing
                a List[Message] to tokenize
            inference (bool): Whether the template is being used for inference or not.

        Returns:
            Mapping[str, Any]: The sample with added "tokens" and "mask" fields
                and the "messages" field removed.
        r7   )r6   tokensmask)popr;   )r   r<   r=   r7   r?   r@   s         r   __call__zLlama2Tokenizer.__call__   sK     ::j))--h9}-UU!xvr!   )TTF)F)__name__
__module____qualname____doc__r
   strr   intr	   r    propertyr   r'   r   r*   boolr   r1   r4   r   r   r;   r   r   rB    r!   r   r   r      s        H &*4F4F4H4H	/ // c]/ ".1	/ / / /$ & & X& & & X& & & X& * * X* (-
 

 
 	

 "&
 
c
 
 
 
191 
1 1 1 1 "&#5
 5
 5
w-5
 	5

 5
 
tCy$t*$	%5
 5
 5
 5
p <A c3h'48	c	     r!   r   N)typingr   r   r   r   r   torchtune.datar   r	   (torchtune.models.llama2._prompt_templater
   torchtune.modules.transformsr   'torchtune.modules.transforms.tokenizersr   r   r   WHITESPACE_CHARSr   rK   r!   r   <module>rR      s    7 6 6 6 6 6 6 6 6 6 6 6 6 6 2 2 2 2 2 2 2 2 G G G G G G 2 2 2 2 2 2          100 b b b b bni b b b b br!   