
    Pi~3                         d dl Z d dlmZmZmZmZmZmZ d dlm	Z	m
Z
mZ d dlmZ d dlmZmZ dZddd	d
ddddddddddZdZd  ee ee          z
            D             Zi eeZ G d dee          ZdS )    N)AnyDictListMappingOptionalTuple)MessagePromptTemplatetruncate)	Transform)ModelTokenizerTikTokenBaseTokenizerzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+i  i i i i i i i i i	 i
 i  i )<|begin_of_text|><|end_of_text|>z<|reserved_special_token_0|>z<|reserved_special_token_1|><|finetune_right_pad_id|><|step_id|><|start_header_id|><|end_header_id|>
<|eom_id|>
<|eot_id|><|python_tag|>	<|image|>z	<|video|>   c                 (    i | ]}d d|z    dd|z   S )z<|reserved_special_token_   z|>i  ).0is     v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/llama3/_tokenizer.py
<dictcomp>r    &   s=       	 *A)))6A:      c            
          e Zd ZdZ	 	 	 d%dedeeeef                  dee         dee         fdZ	d Z
d	ed
efdZed
efd            Zed
efd            Z	 	 d&d	ededed
ee         fdZ	 	 d&dee         deded
efdZded
ee         fdZded
ee         fdZded
ee         fdZddddededed
ee         fdZdddee         ded
eee         ee         f         fd Z	 d'd"eeef         d#ed
eeef         fd$ZdS )(Llama3Tokenizera&  
    tiktoken tokenizer configured with Llama3 Instruct's special tokens, as described in
    https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3

    Args:
        path (str): Path to pretrained tiktoken tokenizer file.
        special_tokens (Optional[Dict[str, int]]): mapping containing special text tokens and
            their registered token IDs. If left as None, this will be set to the canonical
            Llama3 special tokens.
        max_seq_len (Optional[int]): maximum sequence length for tokenizing a single list of messages,
            after which the input will be truncated. Default is None.
        prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used
            to add structured text around the actual messages. The structured text is used in three scenarios:

            - Task-specific templates to gear models for a particular task that it will expect after training
            - Model-specific templates that are required whenever the model is prompted, such as the [INST]
              tags in Llama2 and in Mistral
            - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`

            The extra text will still get tokenized as normal text, not as special tokens. Default is None.

    Examples:
        >>> tokenizer = Llama3Tokenizer("/path/to/tt_model")
        >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
        >>> print(tokenized_text)
        [1, 31587, 29644, 102, 2]
    Npathspecial_tokensmax_seq_lenprompt_templatec                    ||nt           | _        |                                  | j        d         | _        | j        d         | _        | j        d         | _        | j        d         | _        | j        d         | _        | j        d         | _        | j        d         | _	        | j        d         | _
        | j        d	         | _        | j        d
         | _        | j        | j	        | j
        g| _        t          |dt          | j        | j        | j                  | _        || _        || _        t'          j        d          | _        t'          j        d          | _        d S )Nr   r   r   r   r   r   r   r   r   r   llama3_tiktoken)r$   namepatternbos_ideos_idr%   z	<\|.*?\|>z/<\|start_header_id\|>.*?<\|end_header_id\|>\n\n)LLAMA3_SPECIAL_TOKENSr%   _validate_special_tokensr,   r-   pad_idstep_idstart_header_idend_header_ideot_ideom_id
python_tagimage_idstop_tokensr   CL100K_PATTERNtt_modelr&   r'   recompile_special_token_regex_special_token_header_regex)selfr$   r%   r&   r'   s        r   __init__zLlama3Tokenizer.__init__K   sZ    -8NN>S 	 	%%''' )*=>)*;<)*EF*=9  $23HI!01DE),7),7-.>? +K8 !KdkB-"";;.
 
 
 '. %'J|$<$<!+-:>,
 ,
(((r!   c                 F    dD ]}|| j         vrt          | d          dS )zV
        Validate that required special tokens are passed into the tokenizer.
        )r   r   r   r   r   r   r   z missing from special_tokensN)r%   
ValueError)r?   tokens     r   r/   z(Llama3Tokenizer._validate_special_tokens~   sM    
 
	I 
	IE D/// E!G!G!GHHH 0
	I 
	Ir!   textreturnc                 j    | j                             d| j                            d|                    S )z@
        Remove special tokens from the decoded string.
         )r=   subr>   )r?   rD   s     r   _remove_special_tokensz&Llama3Tokenizer._remove_special_tokens   s7    
 (,,044R>>
 
 	
r!   c                     | j         j        S N)r:   base_vocab_sizer?   s    r   rL   zLlama3Tokenizer.base_vocab_size   s    },,r!   c                     | j         j        S rK   )r:   
vocab_sizerM   s    r   rO   zLlama3Tokenizer.vocab_size   s    }''r!   Tadd_bosadd_eosc                 <    | j                             |||          S )N)rD   rP   rQ   )r:   encode)r?   rD   rP   rQ   s       r   rS   zLlama3Tokenizer.encode   s!     }##w#PPPr!   	token_idstruncate_at_eosskip_special_tokensc                 l    | j                             ||          }|r|                     |          n|S )a  
        Decode a list of token ids into a string.

        Args:
            token_ids (List[int]): The list of token ids.
            truncate_at_eos (bool): Whether to truncate the string at the end of
                sequence token. Default is True.
            skip_special_tokens (bool): Whether to show or skip special tokens in the decoded string.
                Default is True.

        Returns:
            str: The decoded string.
        )rT   rU   )r:   decoderI   )r?   rT   rU   rV   decoded_strings        r   rX   zLlama3Tokenizer.decode   sK    . --+ . 
 
 # D''777	
r!   messagec                     | j         g|                     |j                                        dd          z   | j        gz   |                     ddd          z   S )zT
        Tokenize header start, message role, and header end as list of ids
        FrP   rQ   z

)r2   rS   rolestripr3   r?   rZ   s     r   _tokenize_headerz Llama3Tokenizer._tokenize_header   sf    
 !"kk',,,..ukMMN!"# kk&%k??@	
r!   c                 0    |j         r| j        gn| j        gS )z>
        Add eot or eom id at the end of the message.
        )eotr4   r5   r_   s     r   _tokenize_endzLlama3Tokenizer._tokenize_end   s     !(>}}$+>r!   c                     g }|j         D ]q}|d         dk    r4||                     |d                                         dd          z  }B|d         dk    r|| j        gz  }Zt	          d|d                    |j        r| j        g|z   }|S )z9
        Tokenize message content as list of ids
        typerD   contentFr\   imagez"Unsupported message content type: )rf   rS   r^   r7   RuntimeErroripythonr6   )r?   rZ   tokenized_bodyitems       r   _tokenize_bodyzLlama3Tokenizer._tokenize_body   s     O 	X 	XDF|v%%$++O))++UE #. # #  f((4=/1"#VV#V#VWWW? 	@"o.?Nr!   )add_start_tokensadd_end_tokensrm   rn   c                    |r|                      |          ng }|                     |          }|r|                     |          ng }||z   |z   }|S )a  
        Tokenize a message into a list of token ids.

        Args:
            message (Message): The message to tokenize.
            add_start_tokens (bool): Whether to prepend a tokenized header to the message. Default is True.
            add_end_tokens (bool): Whether to append eot or eom id at the end of the message. Default is True.

        Returns:
            List[int]: The list of token ids.
        )r`   rl   rc   )r?   rZ   rm   rn   tokenized_headerrj   tokenized_endtokenized_messages           r   tokenize_messagez Llama3Tokenizer.tokenize_message   sj    $ >NU400999SU,,W557EM**73332,~=M  r!   rn   messagesc                   | j         |                      |          n|}| j        g}dg}t          |          }t          |          D ]j\  }}||dz
  k    r|nd}	|                     ||	          }
||
z   }||j        gt          |
          z  z   }| j        rt          |          | j        k    r nk|r|| j        gz   }|dgz   }| j        r9t          || j        |r| j        nd          }t          || j        |rdnd          }||fS )a  
        Tokenize a list of messages into a list of token ids and masks.

        Args:
            messages (List[Message]): The list of messages to tokenize.
            add_end_tokens (bool): Whether to append end tokens ids (end-of-seq, end-of-turn, end-of-message) at the end of the
                last assistant message. This value should be set to False for generation. Default is True.

        Examples:
            >>> # Tokenize a list of messages with default settings
            >>> messages = [
            ...     Message(role="user", content="Hello world!", masked=True),
            ...     Message(role="assistant", content="How are you?", masked=False),
            ... ]
            >>> tokenizer = Llama3Tokenizer("/path/to/tt_model")
            >>> tokenizer.tokenize_messages(messages)
            ([1, 31587, 29644, 102, 1, 31587, 29644, 102, 2], [True, True, True, True, True, False, False, False, True])

            >>> # Tokenize a list of messages with add_end_tokens set to False
            >>> tokenizer.tokenize_messages(messages, add_end_tokens=False)
            ([1, 31587, 29644, 102, 1, 31587, 29644], [True, True, True, True, True, False, False])

        Returns:
            Tuple[List[int], List[bool]]: The list of token ids and the list of masks.
        NT   rt   )	r'   r,   len	enumeraters   maskedr&   r-   r   )r?   ru   rn   templated_messagestokensmasknum_messagesr   rZ   add_end_tokens_to_messagerr   s              r   tokenize_messagesz!Llama3Tokenizer.tokenize_messages  sq   B #/   *** 	
 +v-..#$677 	 	JAw #$|a'7"7"7T & !% 5 5(A !6 ! ! //FGN+c2C.D.DDED CKK43C$C$C 	!t{m+F4&=D 	V(*Q$++T F D$"2N4TDDPTUUDt|r!   Fsample	inferencec                 z    |                     d          }|                     ||           \  }}||d<   ||d<   |S )a  
        Apply ``tokenize_messages`` to the "messages" field in the sample.

        Args:
            sample (Mapping[str, Any]): A sample with a "messages" field containing
                a List[Message] to tokenize
            inference (bool): Whether the template is being used for inference or not.

        Returns:
            Mapping[str, Any]: The sample with added "tokens" and "mask" fields
                and the "messages" field removed.
        ru   rt   r|   r}   )popr   )r?   r   r   ru   r|   r}   s         r   __call__zLlama3Tokenizer.__call__L  sK     ::j))--h9}-UU!xvr!   )NNN)TT)F)__name__
__module____qualname____doc__strr   r   intr
   r@   r/   rI   propertyrL   rO   boolr   rS   rX   r	   r`   rc   rl   rs   r   r   r   r   r   r   r!   r   r#   r#   .   s        > 48%)481
 1
1
 !c3h01
 c]	1

 ".11
 1
 1
 1
fI I I$
3 
3 
 
 
 
 - - - - X- (C ( ( ( X( 	Q QQ Q 	Q
 
cQ Q Q Q !%$(	
 
9
 
 "	

 

 
 
 
B	
 	
DI 	
 	
 	
 	
?W ?c ? ? ? ?g $s)    0 "&#! ! !! 	!
 ! 
c! ! ! !:  $	B B Bw-B 	B
 
tCy$t*$	%B B B BJ <A c3h'48	c	     r!   r#   )r;   typingr   r   r   r   r   r   torchtune.datar	   r
   r   torchtune.modules.transformsr   'torchtune.modules.transforms.tokenizersr   r   r9   SPECIAL_TOKENSNUM_RESERVED_SPECIAL_TOKENSrangerx   RESERVED_TOKENSr.   r#   r   r!   r   <module>r      sg   
			 < < < < < < < < < < < < < < < < < < < < < < < < < < 2 2 2 2 2 2        L  $*$*!'!   "  U.^1D1DDEE  
 >>=_= q q q q qni q q q q qr!   