
    Pi(                         d dl mZmZmZmZmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ ddd	d
ddddddddZ G d dee          ZdS )    )AnyDictListMappingOptionalTuple)Message)PromptTemplate)truncate)	Transform)ModelTokenizerSentencePieceBaseTokenizer }  i}  i}  i}  i}  i}  i}  i}  i}  i	}  i
}  )<|endoftext|><|assistant|>z<|placeholder1|>z<|placeholder2|>z<|placeholder3|>z<|placeholder4|>
<|system|><|end|>z<|placeholder5|>z<|placeholder6|><|user|>c                   p   e Zd ZdZ	 	 	 ddedeeeef                  dee         dee         fdZ	e
d             Ze
d	             Z	 	 	 ddededededee         f
dZddee         dedefdZddddee         dededeee         ee         f         fdZ	 ddeeef         dedeeef         fdZdS ) Phi3MiniTokenizera  
    SentencePiece tokenizer configured with Phi3 Mini's special tokens.

    Args:
        path (str): Path to pretrained tokenizer file.
        special_tokens (Optional[Dict[str, int]]): mapping containing special text tokens and
            their registered token IDs. If left as None, this will be set to the canonical
            Phi3 special tokens.
        max_seq_len (Optional[int]): A max sequence length to truncate tokens to.
            Default: None
        prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used
            to add structured text around the actual messages. The structured text is used in three scenarios:

            - Task-specific templates to gear models for a particular task that it will expect after training
            - Model-specific templates that are required whenever the model is prompted, such as the [INST]
              tags in Llama2 and in Mistral
            - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`

            The extra text will still get tokenized as normal text, not as special tokens. Default is None.

    Examples:
        >>> tokenizer = Phi3MiniTokenizer("/path/to/spm_model")
        >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
        >>> print(tokenized_text)
        [1, 31587, 29644, 102, 2]
    Npathspecial_tokensmax_seq_lenprompt_templatec                     t          |          | _        ||nt          | _        | j        d         | _        | j        d         | _        | j        g| _        || _        || _        d S )Nr   )	r   
_spm_modelPHI3_SPECIAL_TOKENSr   eos_idpad_idstop_tokensr   r   )selfr   r   r   r   s        t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/phi3/_tokenizer.py__init__zPhi3MiniTokenizer.__init__=   sn     5T:: -8NN>Q 	
 )/:)/: !K=&.    c                     | j         j        S N)r   
vocab_sizer!   s    r"   r'   zPhi3MiniTokenizer.vocab_sizeU   s    ))r$   c                     | j         j        S r&   )r   bos_idr(   s    r"   r*   zPhi3MiniTokenizer.bos_idY   s    %%r$   TFtextadd_bosadd_eostrim_leading_whitespacereturnc                 >    | j                             ||||          S )Nr,   r-   r.   )r   encode)r!   r+   r,   r-   r.   s        r"   r2   zPhi3MiniTokenizer.encode]   s/     %%$;	 & 
 
 	
r$   idsskip_special_tokensc                     g }|D ]&}|r|dk    r|dk    r|                     |           '| j                            |          S )a0  Decode token IDs to strings.

        Args:
            ids (List[int]): The input token IDs to be decoded.
            skip_special_tokens (bool): Whether to show or skip special tokens in the decoded string.
                Default is True.

        Returns:
            str: The decoded text.
        r   i@}  )appendr   decode)r!   r3   r4   ids_for_decodetoken_ids        r"   r7   zPhi3MiniTokenizer.decodek   sf      	0 	0H # 0F(:(:x6?Q?Q%%h////%%n555r$   )r-   ignore_system_promptmessagesr:   c          	      p   | j         |                      |          n|}d}d}g }g }|                     ddd          }	|D ]}
|r|
j        dk    r|r4|                    | j                   |                    |
j                   |
j        dk    r;|                    | j        d                    |                    |
j                   n|
j        d	k    r=|                    | j        d
                    d}|                    |
j                   nf|
j        dk    r;|                    | j        d                    |                    |
j                   n t          d|
j         d|
j         d          |	                    |	           |	                    |
j        gt          |	          z             g }|
j        D ][}|d         dk    r6||                     |d                             d          ddd          z   }Dt          d|d                    || j        d         gz   |	z   }|	                    |           |	                    |
j        gt          |          z             |r;|r9|                    | j                   |                    |
j                   d}d}nd}| j        rt          |          | j        k    r n| j        rVt          |          | j        k    r>t          || j        |r| j        nd          }t          || j        |r|
j        nd          }||fS )a  Tokenize a list of messages one at a time then concatenate them,
        returning a list of tokens and a list of masks.

        Example:
            >>> tokenizer = Phi3MiniTokenizer(tokenizer_path, max_seq_len)
            >>> messages = [
                Message(role="system", content="system message\n", masked=True),
                Message(role="user", content="user prompt\n", masked=True),
                Message(role="assistant", content="assistant response\n"),
            ]

            >>> # tokenize_messages encodes messages separately and concats
            >>> tokenizer.tokenize_messages(messages)[0]
            [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]

            >>> # Same result as encoding the full string in one go
            >>> tokenizer.encode(''.join([message.content for message in messages]))
            [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2]


        Args:
            messages (List[Message]): A list of messages, each containing role, content,
                and masked attributes.
            add_eos (bool): Whether to append EOS after assistant message, default to False
            ignore_system_prompt (bool): Whether to ignore system prompt, defaults to False.

        Raises:
            ValueError: If the role is not "user", "assistant", or "system".
            RuntimeError: If ``message["type"] != "text``.

        Returns:
            Tuple[List[int], List[bool]]: The tokenized messages
        NTF
)r,   r-   systemuserr   	assistantr   r   zUnknown role 'z' for message: ''typer+   content r1   z"Unsupported message content type: r   )r   r2   roler6   r*   maskedr   
ValueErrorrC   extendlenrstripRuntimeErrorr   r   r   )r!   r;   r-   r:   templated_messagesstart_of_turnend_of_turntokenized_messagesmasknew_line_token_idmessagetokensitems                r"   tokenize_messagesz#Phi3MiniTokenizer.tokenize_messages   s   T #/   *** 	  !KKeUKKK) =	 =	G# (@(@  ,"))$+666GN+++ |v%%"))$*=j*IJJJGN++++,,"))$*=o*NOOO"GN++++))"))$*=l*KLLLGN++++ UW\UU7?UUU  
 %%&7888KK(3/@+A+AABBB F  <6))#dkkY..s33 % %04	 '2 ' ' FF 'KT&\KK   t29=>>ARRF%%f---KK(3v;;6777  &w &"))$+666GN+++# $ %  C(:$;$;t?O$O$O  	Y$6 7 74;K K K!)"D$4W6VdkkRV" " D$"2g4WGNNSWXXD!4''r$   sample	inferencec                 t    |                     d          }|                     |          \  }}||d<   ||d<   |S )a%  
        Apply ``tokenize_messages`` to the "messages" field in the sample.

        Args:
            sample (Mapping[str, Any]): A sample with a "messages" field containing
                a List[Message] to tokenize
            inference (bool): Whether the template is being used for inference or not.

        Returns:
            Mapping[str, Any]: The sample with added "tokens" and "mask" fields
                and the "messages" field removed.
            inference (bool): Whether the template is being used for inference or not.
        r;   rS   rP   )poprU   )r!   rV   rW   r;   rS   rP   s         r"   __call__zPhi3MiniTokenizer.__call__   sD      ::j))--h77!xvr$   )NNN)TTF)T)F)__name__
__module____qualname____doc__strr   r   intr
   r#   propertyr'   r*   boolr   r2   r7   r	   r   rU   r   r   rZ    r$   r"   r   r   !   s        < 48%)48/ // !c3h0/ c]	/
 ".1/ / / /0 * * X* & & X& (-
 

 
 	

 "&
 
c
 
 
 
6 6$s) 6$ 6# 6 6 6 62 %*|( |( |(w-|( 	|(
 #|( 
tCy$t*$	%|( |( |( |(~ <A c3h'48	c	     r$   r   N)typingr   r   r   r   r   r   torchtune.data._messagesr	    torchtune.data._prompt_templatesr
   torchtune.data._utilsr   torchtune.modules.transformsr   'torchtune.modules.transforms.tokenizersr   r   r   r   rc   r$   r"   <module>rj      s   = < < < < < < < < < < < < < < < , , , , , , ; ; ; ; ; ; * * * * * * 2 2 2 2 2 2          q q q q q	 q q q q qr$   