
    Pi:'                         d dl mZmZmZmZ d dlmZmZmZm	Z	 d dl
mZmZmZmZmZ i ei dddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(Z G d) d*e          Zd+S ),    )DictListOptionalTuple)ChatMLTemplateMessagePromptTemplatetruncate)&DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE	ENDOFTEXTIM_ENDQWEN2_SPECIAL_TOKENSQwen2Tokenizerz<|object_ref_start|>i^P z<|object_ref_end|>i_P z<|box_start|>i`P z<|box_end|>iaP z<|quad_start|>ibP z<|quad_end|>icP z<|vision_start|>idP z<|vision_end|>ieP z<|vision_pad|>ifP z<|image_pad|>igP z<|video_pad|>ihP <tool_call>iiP </tool_call>ijP z<|fim_prefix|>ikP z<|fim_middle|>ilP z<|fim_suffix|>imP z<|fim_pad|>inP ioP ipP )z<|repo_name|>z<|file_sep|>c                       e Zd ZdZedfddddeeeddedede	ee
f         dee
         d	ee         d
edee         dee         dedee         de
f fdZdddee         dedeee
         ee         f         fdZd Zd Zd Zd Z xZS )Qwen2_5Tokenizerad  This class construct a Qwen2.5 tokenizer, based on GPT-2 byte-level BPE tokenization.

    See <https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/qwen2/tokenization_qwen2.py>
    and <https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/tokenizer_config.json>.

    Args:
        path (str): Path to vocab.json file.
        merges_file (str): Path to merges.txt file.
            merges.txt contains all BPE merge operations, and this file is required to split a single word into
            byte-level BPE tokens.
        special_tokens (Dict[str, int]): Special tokens to add to the tokenizer. Default is QWEN2_5_SPECIAL_TOKENS.
        max_seq_len (Optional[int]): A max sequence length to truncate tokens to.
            Default: None
        prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used
            to add structured text around the actual messages. The structured text is used in three scenarios:

            - Task-specific templates to gear models for a particular task that it will expect after training
            - Model-specific templates that are required whenever the model is prompted, such as the [INST]
              tags in Llama2 and in Mistral
            - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`

            The extra text will still get tokenized as normal text, not as special tokens.
            Default: None
        errors (str): Paradigm to follow when decoding bytes to UTF-8. Defaults to "replace".
            See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (Optional[str]): The unknown token. A token that is not in the vocabulary cannot be converted
            to an ID and is set to be this token instead. Defaults to ``<|endoftext|>``.
        bos_token (Optional[str]): The beginning of sequence token. Defaults to None.
        eos_token (str): The end of sequence token. Defaults to ``<|endoftext|>``.
        pad_token (Optional[str]): The token used for padding. Defaults to ``<|endoftext|>``.
        bpe_cache_size (int): BPE token cache size in Qwen2Tokenizer.
            NOTE: large cache size will speed up tokenization, but the cache object will get really
            large for long running processes (esp. for texts of language that do not use space between
            word, e.g. Chinese); technically not a memory leak but appears as one.
            By default, we set the cache size equals to size of the official Qwen2 tokenizer.

    Example:
        >>> tokenizer = Qwen2Tokenizer(
                path="/path/to/vocab.json", merges_file="/path/to/merges.txt", special_tokens=QWEN2_SPECIAL_TOKENS)
        >>> tokenized_text = tokenizer.encode("Hello world!")
        >>> print(tokenized_text)
        [39, 385, 78, 675, 0, 2000]
    Nreplace)prompt_templateerrors	unk_token	bos_token	eos_token	pad_tokenbpe_cache_sizepathmerges_filespecial_tokensmax_seq_lenr   r   r   r   r   r   r   c                    t                                          |||||||||	|
|           | j        d         | _        | j        d         | _        d S )N)r   r   r   r   r   r   r   r   r   r   r   r   r   )super__init__r   tool_call_start_idtool_call_end_id)selfr   r   r   r   r   r   r   r   r   r   r   	__class__s               w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/qwen2_5/_tokenizer.pyr"   zQwen2_5Tokenizer.__init__W   so     	#)#+) 	 	
 	
 	
 #'"5m"D $ 3N C    T)add_eosmessagesr)   returnc          	         t          | j        t                    r
J d            | j        |                     |          n|}g }g }t          |          D ]\  }}|                     ||          }|j        D ]W}	|	d         dk    r2|                    |                     |	d         dd                     @t          d|	d                    |                    | 	                    ||                     |                    |           |                    |j
        gt          |          z             | j        rt          |          | j        k    r n|r5|                    | j                   |                    |d	                    | j        r9t          || j        |r| j        nd          }t          || j        |rd
nd          }||fS )a  
        Given a list of messages, return a list of tokens for the concatenated
        and formatted messages.

        Args:
            messages (List[Message]): The message list to tokenize.
            add_eos (bool): Wether to add the tokenizer's eos_id at the end of the
                sequence of messages. Default is True.

        Returns:
            Tuple[List[int], List[bool]]: The list of token ids and the list of masks.

        Raises:
            RuntimeError: If a message contains non-text content
        zUsing ChatMLTemplate with tokenize_messages will result in multiple <|im_*|> tokens wrapping each message.Please use a different template or set to None.NtypetextcontentFadd_bosr)   z"Unsupported message content type: T)
isinstancer   r   	enumerate_tokenize_headerr/   extendencodeRuntimeError_tokenize_footermaskedlenr   appendeos_idr
   )
r%   r*   r)   templated_messagestokenized_messagesmaskimessagetokensitems
             r'   tokenize_messagesz"Qwen2_5Tokenizer.tokenize_messagesw   s3   * d2NCC 	
 	
>	
 	
C #/   *** 	  #$677 	 	JAw**+=qAAF    <6))MM O$)$) $      'KT&\KK  
 MM$//0BAFFGGG%%f---KK(3v;;6777  C(:$;$;t?O$O$O  	"%%dk222KKR!!!  	O!)"D$4W6VdkkRV" " D$"2G4MDDNND!4''r(   c                 $   g }||         }|j         dk    r|dk    s||dz
           j         dk    rB|                     |d           |                    |                     ddd                     n|                    |                     ddd                     nr|                     ||j                    |j         d	k    rL|j        rE|                    | j                   |                    |                     d
dd                     |S )Nipythonr      userz<tool_response>
Fr0   z
<tool_response>
	assistant
)role_add_message_start_tokensr6   r7   rG   r<   r#   r%   r*   rA   rC   rB   s        r'   r5   z!Qwen2_5Tokenizer._tokenize_header   s!   1+<9$$Avv!a%-::..vv>>>KK 3UEKRR    KK 5ueKTT    **67<@@@|{**w*d5666dkk$ukMMNNNr(   c                 x   g }||         }|j         dk    r|t          |          dz
  k    s||dz            j         dk    rA|                    |                     ddd                     |                     |           n|                    |                     ddd                     n|j         dk    rL|j        rE|                    |                     ddd                     |                    | j                   |j         dk    s|t          |          dz
  k    r|                     |           |S )NrG   rH   z
</tool_response>Fr0   rJ   rK   )rL   r;   r6   r7   _add_message_end_tokensrG   r<   r$   rN   s        r'   r9   z!Qwen2_5Tokenizer._tokenize_footer   sH   1+<9$$CMMA%%%!a%)=)J)JKK 4eUKSS   ,,V4444KK 4eUKSS    |{**w*dkk$ukMMNNNd3444|{**a3x==13D.D.D,,V444r(   c                     |                     | j                   |                    |                     | ddd                     d S NrK   Fr0   )r<   im_start_idr6   r7   )r%   rC   rL   s      r'   rM   z*Qwen2_5Tokenizer._add_message_start_tokens   sG    d&'''dkkT+++uekLLMMMMMr(   c                     |                     | j                   |                    |                     ddd                     d S rR   )r<   	im_end_idr6   r7   )r%   rC   s     r'   rP   z(Qwen2_5Tokenizer._add_message_end_tokens   s@    dn%%%dkk$ukEEFFFFFr(   )__name__
__module____qualname____doc__QWEN2_5_SPECIAL_TOKENSr   r   r   strr   intr   r	   r"   r   r   boolr   rE   r5   r9   rM   rP   __classcell__)r&   s   @r'   r   r   *   s       * *` *@%)D 59#'#'#,DD D DD D S#X	D
 c]D ".1D D C=D C=D D C=D D D D D D DH 	J( J( J(w-J( 	J(
 
tCy$t*$	%J( J( J( J(X  *  *N N NG G G G G G Gr(   r   N)typingr   r   r   r   torchtune.datar   r   r	   r
   !torchtune.models.qwen2._tokenizerr   r   r   r   r   rZ   r    r(   r'   <module>rc      s   / . . . . . . . . . . . L L L L L L L L L L L L              F & V	
 6 f F  f f V V 6 F f  f!" f#$ 6% & )   0IG IG IG IG IG~ IG IG IG IG IGr(   