
    Pi@                         d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlZd dlmZmZmZmZ d dlmZ dZddd	d
ZdZdZdZdZ e            d             Zd Z G d de          ZdS )    N)	lru_cache)AnyDictListMappingOptionalTuple)ChatMLTemplateMessagePromptTemplatetruncate)ModelTokenizerzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+i[P i\P i]P )<|endoftext|><|im_start|>
<|im_end|>r   r   r   i R c            	      \   t          t          t          d          t          d          dz                       t          t          t          d          t          d          dz                       z   t          t          t          d          t          d          dz                       z   } | dd         }d	}t          d
          D ]8}|| vr2|                     |           |                    d
|z              |dz  }9d |D             }t	          t          | |                    S )a7  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~      ¡   ¬   ®   ÿNr      c                 ,    g | ]}t          |          S  )chr).0ns     u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchtune/models/qwen2/_tokenizer.py
<listcomp>z$bytes_to_unicode.<locals>.<listcomp>9   s    			Q#a&&			    )listrangeordappenddictzip)bscsr   bs       r    bytes_to_unicoder,   "   s    	U3s88SXX\**++
uSYYD		A..
/
/	0
uSYYD		A..
/
/	0 
 
AAAB	A4[[  B;;IIaLLLIIdQhFA		"			BBr"   c                 ~    t                      }| d         }| dd         D ]}|                    ||f           |}|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairs	prev_charchars       r    	get_pairsr4   =   sP     EEEQIQRR  		9d#$$$		Lr"   c                      e Zd ZdZedfddddeeeddedede	ee
f         dee
         d	ee         d
edee         dee         dedee         de
fdZd Zd Zd Z	 d(dedededee
         fdZde
defdZdee         defdZ	 d)dee
         d edefd!Zdd"d#ee         dedeee
         ee         f         fd$Z	 d)d%eeef         d&edeeef         fd'ZdS )*Qwen2Tokenizera  This class construct a Qwen2 tokenizer, based on GPT-2 byte-level BPE tokenization.

    See <https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/qwen2/tokenization_qwen2.py>.

    Args:
        path (str): Path to vocab.json file.
        merges_file (str): Path to merges.txt file.
            merges.txt contains all BPE merge operations, and this file is required to split a single word into
            byte-level BPE tokens.
        special_tokens (Dict[str, int]): Special tokens to add to the tokenizer. Default is QWEN2_SPECIAL_TOKENS.
        max_seq_len (Optional[int]): A max sequence length to truncate tokens to.
            Default: None
        prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used
            to add structured text around the actual messages. The structured text is used in three scenarios:

            - Task-specific templates to gear models for a particular task that it will expect after training
            - Model-specific templates that are required whenever the model is prompted, such as the [INST]
              tags in Llama2 and in Mistral
            - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`

            The extra text will still get tokenized as normal text, not as special tokens.
            Default: None
        errors (str): Paradigm to follow when decoding bytes to UTF-8. Defaults to "replace".
            See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (Optional[str]): The unknown token. A token that is not in the vocabulary cannot be converted
            to an ID and is set to be this token instead. Defaults to ``<|endoftext|>``.
        bos_token (Optional[str]): The beginning of sequence token. Defaults to None.
        eos_token (str): The end of sequence token. Defaults to ``<|endoftext|>``.
        pad_token (Optional[str]): The token used for padding. Defaults to ``<|endoftext|>``.
        bpe_cache_size (int): BPE token cache size in Qwen2Tokenizer.
            NOTE: large cache size will speed up tokenization, but the cache object will get really
            large for long running processes (esp. for texts of language that do not use space between
            word, e.g. Chinese); technically not a memory leak but appears as one.
            By default, we set the cache size equals to size of the official Qwen2 tokenizer.

    Example:
        >>> tokenizer = Qwen2Tokenizer(
                path="/path/to/vocab.json", merges_file="/path/to/merges.txt", special_tokens=QWEN2_SPECIAL_TOKENS)
        >>> tokenized_text = tokenizer.encode("Hello world!")
        >>> print(tokenized_text)
        [39, 385, 78, 675, 0, 2000]
    Nreplace)prompt_templateerrors	unk_token	bos_token	eos_token	pad_tokenbpe_cache_sizepathmerges_filespecial_tokensmax_seq_lenr8   r9   r:   r;   r<   r=   r>   c          
         t          |d          5 }t          j        |          | _        d d d            n# 1 swxY w Y   d | j                                        D             | _        || _        t                      | _        d | j                                        D             | _	        g }t          |d          5 }t          |          D ]k\  }}|                                }|dk    r|                    d          s|s7|                    t          |                                                     l	 d d d            n# 1 swxY w Y   t!          t#          |t%          t'          |                                        | _         t+          |          | j                  | _        t1          j        t4                    | _        || _        d | j                                        D             | _        |d n| j        |         | _        |d n| j        |         | _        |	d n| j        |	         | _         |
d n| j        |
         | _!        | j        tD                   | _#        | j        tH                   | _%        | j         | j!        g| _&        t1          j        d	| j        '                                
          | _(        || _)        || _*        d S )Nutf-8)encodingc                     i | ]\  }}||	S r   r   r   kvs      r    
<dictcomp>z+Qwen2Tokenizer.__init__.<locals>.<dictcomp>   s    >>>A1>>>r"   c                     i | ]\  }}||	S r   r   rG   s      r    rJ   z+Qwen2Tokenizer.__init__.<locals>.<dictcomp>   s    HHHdaQHHHr"   r   z	#version:)maxsizec                     i | ]\  }}||	S r   r   rG   s      r    rJ   z+Qwen2Tokenizer.__init__.<locals>.<dictcomp>   s    (V(V(V$!QA(V(V(Vr"   z(\L<options>))options)+openjsonloadencoderitemsdecoderr9   r,   byte_encoderbyte_decoder	enumeratestrip
startswithr&   tuplesplitr'   r(   r$   len	bpe_ranksr   _bpe_without_cache_bperecompilePRETOKENIZE_REGEXpatrA   _special_tokens_reversedunk_idbos_ideos_idpad_idIM_STARTim_start_idIM_END	im_end_idstop_tokenskeys_pattern_split_special_tokensrB   r8   )selfr?   r@   rA   rB   r8   r9   r:   r;   r<   r=   r>   vocab_handle
bpe_mergesmerges_handleilines                    r    __init__zQwen2Tokenizer.__init__w   s    $))) 	3\9\22DL	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 ?>););)=)=>>>,..HHd.?.E.E.G.GHHH
+000 	7M$]33 7 74zz||FFt{;;FDF!!%

"5"56666	7	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 	7 c*eC
OO.D.DEEFF5In555d6MNN	:/00,(V(V$:M:S:S:U:U(V(V(V%'/ddT5H5S'/ddT5H5S'/ddT5H5S'/ddT5H5S.x8,V4 K5 .0Zd&9&>&>&@&@.
 .
 .
* '.s   8<< A<E		EEc                 &    t          |          }t          |          }|s|S 	 t          | fd          }| j        vrn8|\  }}g }d}|t	          |          k     r	 |                    ||          }	|                    |||	                    |	}n-# t          $ r  |                    ||d                     Y nw xY w||         |k    rC|t	          |          dz
  k     r-||dz            |k    r|                    ||z              |dz  }n |                    ||                    |dz  }|t	          |          k     t          |          }|}t	          |          dk    rnt          |          }Wd	                    |          }|S )NTc                 T    j                             | t          d                    S )Ninf)r]   getfloat)pairrp   s    r    <lambda>z3Qwen2Tokenizer._bpe_without_cache.<locals>.<lambda>   s     1C1CD%PU,,1W1W r"   )keyr   r       )
rZ   r4   minr]   r\   indexextend
ValueErrorr&   join)
rp   tokenr0   r1   bigramfirstsecondnew_wordrt   js
   `         r    r^   z!Qwen2Tokenizer._bpe_without_cache   s   U||$ 	L	($W$W$W$WXXXFT^++"ME6HAc$ii--

5!,,A
 OOD1I...AA "   OODH---E 7e##CIIM(9(9d1q5kV>S>SOOEFN333FAAOODG,,,FA c$ii--  XHD4yyA~~!$9	(: xx~~s   !B 'C Cc                 8    g }t          j         j        |          D ]{}d                     fd|                    d          D                       }|                    d                      |                              d          D                        ||S )zTokenize a string. c              3   2   K   | ]}j         |         V  d S N)rU   )r   r+   rp   s     r    	<genexpr>z+Qwen2Tokenizer._tokenize.<locals>.<genexpr>   s<        )*!!$     r"   rD   c              3      K   | ]}|V  d S r   r   )r   	bpe_tokens     r    r   z+Qwen2Tokenizer._tokenize.<locals>.<genexpr>   s"      UUIiUUUUUUr"   r   )r`   findallrc   r   encoder   r_   r[   )rp   text
bpe_tokensr   s   `   r    	_tokenizezQwen2Tokenizer._tokenize   s    
Z$// 	V 	VEGG    .3ll7.C.C    E UU59I9I9O9OPS9T9TUUUUUUUr"   c                 B    | j                             || j                  S )z0Converts a token (str) in an id using the vocab.)rR   rz   re   )rp   r   s     r    _convert_token_to_idz#Qwen2Tokenizer._convert_token_to_id   s    |t{333r"   Tr   add_bosadd_eosreturnc                 "   t          j        d|          }| j                            |          }g }|D ]L}|s|| j        v r|                    |           $|                    |                     |                     Mg }|r!| j        |                    | j                   |D ]C}|| j        v r| j        |         }n| 	                    |          }|                    |           D|r!| j
        |                    | j
                   |S )a  
        Encode a string into a list of token ids.

        Args:
            text (str): The string to encode.
            add_bos (bool): (Optional) Whether to add the beginning of sequence token.
            add_eos (bool): (Optional) Whether to add the end of sequence token.

        Returns:
            List[int]: The list of token ids.

        Note:
            This method follows
            <https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/tokenization_utils.py#L541> and
            <https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/qwen2/tokenization_qwen2.py#L262>.
        NFC)unicodedata	normalizero   r[   rA   r&   r   r   rf   r   rg   )	rp   r   r   r   tokenstokenized_textr   	token_idstoken_ids	            r    r   zQwen2Tokenizer.encode   s@   ( $UD11399$?? 	= 	=E +++%%e,,,,%%dnnU&;&;<<<< 	 	*t{.T[)))# 	' 	'E+++.u544U;;X&&&& 	*t{.T[)))r"   r   c                 t    | j                             |d          }|| j                            |          S |S )z=Converts an index (integer) in a token (str) using the vocab.N)rd   rz   rT   )rp   r   r   s      r    _convert_id_to_tokenz#Qwen2Tokenizer._convert_id_to_token  s:    -11%>>=<##E***r"   r   c                      d                     |          }t           fd|D                                           d j                  }|S )z:Converts a sequence of tokens (string) in a single string.r   c                 *    g | ]}j         |         S r   )rV   )r   crp   s     r    r!   z<Qwen2Tokenizer._convert_tokens_to_string.<locals>.<listcomp>  s!    ===1$+A.===r"   rD   )r9   )r   	bytearraydecoder9   )rp   r   r   s   `  r    _convert_tokens_to_stringz(Qwen2Tokenizer._convert_tokens_to_string  sX    wwv=======>>EEDK F 
 
 r"   Fr   skip_special_tokensc                    g }g }|D ]}}|                      |          }|| j        v rH|r.|                     |          }|r|                    |           g }|s|                    |           h|                    |           ~|r(|                    |                     |                     d                    |          }|S )a)  
        Decode a list of token ids into a string.

        Args:
            token_ids (List[int]): The list of token ids.
            skip_special_tokens (bool): Whether the special tokens should be removed from the decoded string.

        Returns:
            str: The decoded string.
        r   )r   rd   r   r&   r   )	rp   r   r   	sub_textscurrent_sub_textr   r   stringr   s	            r    r   zQwen2Tokenizer.decode!  s     	! 	/ 	/H--h77E4888# *!;;<LMMF 1!((000')$* ,$$U+++ ''.... 	OT;;<LMMNNNwwy!!r"   )r   messagesc          	         t          | j        t                    r
J d            | j        |                     |          n|}g }g }t          |          D ]\  }}g }|j        dk    rM|                    | j                   |                    |                     |j         ddd                     |j	        D ]W}	|	d         dk    r2|                    |                     |	d	         dd                     @t          d
|	d                    |j        dk    rf|j        dk    s|t          |          dz
  k    rE|                    | j                   |                    |                     ddd                     |                    |           |                    |j        gt          |          z             | j        rt          |          | j        k    r n|r5|                    | j                   |                    |d                    | j        r9t!          || j        |r| j        nd          }t!          || j        |rdnd          }||fS )a  
        Given a list of messages, return a list of tokens for the concatenated
        and formatted messages.

        Args:
            messages (List[Message]): The message list to tokenize.
            add_eos (bool): Wether to add the tokenizer's eos_id at the end of the
                sequence of messages. Default is True.

        Returns:
            Tuple[List[int], List[bool]]: The list of token ids and the list of masks.

        Raises:
            RuntimeError: If a message contains non-text content
        zUsing ChatMLTemplate with tokenize_messages will result in multiple <|im_*|> tokens wrapping each message.Please use a different template or set to None.Nipython
F)r   r   typer   contentz"Unsupported message content type: 	assistantr   T)
isinstancer8   r
   rW   roler&   rj   r   r   r   RuntimeErrorr\   rl   maskedrB   rg   r   )
rp   r   r   templated_messagestokenized_messagesmaskr   messager   items
             r    tokenize_messagesz Qwen2Tokenizer.tokenize_messagesD  s   * d2NCC 	
 	
>	
 	
C #/   *** 	  '(:;; %	 %	NE7F |y((d.///KK7< 3 3 3UEKRR  
    <6))MM O$)$) $      'KT&\KK  
 |y((++uH8I/I/Idn---dkk$ukMMNNN%%f---KK(3v;;6777  C(:$;$;t?O$O$O  	"%%dk222KKR!!!  	O!)"D$4W6VdkkRV" " D$"2G4MDDNND!4''r"   sample	inferencec                 t    |                     d          }|                     |          \  }}||d<   ||d<   |S )a%  
        Apply ``tokenize_messages`` to the "messages" field in the sample.

        Args:
            sample (Mapping[str, Any]): A sample with a "messages" field containing
                a List[Message] to tokenize
            inference (bool): Whether the template is being used for inference or not.

        Returns:
            Mapping[str, Any]: The sample with added "tokens" and "mask" fields
                and the "messages" field removed.
            inference (bool): Whether the template is being used for inference or not.
        r   r   r   )popr   )rp   r   r   r   r   r   s         r    __call__zQwen2Tokenizer.__call__  sD      ::j))--h77!xvr"   )TT)F)__name__
__module____qualname____doc__QWEN2_SPECIAL_TOKENSrk   	ENDOFTEXT&DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZEstrr   intr   r   rv   r^   r   r   boolr   r   r   r   r   r   r	   r   r   r   r   r   r"   r    r6   r6   K   s       ) )^ *>%)4/ 59#'#'#,D4/ 4/ 4/4/ 4/ S#X	4/
 c]4/ ".14/ 4/ C=4/ C=4/ 4/ C=4/ 4/ 4/ 4/ 4/l% % %N  4 4 4
 @D. .."&.8<.	c. . . .`# #    S	 c     %*! !9! "! 
	! ! ! !N 	T( T( T(w-T( 	T(
 
tCy$t*$	%T( T( T( T(n <A c3h'48	c	     r"   r6   )rP   r   	functoolsr   typingr   r   r   r   r   r	   regexr`   torchtune.datar
   r   r   r   'torchtune.modules.transforms.tokenizersr   rb   r   r   ri   rk   r   r,   r4   r6   r   r"   r    <module>r      sH              < < < < < < < < < < < < < < < <     L L L L L L L L L L L L B B B B B BY     		)/ &   4  c c c c c^ c c c c cr"   