
    Pi                     r    d dl Z d dlmZ  G d d          Z G d de          Z G d de          Zd	 ZdS )
    N)Pathc                   ,    e Zd Zd Zd Zd Zd Zd ZdS )TokenizerInterfacec                     || _         d S N)
model_path)selfr   s     s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torchao/_models/llama/tokenizer.py__init__zTokenizerInterface.__init__   s    $    c                      t          d          Nz/This method should be overridden by subclasses.NotImplementedErrorr	   texts     r
   encodezTokenizerInterface.encode       !"STTTr   c                      t          d          r   r   r	   tokenss     r
   decodezTokenizerInterface.decode   r   r   c                      t          d          r   r   r	   s    r
   bos_idzTokenizerInterface.bos_id   r   r   c                      t          d          r   r   r   s    r
   eos_idzTokenizerInterface.eos_id   r   r   N)__name__
__module____qualname__r   r   r   r   r    r   r
   r   r      sm        % % %U U UU U UU U UU U U U Ur   r   c                   6     e Zd Z fdZd Zd Zd Zd Z xZS )SentencePieceWrapperc                    dd l }t                                          |           |                    t	          |                    | _        |                                 | _        |                                 | _	        d S )Nr   )
sentencepiecesuperr   SentencePieceProcessorstr	processorr   bos_token_idr   eos_token_id)r	   r   spm	__class__s      r
   r   zSentencePieceWrapper.__init__   sh    ####$$$33C
OODD KKMM KKMMr   c                 6    | j                             |          S r   )r)   EncodeAsIdsr   s     r
   r   zSentencePieceWrapper.encode&   s    ~))$///r   c                 6    | j                             |          S r   )r)   	DecodeIdsr   s     r
   r   zSentencePieceWrapper.decode)   s    ~''///r   c                 4    | j                                         S r   )r)   r   r   s    r
   r   zSentencePieceWrapper.bos_id,       ~$$&&&r   c                 4    | j                                         S r   )r)   r   r   s    r
   r   zSentencePieceWrapper.eos_id/   r3   r   )	r   r   r    r   r   r   r   r   __classcell__r-   s   @r
   r#   r#      st        * * * * *0 0 00 0 0' ' '' ' ' ' ' ' 'r   r#   c                   ^     e Zd ZU dZeeef         ed<   dZdZ	 fdZ
d Zd Zd Zd	 Z xZS )
TiktokenWrapperzM
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
    special_tokens   zs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+c                    dd l }dd l}t                                          |           t          j                            |          sJ t          |                      |j        	                    t          |                    }t          |          g dd t          d| j        dz
            D             z   }fdt          |          D             | _        |                    t!          |          j        | j        || j                  | _        | j        d         | _        | j        d         | _        |                                 | _        |                                 | _        d S )	Nr   )
<|begin_of_text|><|end_of_text|>z<|reserved_special_token_0|>z<|reserved_special_token_1|>z<|reserved_special_token_2|>z<|reserved_special_token_3|>z<|start_header_id|>z<|end_header_id|>z<|reserved_special_token_4|>z
<|eot_id|>c                     g | ]}d | d	S )z<|reserved_special_token_z|>r!   ).0is     r
   
<listcomp>z,TiktokenWrapper.__init__.<locals>.<listcomp>Q   s3     
 
 
 .---
 
 
r      c                 "    i | ]\  }}||z   S r!   r!   )r?   r@   tokennum_base_tokenss      r
   
<dictcomp>z,TiktokenWrapper.__init__.<locals>.<dictcomp>U   s1     
 
 
+31eE?Q&
 
 
r   )namepat_strmergeable_ranksr9   r<   r=   )tiktokentiktoken.loadr&   r   ospathisfiler(   loadload_tiktoken_bpelenrangenum_reserved_special_tokens	enumerater9   Encodingr   rG   rH   model_bos_id_eos_idr   r*   r   r+   )r	   r   rJ   rI   r9   rE   r-   s        @r
   r   zTiktokenWrapper.__init__>   sz   $$$w~~j))::3z??::)"-99#j//JJo..
 
 

 
1d>BCC
 
 


 
 
 
7@7P7P
 
 
 &&j!!&L+.	 ' 
 

 !/0CD /0AB KKMM KKMMr   c                 6    | j                             |          S r   )rV   r   r   s     r
   r   zTiktokenWrapper.encoded   s    z  &&&r   c                 6    | j                             |          S r   )rV   r   r   s     r
   r   zTiktokenWrapper.decodeg   s    z  (((r   c                     | j         S r   )rW   r   s    r
   r   zTiktokenWrapper.bos_idj   
    |r   c                     | j         S r   )rX   r   s    r
   r   zTiktokenWrapper.eos_idm   r\   r   )r   r   r    __doc__dictr(   int__annotations__rS   rH   r   r   r   r   r   r5   r6   s   @r
   r8   r8   3   s           cN""""% EG$* $* $* $* $*L' ' ') ) )        r   r8   c                 `    dt          |          v rt          |           S t          |           S )a?  
    Factory function to get the appropriate tokenizer based on the model name.

    Args:
    - tokenizer_model_path (str): The file path to the tokenizer model.
    - model_name (str): The name of the model, used to determine the tokenizer type.
    Returns:
    - TokenizerInterface: An instance of a tokenizer.
    zLlama-3)r(   r8   r#   )tokenizer_model_path
model_names     r
   get_tokenizerre   q   s3     C
OO##3444#$8999r   )rL   pathlibr   r   r#   r8   re   r!   r   r
   <module>rg      s    
			      U U U U U U U U"' ' ' ' '- ' ' ',; ; ; ; ;( ; ; ;|: : : : :r   