
     `iZ                         d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
  ej        e          Zd	d
ddZddiZ G d de          ZdgZdS )zTokenization classes for Qwen2.    )Optional   )
AddedToken)PreTrainedTokenizerFast)logging   )Qwen2Tokenizerz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filezqwen/qwen-tokenizeri   c                   p     e Zd ZdZeZddgZeZ	 	 	 	 	 	 	 d fd	Z	dde
dee
         d	ee
         fd
Z xZS )Qwen2TokenizerFastac  
    Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```python
    >>> from transformers import Qwen2TokenizerFast

    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```
    This is expected.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
            contains everything needed to load the tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead. Not applicable to this tokenizer.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
    	input_idsattention_maskN<|endoftext|>c                    t          |t                    rt          |dddd          n|}t          |t                    rt          |dddd          n|}t          |t                    rt          |dddd          n|}t          |t                    rt          |dddd          n|} t                      j        d|||||||d| d S )NFT)lstriprstripspecial
normalized)r
   r   r   	unk_token	bos_token	eos_token	pad_token )
isinstancestrr   super__init__)
selfr
   r   r   r   r   r   r   kwargs	__class__s
            /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/qwen2/tokenization_qwen2_fast.pyr   zQwen2TokenizerFast.__init__S   s'   $ )S))JyudW\]]]] 	 )S))JyudW\]]]] 	 )S))JyudW\]]]] 	 )S))JyudW\]]]] 	 	 		
!#)		
 		
 		
 		
 		
 		
 		
    save_directoryfilename_prefixreturnc                 b    | j         j                            ||          }t          |          S )N)name)
_tokenizermodelsavetuple)r    r%   r&   filess       r#   save_vocabularyz"Qwen2TokenizerFast.save_vocabulary   s+    %**>*PPU||r$   )NNNr   Nr   r   )N)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr	   slow_tokenizer_classr   r   r   r-   r/   __classcell__)r"   s   @r#   r   r   %   s        ' 'R *$&67) !!!.
 .
 .
 .
 .
 .
b c HSM ]bcf]g        r$   r   N)r3   typingr   tokenization_utilsr   tokenization_utils_fastr   utilsr   tokenization_qwen2r	   
get_loggerr0   loggerr4   MAX_MODEL_INPUT_SIZESr   __all__r   r$   r#   <module>rB      s    & %       , , , , , , > > > > > >       . . . . . . 
	H	%	% &   /6 a a a a a0 a a aH  
 r$   