
     `i	                         d Z ddlmZ ddlmZ ddlmZ ddlmZ  ej	        e
          Zdd	d
dZ G d de          ZdgZdS )z)Fast Tokenization classes for OpenAI GPT.    )Optional   )PreTrainedTokenizerFast)logging   )OpenAIGPTTokenizerz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                   x     e Zd ZdZeZddgZeZd fd	Z	e
d             Zdded	ee         d
ee         fdZ xZS )OpenAIGPTTokenizerFasta  
    Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
    the following peculiarities:

    - lower case all inputs
    - uses BERT's BasicTokenizer for pre-BPE tokenization

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    	input_idsattention_maskN<unk>c                 D     t                      j        ||f||d| d S )N)r   	unk_token)super__init__)selfr	   r
   r   r   kwargs	__class__s         /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/openai/tokenization_openai_fast.pyr   zOpenAIGPTTokenizerFast.__init__6   s3    [o[doohnooooo    c                     dS )NT )r   s    r   do_lower_casez$OpenAIGPTTokenizerFast.do_lower_case9   s    tr   save_directoryfilename_prefixreturnc                 b    | j         j                            ||          }t          |          S )N)name)
_tokenizermodelsavetuple)r   r   r   filess       r   save_vocabularyz&OpenAIGPTTokenizerFast.save_vocabulary=   s+    %**>*PPU||r   )NNNr   )N)__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr   slow_tokenizer_classr   propertyr   strr   r%   r'   __classcell__)r   s   @r   r   r      s         ( *$&67-p p p p p p   X c HSM ]bcf]g        r   r   N)r+   typingr   tokenization_utils_fastr   utilsr   tokenization_openair   
get_loggerr(   loggerr,   r   __all__r   r   r   <module>r:      s    0 /       > > > > > >       3 3 3 3 3 3 
	H	%	%#/`pqq " " " " "4 " " "J $
$r   