
     `i.                         d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
  e	            rdd	lmZ ndZ e
j        e          Zd
ddZ G d de          ZdgZdS )zTokenization classes for XGLM.    N)copyfile)Optional   )PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )XGLMTokenizerzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_filec                       e Zd ZdZeZddgZeZ	 	 	 	 	 	 	 	 d fd		Z		 dd
e
e         dee
e                  de
e         fdZ	 dd
e
e         dee
e                  de
e         fdZddedee         dee         fdZ xZS )XGLMTokenizerFasta{	  
    Construct a "fast" XGLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from [`RobertaTokenizer`]
    and [`XLNetTokenizer`]. Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`list[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.
    	input_idsattention_maskN<s></s><unk><pad>c	                   	 d| _         d t          | j                   D             }
	                    dg           pg 	d<   	dxx         	fd|
D             z  cc<    t                      j        |f|||||||d	 || _        d S )N   c                     g | ]}d | d	S )z<madeupword> ).0is     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/xglm/tokenization_xglm_fast.py
<listcomp>z.XGLMTokenizerFast.__init__.<locals>.<listcomp>f   s$    QQQq*a***QQQ    additional_special_tokensc                 (    g | ]}|d          v|S )r   r   )r   wordkwargss     r   r   z.XGLMTokenizerFast.__init__.<locals>.<listcomp>i   s0     0
 0
 0
T@[9\-\-\D-\-\-\r   )r   	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token)num_madeup_wordsrangegetsuper__init__r   )selfr   r   r#   r$   r%   r&   r'   r(   r"   madeup_words	__class__s            ` r   r-   zXGLMTokenizerFast.__init__X   s     !"QQE$:O4P4PQQQ.4jj9TVX.Y.Y._]_*+*+++ 0
 0
 0
 0
)0
 0
 0
 	
+++ 	
	
)
	
 
	
 
	
 
	
 
	
 %r   token_ids_0token_ids_1returnc                 H    || j         g|z   S | j         g}||z   |z   |z   |z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An XLM-RoBERTa sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )sep_token_idr.   r1   r2   seps       r    build_inputs_with_special_tokensz2XGLMTokenizerFast.build_inputs_with_special_tokens{   s@    ( %&44 ![ 3&,{::r   c                     | j         g}|t          ||z             dgz  S t          ||z   |z   |z   |z             dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.

        Nr   )r5   lenr6   s       r   $create_token_type_ids_from_sequencesz6XGLMTokenizerFast.create_token_type_ids_from_sequences   sX    $  !s[())QC//3$s*S0;>??1#EEr   save_directoryfilename_prefixc                    | j         st          d          t          j                            |          s t
                              d| d           d S t          j                            ||r|dz   ndt          d         z             }t          j        	                    | j
                  t          j        	                    |          k    rt          | j
        |           |fS )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory.- r   )can_save_slow_tokenizer
ValueErrorospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )r.   r<   r=   out_vocab_files       r   save_vocabularyz!XGLMTokenizerFast.save_vocabulary   s    + 	  
 w}}^,, 	LLU^UUUVVVFoM_s222QbcoQpp
 
 7??4?++rw~/N/NNNT_n555  r   )NNr   r   r   r   r   r   )N)__name__
__module____qualname____doc__rI   vocab_files_namesmodel_input_namesr
   slow_tokenizer_classr-   listintr   r8   r;   strtuplerL   __classcell__)r0   s   @r   r   r   $   sP       - -^ *$&67( !% !% !% !% !% !%H JN; ;9;3;DI3F;	c; ; ; ;4 JNF F9F3;DI3FF	cF F F F0! !c !HSM !]bcf]g ! ! ! ! ! ! ! !r   r   )rP   rC   shutilr   typingr   tokenization_utils_fastr   utilsr   r   tokenization_xglmr
   
get_loggerrM   rF   rI   r   __all__r   r   r   <module>r`      s    % $ 				             > > > > > > 8 8 8 8 8 8 8 8  0000000M 
	H	%	%#<P`aa Y! Y! Y! Y! Y!/ Y! Y! Y!x 
r   