
     `i                         d dl Z d dlmZ d dlmZ d dlmZ ddlmZ ddl	m
Z
mZ  e
            rdd	lmZ ndZ ej        e          Zd
ddZ G d de          ZdgZdS )    N)copyfile)Optional)
processors   )PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )GemmaTokenizerztokenizer.modelztokenizer.json)
vocab_filetokenizer_filec                        e Zd ZdZeZeZdZddgZ		 	 	 	 	 	 	 	 	 d fd	Z
d Zed             Zed             Zej        d             Zej        d             Zddedee         dee         fdZddZ xZS )GemmaTokenizerFastu
  
    Construct a Gemma tokenizer fast. Based on byte-level Byte-Pair-Encoding.

    This uses notably ByteFallback and no prefix space. Normalization is applied to replace  `" "` with `"▁"`

    ```python
    >>> from transformers import GemmaTokenizerFast

    >>> tokenizer = GemmaTokenizerFast.from_pretrained("hf-internal-testing/dummy-gemma")
    >>> tokenizer.encode("Hello this is a test")
    [2, 4521, 736, 603, 476, 2121]
    ```

    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
    values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
    [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation.


    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        tokenizer_file (`str`, *optional*):
            [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
            contains everything needed to load the tokenizer.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
            extra spaces.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The padding token
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add an `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
    left	input_idsattention_maskNF<unk><bos><eos><pad>Tc
                      t                      j        d|||||||||	d	|
 || _        |	| _        |                                  || _        d S )N)	r   r   clean_up_tokenization_spaces	unk_token	bos_token	eos_token	pad_tokenadd_bos_tokenadd_eos_token )super__init___add_bos_token_add_eos_tokenupdate_post_processorr   )selfr   r   r   r   r   r   r   r   r   kwargs	__class__s              /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/gemma/tokenization_gemma_fast.pyr!   zGemmaTokenizerFast.__init__W   s~     	 	
!))E''	
 	
 	
 	
 	
 ,+""$$$$    c                    | j         }| j        }|| j        rt          d          | j        }| j        }|| j        rt          d          | j        r|dz   nd d| j        rd|z   dz   nd }| | j        rd|z   d	z   nd d
| j        rd|z   d	z   nd }g }| j        r|                    ||f           | j        r|                    ||f           t          j	        |||          | j
        _        dS )ze
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        Nz)add_bos_token = True but bos_token = Nonez)add_eos_token = True but eos_token = Nonez:0  z$A:0 z:0z:1z $B:1)singlepairspecial_tokens)r   bos_token_idr   
ValueErrorr   eos_token_idr   appendr   TemplateProcessing
_tokenizerpost_processor)r%   bosr0   eosr2   r-   r.   r/   s           r(   r$   z(GemmaTokenizerFast.update_post_processorv   sk    n(;4-;HIIIn(;4-;HIII%)%7?S5[[Rww[_[mEucCiRVFVFVsuww  D0BJ39t++  D  Dgkgy  RBRUX[R[^bRbRb  @B  D  D 	7!!3"5666 	7!!3"5666)3)F^*
 *
 *
&&&r)   c                     | j         S N)r#   r%   s    r(   r   z GemmaTokenizerFast.add_eos_token       ""r)   c                     | j         S r:   )r"   r;   s    r(   r   z GemmaTokenizerFast.add_bos_token   r<   r)   c                 <    || _         |                                  d S r:   )r#   r$   r%   values     r(   r   z GemmaTokenizerFast.add_eos_token   "    #""$$$$$r)   c                 <    || _         |                                  d S r:   )r"   r$   r?   s     r(   r   z GemmaTokenizerFast.add_bos_token   rA   r)   save_directoryfilename_prefixreturnc                    | j         st          d          t          j                            |          s t
                              d| d           d S t          j                            ||r|dz   ndt          d         z             }t          j        	                    | j
                  t          j        	                    |          k    rt          | j
        |           |fS )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory-r+   r   )can_save_slow_tokenizerr1   ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )r%   rC   rD   out_vocab_files       r(   save_vocabularyz"GemmaTokenizerFast.save_vocabulary   s    + 	  
 w}}^,, 	LLT^TTTUUUFoM_s222QbcoQpp
 
 7??4?++rw~/N/NNNT_n555  r)   c                 t    | j         r| j        gng }| j        r| j        gng }||z   |z   }|||z   |z   |z   }|S r:   )r   r0   r   r2   )r%   token_ids_0token_ids_1r0   r2   outputs         r(    build_inputs_with_special_tokensz3GemmaTokenizerFast.build_inputs_with_special_tokens   s`    .2.@H)**b.2.@H)**b+l:"l*[8<GFr)   )	NNFr   r   r   r   TFr:   )__name__
__module____qualname____doc__rO   vocab_files_namesr   slow_tokenizer_classpadding_sidemodel_input_namesr!   r$   propertyr   r   setterstrr   tuplerR   rW   __classcell__)r'   s   @r(   r   r   "   sT       - -^ *)L$&67 %*% % % % % %>
 
 
4 # # X# # # X# % % % % % %
! !c !HSM !]bcf]g ! ! ! !(	 	 	 	 	 	 	 	r)   r   )rI   shutilr   typingr   
tokenizersr   tokenization_utils_fastr   utilsr   r	   tokenization_gemmar   
get_loggerrX   rL   rO   r   __all__r   r)   r(   <module>rm      s    
			             ! ! ! ! ! ! > > > > > > 8 8 8 8 8 8 8 8  2222222N		H	%	%#4HXYY ^ ^ ^ ^ ^0 ^ ^ ^B  
 r)   