
     `i                         U d Z ddlmZ ddlmZmZ ddlmZ  ej        e	          Z
dZdZdZdZd	Zd
ZdZedededededediZeeef         ed<   d e                                D             Zeeef         ed<    G d de          ZdgZdS )z Tokenization classes for CANINE.    )Optional   )
AddedTokenPreTrainedTokenizer)loggingi   i   i  i  i  i  z[CLS]z[SEP]z[BOS]z[MASK]z[PAD]z
[RESERVED]SPECIAL_CODEPOINTSc                     i | ]\  }}||	S  r
   ).0	codepointnames      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/canine/tokenization_canine.py
<dictcomp>r   7   s    -p-p-p/)TdI-p-p-p    SPECIAL_CODEPOINTS_BY_NAMEc            
           e Zd ZdZ ee           ee           ee           ee           ee           ee          ddf fd	Z	e
defd            Zd Zdedee         fd	Zd
edefdZdedefdZd Z	 ddee         deee                  dee         fdZ	 ddee         deee                  dedee         f fdZddedee         fdZ xZS )CanineTokenizera  
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
                The maximum sentence length the model accepts.
    Fi   c	                 2   t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}i | _        t                                          D ]\  }
}|
| j        |<   d | j                                        D             | _        t          | _        t          | j                  | _
         t                      j        d||||||||d|	 d S )NF)lstriprstripTc                     i | ]\  }}||	S r
   r
   )r   r   r   s      r   r   z,CanineTokenizer.__init__.<locals>.<dictcomp>c   s+     ;
 ;
 ;
 /iIt;
 ;
 ;
r   )	bos_token	eos_token	sep_token	cls_token	pad_token
mask_tokenadd_prefix_spacemodel_max_lengthr
   )
isinstancestrr   _special_codepointsr   items_special_codepoint_stringsUNICODE_VOCAB_SIZE_unicode_vocab_sizelen_num_special_tokenssuper__init__)selfr   r   r   r   r   r   r   r   kwargsr   r   	__class__s               r   r*   zCanineTokenizer.__init__H   s    JTT]_bIcIcrJyuEEEEir	IST]_bIcIcrJyuEEEEir	IST]_bIcIcrJyuEEEEir	IST]_bIcIcrJyuEEEEir	IST]_bIcIcrJyuEEEEir	 KUU_adJeJeuZ
4FFFFku
 46 17799 	7 	7OIt-6D$T**;
 ;
373K3Q3Q3S3S;
 ;
 ;
' $6 #&t'?#@#@  
	
!--
	
 
	
 
	
 
	
 
	
 
	
 
	
r   returnc                     | j         S N)r&   )r+   s    r   
vocab_sizezCanineTokenizer.vocab_sizev   s    ''r   c                 v    d t          | j                  D             }|                    | j                   |S )Nc                 .    i | ]}t          |          |S r
   )chr)r   is     r   r   z-CanineTokenizer.get_vocab.<locals>.<dictcomp>{   s     ;;;qQ;;;r   )ranger1   updateadded_tokens_encoder)r+   vocabs     r   	get_vocabzCanineTokenizer.get_vocabz   s9    ;;E$/$:$:;;;T.///r   textc                      t          |          S )z5Tokenize a string (i.e. perform character splitting).)list)r+   r;   s     r   	_tokenizezCanineTokenizer._tokenize   s    Dzzr   tokenc                 d    	 t          |          S # t          $ r t          d| d          w xY w)zaConverts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).zinvalid token: '')ord	TypeError
ValueError)r+   r?   s     r   _convert_token_to_idz$CanineTokenizer._convert_token_to_id   sH    	:u:: 	: 	: 	:8888999	:s    /indexc                     	 |t           v rt           |         S t          |          S # t          $ r t          d|           w xY w)z
        Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
        human-readable format.
        zinvalid id: )r   r4   rC   rD   )r+   rF   s     r   _convert_id_to_tokenz$CanineTokenizer._convert_id_to_token   s\    
	5***)%00u:: 	5 	5 	53E33444	5s   ' ' Ac                 ,    d                     |          S )N )join)r+   tokenss     r   convert_tokens_to_stringz(CanineTokenizer.convert_tokens_to_string   s    wwvr   Ntoken_ids_0token_ids_1c                 J    | j         g}| j        g}||z   |z   }||||z   z  }|S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CANINE sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )sep_token_idcls_token_id)r+   rN   rO   sepclsresults         r    build_inputs_with_special_tokensz0CanineTokenizer.build_inputs_with_special_tokens   sC    &  ! !{"S("kC''Fr   already_has_special_tokensc                     |r$t                                          ||d          S dgdgt          |          z  z   dgz   }||dgt          |          z  dgz   z  }|S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rN   rO   rW      r   )r)   get_special_tokens_maskr'   )r+   rN   rO   rW   rU   r-   s        r   rZ   z'CanineTokenizer.get_special_tokens_mask   s    $ & 	7722'[]a 3    c+.../1#5"sS---!44Fr   save_directoryfilename_prefixc                     dS )Nr
   r
   )r+   r[   r\   s      r   save_vocabularyzCanineTokenizer.save_vocabulary   s    rr   r0   )NF)__name__
__module____qualname____doc__r4   CLSSEPPADMASKr*   propertyintr1   r:   r!   r=   r>   rE   rH   rM   r   rV   boolrZ   r^   __classcell__)r-   s   @r   r   r   :   s         #c((#c((#c((#c((#c((3t99,
 ,
 ,
 ,
 ,
 ,
\ (C ( ( ( X(  
c d3i    :# :# : : : :
5# 
5# 
5 
5 
5 
5   JN 93;DI3F	c   8 sx 93;DI3Fko	c     : c HSM        r   r   N)rb   typingr   tokenization_utilsr   r   utilsr   
get_loggerr_   loggerr%   re   rc   rd   BOSrf   RESERVEDr   dictrh   r!   __annotations__r#   r   r   __all__r
   r   r   <module>ru      sB   ' & &       A A A A A A A A       
	H	%	%    (l& DcN     .q-pUgUmUmUoUo-p-p-p DcN p p pX X X X X) X X Xv 
r   