§
     `ƒi   ã                   ó  — U d Z ddlmZ ddlmZmZ ddlmZ  ej        e	¦  «        Z
dZdZdZdZd	Zd
ZdZedededededediZeeef         ed<   d„ e                     ¦   «         D ¦   «         Zeeef         ed<    G d„ de¦  «        ZdgZdS )z Tokenization classes for CANINE.é    )ÚOptionalé   )Ú
AddedTokenÚPreTrainedTokenizer)Úloggingi   i à  ià  ià  ià  ià  z[CLS]z[SEP]z[BOS]z[MASK]z[PAD]z
[RESERVED]ÚSPECIAL_CODEPOINTSc                 ó   — i | ]\  }}||“Œ	S © r
   )Ú.0Ú	codepointÚnames      ú‚/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/canine/tokenization_canine.pyú
<dictcomp>r   7   s   € Ð-pÐ-pÐ-pÁ/À)ÈT¨d°IÐ-pÐ-pÐ-pó    ÚSPECIAL_CODEPOINTS_BY_NAMEc            
       óº  ‡ — e Zd ZdZ ee¦  «         ee¦  «         ee¦  «         ee¦  «         ee¦  «         ee¦  «        ddfˆ fd„	Z	e
defd„¦   «         Zd„ Zdedee         fd	„Zd
edefd„Zdedefd„Zd„ Z	 ddee         deee                  dee         fd„Z	 ddee         deee                  dedee         fˆ fd„Zddedee         fd„Zˆ xZS )ÚCanineTokenizeraé  
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
                The maximum sentence length the model accepts.
    Fi   c	                 ó2  •— t          |t          ¦  «        rt          |dd¬¦  «        n|}t          |t          ¦  «        rt          |dd¬¦  «        n|}t          |t          ¦  «        rt          |dd¬¦  «        n|}t          |t          ¦  «        rt          |dd¬¦  «        n|}t          |t          ¦  «        rt          |dd¬¦  «        n|}t          |t          ¦  «        rt          |dd¬¦  «        n|}i | _        t                               ¦   «         D ]\  }
}|
| j        |<   Œd„ | j                             ¦   «         D ¦   «         | _        t          | _        t          | j        ¦  «        | _
         t          ¦   «         j        d||||||||dœ|	¤Ž d S )NF)ÚlstripÚrstripTc                 ó   — i | ]\  }}||“Œ	S r
   r
   )r   r   r   s      r   r   z,CanineTokenizer.__init__.<locals>.<dictcomp>c   s+   € ð ;
ð ;
ð ;
Ù /  iˆItð;
ð ;
ð ;
r   )Ú	bos_tokenÚ	eos_tokenÚ	sep_tokenÚ	cls_tokenÚ	pad_tokenÚ
mask_tokenÚadd_prefix_spaceÚmodel_max_lengthr
   )Ú
isinstanceÚstrr   Ú_special_codepointsr   ÚitemsÚ_special_codepoint_stringsÚUNICODE_VOCAB_SIZEÚ_unicode_vocab_sizeÚlenÚ_num_special_tokensÚsuperÚ__init__)Úselfr   r   r   r   r   r   r   r   Úkwargsr   r   Ú	__class__s               €r   r*   zCanineTokenizer.__init__H   sä  ø€ õ JTÐT]Õ_bÑIcÔIcÐr•J˜y°¸uÐEÑEÔEÐEÐirˆ	ÝISÐT]Õ_bÑIcÔIcÐr•J˜y°¸uÐEÑEÔEÐEÐirˆ	ÝISÐT]Õ_bÑIcÔIcÐr•J˜y°¸uÐEÑEÔEÐEÐirˆ	ÝISÐT]Õ_bÑIcÔIcÐr•J˜y°¸uÐEÑEÔEÐEÐirˆ	ÝISÐT]Õ_bÑIcÔIcÐr•J˜y°¸uÐEÑEÔEÐEÐirˆ	õ KUÐU_ÕadÑJeÔJeÐu•Z 
°4ÀÐFÑFÔFÐFÐkuˆ
ð 46ˆÔ Ý1×7Ò7Ñ9Ô9ð 	7ð 	7‰OˆItØ-6ˆDÔ$ TÑ*Ð*ð;
ð ;
Ø37Ô3K×3QÒ3QÑ3SÔ3Sð;
ñ ;
ô ;
ˆÔ'õ $6ˆÔ Ý#& tÔ'?Ñ#@Ô#@ˆÔ à‰ŒÔð 
	
ØØØØØØ!Ø-Ø-ð
	
ð 
	
ð ð
	
ð 
	
ð 
	
ð 
	
ð 
	
r   Úreturnc                 ó   — | j         S ©N)r&   )r+   s    r   Ú
vocab_sizezCanineTokenizer.vocab_sizev   s   € àÔ'Ð'r   c                 óv   — d„ t          | j        ¦  «        D ¦   «         }|                     | j        ¦  «         |S )Nc                 ó.   — i | ]}t          |¦  «        |“ŒS r
   )Úchr)r   Úis     r   r   z-CanineTokenizer.get_vocab.<locals>.<dictcomp>{   s    € Ð;Ð;Ð;˜q•Q‘”˜Ð;Ð;Ð;r   )Úranger1   ÚupdateÚadded_tokens_encoder)r+   Úvocabs     r   Ú	get_vocabzCanineTokenizer.get_vocabz   s9   € Ø;Ð;¥E¨$¬/Ñ$:Ô$:Ð;Ñ;Ô;ˆØŠTÔ.Ñ/Ô/Ð/Øˆr   Útextc                 ó    — t          |¦  «        S )z5Tokenize a string (i.e. perform character splitting).)Úlist)r+   r;   s     r   Ú	_tokenizezCanineTokenizer._tokenize   s   € åD‰zŒzÐr   Útokenc                 ód   — 	 t          |¦  «        S # t          $ r t          d|› d¦  «        ‚w xY w)zaConverts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).zinvalid token: 'ú')ÚordÚ	TypeErrorÚ
ValueError)r+   r?   s     r   Ú_convert_token_to_idz$CanineTokenizer._convert_token_to_idƒ   sH   € ð	:Ýu‘:”:ÐøÝð 	:ð 	:ð 	:ÝÐ8°Ð8Ð8Ð8Ñ9Ô9Ð9ð	:øøøs   ‚ ‘/Úindexc                 óŽ   — 	 |t           v rt           |         S t          |¦  «        S # t          $ r t          d|› ¦  «        ‚w xY w)z˜
        Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
        human-readable format.
        zinvalid id: )r   r4   rC   rD   )r+   rF   s     r   Ú_convert_id_to_tokenz$CanineTokenizer._convert_id_to_tokenŠ   s\   € ð
	5ØÕ*Ð*Ð*Ý)¨%Ô0Ð0Ýu‘:”:ÐøÝð 	5ð 	5ð 	5ÝÐ3¨EÐ3Ð3Ñ4Ô4Ð4ð	5øøøs   ‚' ˜' §Ac                 ó,   — d                      |¦  «        S )NÚ )Újoin)r+   Útokenss     r   Úconvert_tokens_to_stringz(CanineTokenizer.convert_tokens_to_string–   s   € ØwŠwv‰ŒÐr   NÚtoken_ids_0Útoken_ids_1c                 óJ   — | j         g}| j        g}||z   |z   }||||z   z  }|S )a˜  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CANINE sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )Úsep_token_idÚcls_token_id)r+   rN   rO   ÚsepÚclsÚresults         r   Ú build_inputs_with_special_tokensz0CanineTokenizer.build_inputs_with_special_tokens™   sC   € ð& Ô Ð!ˆØÔ Ð!ˆà{Ñ" SÑ(ˆØÐ"Øk CÑ'Ñ'ˆFØˆr   Úalready_has_special_tokensc                 óÂ   •— |r$t          ¦   «                              ||d¬¦  «        S dgdgt          |¦  «        z  z   dgz   }||dgt          |¦  «        z  dgz   z  }|S )aÄ  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rN   rO   rW   é   r   )r)   Úget_special_tokens_maskr'   )r+   rN   rO   rW   rU   r-   s        €r   rZ   z'CanineTokenizer.get_special_tokens_mask´   s…   ø€ ð$ &ð 	Ý‘7”7×2Ò2Ø'°[Ð]að 3ñ ô ð ð ˜˜c +Ñ.Ô.Ñ.Ñ/°1°#Ñ5ˆØÐ"Ø˜sS Ñ-Ô-Ñ-°!°Ñ4Ñ4ˆFØˆr   Úsave_directoryÚfilename_prefixc                 ó   — dS )Nr
   r
   )r+   r[   r\   s      r   Úsave_vocabularyzCanineTokenizer.save_vocabularyÑ   s   € Øˆrr   r0   )NF)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r4   ÚCLSÚSEPÚPADÚMASKr*   ÚpropertyÚintr1   r:   r!   r=   r>   rE   rH   rM   r   rV   ÚboolrZ   r^   Ú__classcell__)r-   s   @r   r   r   :   s  ø€ € € € € ðð ð #c‘(”(Ø#c‘(”(Ø#c‘(”(Ø#c‘(”(Ø#c‘(”(Ø3t‘9”9ØØð,
ð ,
ð ,
ð ,
ð ,
ð ,
ð\ ð(˜Cð (ð (ð (ñ „Xð(ðð ð ð
˜cð  d¨3¤ið ð ð ð ð:¨#ð :°#ð :ð :ð :ð :ð
5¨#ð 
5°#ð 
5ð 
5ð 
5ð 
5ðð ð ð JNðð Ø œ9ðØ3;¸DÀ¼IÔ3Fðà	ˆcŒðð ð ð ð8 sxðð Ø œ9ðØ3;¸DÀ¼IÔ3FðØkoðà	ˆcŒðð ð ð ð ð ð:ð ¨cð ÀHÈSÄMð ð ð ð ð ð ð ð r   r   N)rb   Útypingr   Útokenization_utilsr   r   Úutilsr   Ú
get_loggerr_   Úloggerr%   re   rc   rd   ÚBOSrf   ÚRESERVEDr   Údictrh   r!   Ú__annotations__r#   r   r   Ú__all__r
   r   r   ú<module>ru      sB  ðð 'Ð &Ð &à Ð Ð Ð Ð Ð à AÐ AÐ AÐ AÐ AÐ AÐ AÐ AØ Ð Ð Ð Ð Ð ð 
ˆÔ	˜HÑ	%Ô	%€ð Ð ð €Ø€Ø€Ø€Ø€Ø€ð ˆØˆØˆØˆ(ØˆØˆlð&Ð D˜˜c˜”Nð ð ñ ð  .qÐ-pÐUg×UmÒUmÑUoÔUoÐ-pÑ-pÔ-pÐ ˜D  c œNÐ pÐ pÑ pðXð Xð Xð Xð XÐ)ñ Xô Xð Xðv Ð
€€€r   