
     `i                        d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZ d
dlmZ d
dlmZ d
dlm Z  d
dl!m"Z" d
dl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d
dl.m/Z/m0Z0m1Z1  e1j2        e3          Z4dZ5dZ6dZ7dZ8dZ9e$dz  Z$eeeedZ:e5e8dZ; e0e$           G d de)                      Z<dS )z
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
    N)defaultdict)Iterable)AnyOptionalUnion)Encoding)	Tokenizer)Decoder)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainer   )convert_slow_tokenizer)convert_gguf_tokenizer)load_gguf_checkpoint)PreTrainedTokenizer)
INIT_TOKENIZER_DOCSTRING
AddedTokenBatchEncodingPreTokenizedInputPreTokenizedInputPairPreTrainedTokenizerBaseSpecialTokensMixin	TextInputTextInputPairTruncationStrategy)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
)BPEUnigram	WordLevel	WordPiece)tokenizer_file
vocab_filec            )           e Zd ZU dZeZdZeee	                  e
d<    fdZedefd            Zedefd            Zedefd            Zdeeef         fd	Zedeeef         fd
            Zedeeef         fd            Zedeeef         fd            Zdeeef         fdZdefdZdefdZedefd            Zedefd            Z	 	 	 	 	 	 	 dGde dee         dee         dededededede!eee"f         e#e          f         fdZ$de%ee&e         f         de%ee#e         f         fdZ'dedefd Z(d!edee         fd"Z)dHd#e#e%eef                  defd$Z*dHd%edefd&Z+	 dHd'e%ee#e         f         d(ede%ee#e         f         fd)Z,dId*ed%ee         d+ede#e         fd,Z-d-e.d.e/d/ed0ed1ee         d2ee         fd3Z0de.j1        e/j2        dd4ddddddddddddfd5e%e#e3         e#e4         e#e5         e#e6         f         d+ed-e.d.e/d/ee         d0ed6ed1ee         d2ee         d7ee         dee         dee         dededededed8ede7f&d9Z8dde.j1        e/j2        dd4ddddddddddddfd*e%e3e5f         d:ee%e3e5f                  d+ed-e.d.e/d/ee         d0ed6ed1ee         d2ee         d7ee         dee         dee         dededededed8ede7f(d;Z9de#e         defd<Z:	 	 dJd=e%ee#e         f         d(ed>ee         defd?Z;	 	 dKd@e%ee<j=        f         dAe!edBf         dCee         dDee         de!edBf         f
dEZ>	 	 	 dLdFZ? xZ@S )MPreTrainedTokenizerFastaQ  
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    Nslow_tokenizer_classc                 h   |                     dd           }|                     dd           }|                     dd           }|                     dd           }|                     dd          }|                     di           }|                    dd          | _        |r|| j        t	          d	          |t          j        |          }	nB||st          j        |          }	n(|rt          |          }	n|t          |                    d
                    }
|
d         d         }|
d         }|
d         }t          ||          \  }	}|                    |           t          |          dk    r|                    |           n| j        !|dur | j        |i |}t          |          }	nZ|sI|                    d
          | _        |                    dg           | _        t          | d          }	d }nt	          d          |	| _        ||                    |j                   d| _        | j        j        }| | j        j        d(i | |                    d|d                    |                    d|d                    |                    d|d                    |                    d|d                    n| j                                         | j        j        }| | j        j        d(i | |                    d|d                    |                    d|d                    |                    d|d                    |                    d|d                    |                    d|d                     t3                      j        d(i | | j        | j        _        d  | j        D             fd!t=          |                                d" #          D             tA          | j!        "                                          d$ D             z   fd%| j#        D             z  t                    dk    rg }| j$        }D ]}tK          |tL                    r|j'        ptQ          |          |v ntQ          |          |v }tK          |tP                    rtM          ||&          }n||_'        |)                    |           |r| *                    |           	 tW          j,        | j-        j.        /                                          }|                    d| j                  | j        k    rFta          tb          |                     d'                    }| j        |d<    |d(i || j-        _.        d S d S # td          $ r Y d S w xY w))Ntokenizer_object__slow_tokenizer	gguf_filer%   	from_slowFadded_tokens_decoderadd_prefix_spacezCannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you have sentencepiece installed.r&   config
model_type	tokenizertokenizer_configr   additional_special_tokensT)from_tiktokena9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.
max_lengthtruncation_side	directionstridetruncation_strategystrategy	pad_tokenpad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofc                 F    h | ]}t          t          |                    S  hashrepr.0tokens     x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py	<setcomp>z3PreTrainedTokenizerFast.__init__.<locals>.<setcomp>   s&    $^$^$^5T$u++%6%6$^$^$^    c                 V    g | ]%\  }}t          t          |                    v#|&S rD   rE   )rI   indexrJ   added_tokens_decoder_hashs      rK   
<listcomp>z4PreTrainedTokenizerFast.__init__.<locals>.<listcomp>   sA     
 
 
uDKK  (AAA AAArM   c                     | d         S Nr   rD   )xs    rK   <lambda>z2PreTrainedTokenizerFast.__init__.<locals>.<lambda>   s    STUVSW rM   keyc                 ,    g | ]}t          |          S rD   )strrH   s     rK   rQ   z4PreTrainedTokenizerFast.__init__.<locals>.<listcomp>   s    ;b;b;b5CJJ;b;b;brM   c                 $    g | ]}|v|v
|S rD   rD   )rI   rJ   encodertokens_to_adds     rK   rQ   z4PreTrainedTokenizerFast.__init__.<locals>.<listcomp>   s7     
 
 
5PWCWCW\aiv\v\vE\v\v\vrM   )specialtyperD   )3popgetr0   r)   
ValueErrorcopydeepcopyTokenizerFast	from_filer   r   r   updatelenr&   r5   
_tokenizerinit_kwargs_decode_use_source_tokenizer
truncationenable_truncation
setdefaultno_truncationpaddingenable_paddingsuper__init__split_special_tokensencode_special_tokensr/   sorteditemslistadded_tokens_encoderkeysall_special_tokens_extendedall_special_tokens
isinstancer   r]   rY   append
add_tokensjsonloadsbackend_tokenizerpre_tokenizer__getstate__getattrpre_tokenizers_fast	Exception)selfargskwargsr+   slow_tokenizerr-   fast_tokenizer_filer.   r/   fast_tokenizer
gguf_paramarchitecturetokenizer_dictr4   additional_kwargs_truncation_paddingtokensspecial_tokensrJ   
is_specialpre_tok_statepre_tok_classrP   r[   r\   	__class__s                          @@@rK   rr   z PreTrainedTokenizerFast.__init__b   sU   !::&8$??$6==JJ{D11	$jj)94@@JJ{E22	%zz*@"EE &

+=u E E 	/D4M4U0  
 '!]+;<<NN ,Y,*45HIINN 	3NCCNN"-fjj.F.FGGJ%h/=L'4N)*<=0F|Uc0d0d-N-MM*+++$%%))/000&2~U7R7R6T6GGGN3NCCNN 	$jj66DO-3ZZ8SUW-X-XD*3DMMMN!NNr   )%MM.4555,1)o0"-DO-<<<<<lK,EFFF/[1IJJJhH(=>>>3[5LMMMMO))+++?**DO*66X666k8K+@AAA18M3JKKKnh{.CDDDlHX,>???2H=Q4RSSS 	""6"""040I-$^$^DD]$^$^$^!
 
 
 
 &';'A'A'C'C X X X
 
 

 t0557788;b;bTa;b;b;bb 
 
 
 
 
#?
 
 
 	
 }!!F!4N& 
% 
% "%446U]Bc%jjN&BU~5 
 eS)) /&ujAAAEE$.EMe$$$$ ('''
	 Jt'='K'X'X'Z'Z[[M  !3T5JKKtOddd '(;]=N=Nv=V=V W W484I017D}7U7U}7U7U&444 ed  	 	 	 DD		s   BV# #
V10V1returnc                     dS )NTrD   r   s    rK   is_fastzPreTrainedTokenizerFast.is_fast   s    trM   c                     d| j         v r]| j         d                             d          r=t          | d          r+| j        r$t          j                            | j                  S dS dS )z
        `bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        r&   z.modelFT)vocab_files_namesendswithhasattrr&   ospathisfiler   s    rK   can_save_slow_tokenizerz/PreTrainedTokenizerFast.can_save_slow_tokenizer   si     4111d6L\6Z6c6cdl6m6m1t\** 7t 7w~~do66654rM   c                 8    | j                             d          S )zP
        `int`: Size of the base vocabulary (without the added tokens).
        Fwith_added_tokensrh   get_vocab_sizer   s    rK   
vocab_sizez"PreTrainedTokenizerFast.vocab_size   s    
 ---FFFrM   c                 8    | j                             d          S )NTr   )rh   	get_vocabr   s    rK   r   z!PreTrainedTokenizerFast.get_vocab   s    ((4(@@@rM   c                 *    |                                  S N)r   r   s    rK   vocabzPreTrainedTokenizerFast.vocab   s    ~~rM   c                 h    d t          | j                                        d           D             S )z
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        c                 $    i | ]\  }}|j         |S rD   contentrI   vks      rK   
<dictcomp>z@PreTrainedTokenizerFast.added_tokens_encoder.<locals>.<dictcomp>       mmmA	1mmmrM   c                     | d         S rS   rD   items    rK   rU   z>PreTrainedTokenizerFast.added_tokens_encoder.<locals>.<lambda>      dhijdk rM   rV   ru   r/   rv   r   s    rK   rx   z,PreTrainedTokenizerFast.added_tokens_encoder   s9     nm0I0O0O0Q0QWkWk)l)l)lmmmmrM   c                 4    | j                                         S )z
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        )rh   get_added_tokens_decoderr   s    rK   r/   z,PreTrainedTokenizerFast.added_tokens_decoder  s     77999rM   c                 h    d t          | j                                        d           D             S )z
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `dict[str, int]`: The added tokens.
        c                 $    i | ]\  }}|j         |S rD   r   r   s      rK   r   z;PreTrainedTokenizerFast.get_added_vocab.<locals>.<dictcomp>  r   rM   c                     | d         S rS   rD   r   s    rK   rU   z9PreTrainedTokenizerFast.get_added_vocab.<locals>.<lambda>  r   rM   rV   r   r   s    rK   get_added_vocabz'PreTrainedTokenizerFast.get_added_vocab  s9     nm0I0O0O0Q0QWkWk)l)l)lmmmmrM   c                     dS )zN
        Returns True, to avoid expensive `assert tokenizer` gotchas.
        TrD   r   s    rK   __bool__z PreTrainedTokenizerFast.__bool__  s	     trM   c                 8    | j                             d          S )zD
        Size of the full vocabulary with the added tokens.
        Tr   r   r   s    rK   __len__zPreTrainedTokenizerFast.__len__  s     ---EEErM   c                     | j         S )zc
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        )rh   r   s    rK   r   z)PreTrainedTokenizerFast.backend_tokenizer%  s    
 rM   c                     | j         j        S )zU
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        )rh   decoderr   s    rK   r   zPreTrainedTokenizerFast.decoder,  s    
 &&rM   FTencodingreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosec	                 F   |	d| j         v }|	d| j         v }|r|j        |g|j        z   }	n|g}	t          t                    }
|	D ]}|
d                             |j                   |r |
d                             |j                   |r |
d                             |j                   |r |
d                             |j                   |r |
d                             |j	                   |r-|
d                             t          |j                             |
|	fS )a  
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        Ntoken_type_idsattention_mask	input_idsspecial_tokens_maskoffset_mappingrA   )model_input_namesoverflowingr   rw   r}   idstype_idsr   r   offsetsrg   )r   r   r   r   r   r   r   r   r   	encodingsencoding_dictes               rK   _convert_encodingz)PreTrainedTokenizerFast._convert_encoding3  sV   ( !($48N$N! ($48N$N!$ 	#)=)I!
X%99II!
I#D)) 	; 	;A+&--ae444$ C./66qzBBB$ I./66q7GHHH) S34;;A<QRRR% B./66qyAAA ;h'..s15zz:::i''rM   r   c                 t     t          |t                    r                     |          S  fd|D             S )aX  
        Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `list[int]`: The token id or list of token ids.
        c                 :    g | ]}                     |          S rD   )#_convert_token_to_id_with_added_voc)rI   rJ   r   s     rK   rQ   zAPreTrainedTokenizerFast.convert_tokens_to_ids.<locals>.<listcomp>p  s'    TTTE88??TTTrM   )r|   rY   r   r   r   s   ` rK   convert_tokens_to_idsz-PreTrainedTokenizerFast.convert_tokens_to_idsb  sF     fc"" 	D;;FCCCTTTTVTTTTrM   rJ   c                 L    | j                             |          }|| j        S |S r   )rh   token_to_idunk_token_id)r   rJ   rO   s      rK   r   z;PreTrainedTokenizerFast._convert_token_to_id_with_added_vocr  s*    ++E22=$$rM   rO   c                 P    | j                             t          |                    S r   )rh   id_to_tokenint)r   rO   s     rK   _convert_id_to_tokenz,PreTrainedTokenizerFast._convert_id_to_tokenx  s    **3u::666rM   
new_tokensc                 n    |r| j                             |          S | j                             |          S r   )rh   add_special_tokensr~   )r   r   r   s      rK   _add_tokensz#PreTrainedTokenizerFast._add_tokens{  s7     	B?55jAAA))*555rM   pairc                 6    | j                             |          S )aG  
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        )rh   num_special_tokens_to_add)r   r   s     rK   r   z1PreTrainedTokenizerFast.num_special_tokens_to_add  s    & 88>>>rM   r   skip_special_tokensc                 <   t          |t                    r| j                            |          S g }|rt	          | j                  nt	                      }|D ]C}t          |          }||v r|                    | j                            |                     D|S )a  
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        )r|   r   rh   r   setall_special_idsr}   )r   r   r   r   ids_to_skiprO   s         rK   convert_ids_to_tokensz-PreTrainedTokenizerFast.convert_ids_to_tokens  s      c3 	4?..s3333FQc$.///CEE 	> 	>EJJE##MM$/55e<<====rM   textr   c                 H     | j         d|||d|                                S )N)r   	text_pairr   rD   )encode_plusr   )r   r   r   r   r   s        rK   tokenizez PreTrainedTokenizerFast.tokenize  s2    tkTTN`kkdjkkrrtttrM   padding_strategyr;   r7   r:   rB   r@   c                    | j         j        | j         j        }|t          j        k    r| j                                          n<|||j        | j        d}d}	nfd|D             }	|	|k    r | j         j        di | |t          j
        k    r|| j                                          dS dS |t          j        k    r|nd}
|
||n| j        | j        | j        | j        |d}||k    r | j         j        di | dS dS )a  
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        N)r7   r:   r<   r9   c                 >    i | ]}|                     |d           S r   r`   )rI   r   r   s     rK   r   zFPreTrainedTokenizerFast.set_truncation_and_padding.<locals>.<dictcomp>  s)    GGG11kooa66GGGrM   )rA   r9   pad_idr=   r?   rB   rD   )rh   rk   ro   r   DO_NOT_TRUNCATErn   valuer8   rl   r   
DO_NOT_PAD
no_padding
MAX_LENGTHr@   pad_token_idr=   r>   rp   )r   r   r;   r7   r:   rB   r@   r   targetcurrentrA   r   s              @rK   set_truncation_and_paddingz2PreTrainedTokenizerFast.set_truncation_and_padding  si   B o0?*"4"DDD&--/// ) /5!1	 F "GGGGGGG&  11;;F;;;999#**,,,,, $# $47Q#Q#QZZW[F -9-E\\4K\+!^#5&8 F 6!!..8888888 "!rM   r   batch_text_or_text_pairsis_split_into_wordsreturn_tensorsrs   c                     t          |t          t          f          s t          dt	          |           d                               ||||||	            j        j        |k    r| j        _         j                            |||          } fd|D             }i }|d         d         D ]fd|D             }||<   d |D             }r;g }t          |          D ]$\  }\  }}||gt          |d	                   z  z  }%||d
<   |d	         D ]}                     ||           t          |||
          S )Nz:batch_text_or_text_pairs has to be a list or a tuple (got ))r   r;   r7   r:   rB   r@   )r   is_pretokenizedc                 J    g | ]}                     |	            S ))r   r   r   r   r   r   r   r   )r   )
rI   r   r   r   r   r   r   r   r   r   s
     rK   rQ   z>PreTrainedTokenizerFast._batch_encode_plus.<locals>.<listcomp>5  sX      
  
  
  ""!&;&;*C+E'=+ # 	 	 
  
  
rM   r   c                 0    g | ]\  }}|         D ]}|S rD   rD   )rI   r   _r   rW   s       rK   rQ   z>PreTrainedTokenizerFast._batch_encode_plus.<locals>.<listcomp>K  s.    NNN74DINNqQNNNNrM   c                 "    g | ]\  }}|D ]}|S rD   rD   )rI   r  r   r   s       rK   rQ   z>PreTrainedTokenizerFast._batch_encode_plus.<locals>.<listcomp>M  s)    SSSWQdSSqSSSSrM   r   overflow_to_sample_mapping)tensor_type)r|   tuplerw   	TypeErrorr^   r  rh   rt   encode_batch	enumeraterg   &_eventual_warn_about_too_long_sequencer   )r   r  r   r   r;   r7   r:   r  rB   r@   r  r   r   r   r   r   r   r   rs   r   tokens_and_encodingssanitized_tokensstacksanitized_encodingsr  itoksr  r   rW   s   `          ```````           @rK   _batch_encode_plusz*PreTrainedTokenizerFast._batch_encode_plus   s   . 2UDMBB 	nTRjMkMknnn  
 	''- 3!1% 	( 	
 	
 	
 ?04HHH4HDO1O00$1/ 1 
 
	 
  
  
  
  
  
  
  
  
  
  
 & 
  
  
( '*1- 	* 	*CNNNN&:NNNE$)S!!SS0DSSS % 	X)+& )*> ? ? K K9D!*qcC[8I4J4J.JJ**=W9:)+6 	X 	XI77	:wWWWW-/BP^____rM   r   c                 @   |r||fgn|g} | j         |fi d|d|d|d|d|d|d|	d|
d	|d
|d|d|d|d|d|d|d||}|3|s1t          d |                                D             |j                  }|                     |d         ||           |S )Nr  r   r   r;   r7   r:   rB   r@   r  r   r   r   r   r   r   r   rs   c                     i | ]>\  }}|t          |          d k    r#t          |d          t                    r|d          n|?S )r   )rg   r|   rw   )rI   rW   r  s      rK   r   z8PreTrainedTokenizerFast._encode_plus.<locals>.<dictcomp>  sW       "U c%jj1nnE!Hd9S9Sn%((Y^  rM   r   )r%  r   rv   r   r  )r   r   r   r   r   r;   r7   r:   r  rB   r@   r  r   r   r   r   r   r   r   rs   r   batched_inputbatched_outputs                          rK   _encode_plusz$PreTrainedTokenizerFast._encode_plus[  s   . 09D$	*++tf00
 
 
 3 3
  21
 .-	

 !4 3
 "z
 6
  21
 &
 *>
 #8"7
 #8"7
 '@&?
 (B'A
 $:#9
  (-!
" G#
$ "6!5'
 
0 !*C!* &4&:&:&<&<   ( N 	33N;4OQ[]deeerM   c                     | j         j        | j         j                            |          nd                    |          S )N )r   r   decodejoinr   s     rK   convert_tokens_to_stringz0PreTrainedTokenizerFast.convert_tokens_to_string  s@     %-9 "*11&999&!!	
rM   	token_idsclean_up_tokenization_spacesc                     |                     dd          | _        t          |t                    r|g}| j                            ||          }||n| j        }|r|                     |          }|S |S )Nuse_source_tokenizerF)r   )r_   rj   r|   r   rh   r-  r1  clean_up_tokenization)r   r0  r   r1  r   r   
clean_texts          rK   _decodezPreTrainedTokenizerFast._decode  s     -3JJ7Mu,U,U)i%% 	$"I%%iEX%YY ,7 )(2 	%
 ( 	33D99JKrM   save_directory
file_names.legacy_formatfilename_prefixc                     t          |          } j        |du rt          d          |du s|du o j        duo j        }|du p|du }|rt          j                            ||r|dz   ndt          z             } fd j        	                                D             }|rZt          |dd	
          5 }	t          j        |ddd          dz   }
|	                    |
           ddd           n# 1 swxY w Y                        ||          }||z   |fz   }|rOt          j                            ||r|dz   ndt          z             } j                            |           ||fz   }|S )z
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        NTzYour tokenizer does not have a legacy version defined and therefore cannot register this version. You might consider leaving the legacy_format at `None` or setting it to `False`.F- c                 2    i | ]\  }}|j         k    ||S rD   )r   )rI   tokrO   r   s      rK   r   z<PreTrainedTokenizerFast._save_pretrained.<locals>.<dictcomp>  s.    vvv*#u]bfjfu]u]u3]u]u]urM   wzutf-8)r      )indent	sort_keysensure_ascii
)r:  )rY   r)   ra   r   r   r   r.  ADDED_TOKENS_FILErx   rv   openr   dumpswritesave_vocabularyTOKENIZER_FILEr   save)r   r7  r8  r9  r:  	save_slow	save_fastadded_tokens_fileadded_vocabfout_strvocab_filesr%   s   `            rK   _save_pretrainedz(PreTrainedTokenizerFast._save_pretrained  s    ^,,$,$1F1F`   d";mt&; -)5-, 	
 "T)C]e-C	 	I "/!Q3!6!6rUf f! ! wvvv8Q8W8W8Y8YvvvK %+S7CCC %q"jQ$]bcccfjjGGGG$$$% % % % % % % % % % % % % % % ..~.__K#k15F4HHJ 	8W\\/!Q3!6!6rUc c N "''777#~&77Js   51C22C69C6c           	      p   t          j        | j                                                  }|                    d          }|                    d          }	d}
|d         d         dk    ri |d         d<   g |d         d<   n|d         d         d	k    r^|d         d
         O|d         d
         }|d         d         |         d         }
|
v r|
         }
d|d         d
<   |
dgg|d         d<   n;|d         d         dv ri |d         d<   nt          d|d         d          d          7d|d         v r-|d         d         v r|d         d                  |d         d<   t          j        t          j        |                    g }|D ]}|                    dd          }|                    dd          }|d         d         d	k    r|sC|d         v r|d                  |d<   |	                    t          d)i |           ||                    |           |d         d         dk    r#d|vr|d         d         |d         d         |d<   |d         d         dk    r#d|vr|d         d         |d         d         |d<   |d         d         d	k    r|
|
|d<   |d         t|d         d         dk    sA|d         d         dk    rPd|d         v rFt          d |d         d         D                       r!t          j                                        |d<   t           |d         d                  } |d)||d|}                    |||           |	,t          j                                                  }d|	v r|	d         D ]}|	d         |         d         }fd |D             }||	d         |         d<   |D ](}                    |          }|t          d!          )fd"|D             |	d         |         d#<   d$D ]L}||	v rF|	|         \  }}|v r|         }                    |          }|t          d!          ||g|	|<   M|	|d<   t          j        t          j        |                    | j                                        }t*          j                                        }|                    d%           |D ]}t1          | |          t1          | |          }|v r|         }| j                            |d          }t7          |t                    r-t          ||j        |j        |j        |j        d&'          ||<   |||<   | j         }||                    |           tC          |          dk    r||d%<    | j"        d)d(i|S )*uf  
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `list[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        added_tokenspost_processorNmodelr^   r!   r   mergesr"   unk_idr   g        )r#   r$   z;This method does not support this type of tokenizer (found z-) only BPE, Unigram, WordLevel and WordPiece.	unk_tokenr]   idr   continuing_subword_prefixend_of_word_suffixr   	ByteLevelSequencepretokenizersc              3   .   K   | ]}|d          dk    V  dS )r^   r_  NrD   )rI   pretokenizers     rK   	<genexpr>zBPreTrainedTokenizerFast.train_new_from_iterator.<locals>.<genexpr>P  s@        $ !(K7     rM   initial_alphabet)r   r   )rA   trainerr   r   c                 <    g | ]}                     ||          S rD   r  )rI   rJ   special_tokens_maps     rK   rQ   zCPreTrainedTokenizerFast.train_new_from_iterator.<locals>.<listcomp>b  s*    ![![![5"4"8"8"F"F![![![rM   zQAttempted to set a token in the post processor that does not exist in the mappingc                 :    g | ]}                     |          S rD   )r   )rI   rJ   r3   s     rK   rQ   zCPreTrainedTokenizerFast.train_new_from_iterator.<locals>.<listcomp>k  s)    CuCuCuejIDYDYZ_D`D`CuCuCurM   r   )clssepr5   T)single_wordlstriprstrip
normalizedr]   r+   rD   )#r   r   rh   to_strr_   ra   rd   from_strrH  r}   r   extendanyr   r_  alphabetMODEL_TO_TRAINER_MAPPINGtrain_from_iteratorr   ri   rb   r   SPECIAL_TOKENS_ATTRIBUTESremover   _special_tokens_mapr`   r|   rl  rm  rn  ro  r5   rg   r   )r   text_iteratorr   rA   new_special_tokensrh  r   tokenizer_jsonrV  rW  r[  rZ  r   added_tokenr]   r  trainer_classrf  trained_tokenizer_jsonrW   r   rJ   token_idspecial_tokenspecial_tokens_listspecial_token_fullr5   r3   s        `                     @rK   train_new_from_iteratorz/PreTrainedTokenizerFast.train_new_from_iterator  s;   D DO$:$:$<$<==%)).99'++,<==	'"6*e33/1N7#G,02N7#H--G$V,	99g&x0<'0:*73G<VDQG	%1iCU6U6U 29 =I45w'15>4D3Ew'0G$V,0JJJ/1N7#G,,>n]dNeflNm > > >   *~g666w'48JJJ3EnU\F]^iFj3kN7#K0!*4:n+E+EFF	 ' 	= 	=K!ooi66Gd++Ag&v.);;G;!-+i2HL^2^2^);K	<R)SI&!!*";";{";";<<<<)!!"4555 7#F+u44+699w'(CDP2@2IJe2fF./7#F+u44$F22w'(<=I+9'+BCW+XF'('"6*i77I<Q"+F;/*6/7;FF!/26:jHH#~o'FFF  (6(G(X     G .A-J-S-S-U-U)*01H1PQ-_:n__X^__%%mFG%TTT%%)Z	0@0@0B0B%C%C">11)*:; v vC+,<=cB8LF)5![![![![TZ![![![FLN#34S9(C!'  #,#8#8#?#?#+", s# #  ,
 DvCuCuCuntCuCuCuN#34S9%@@!/ 
F 
F N22-m<HE1)5%CU:U:U 25 9(44U;;H'(o   6;H4EN=17E"#34%.tz:P/Q/QRRI!&&((0JOOQQ""#>???( 	2 	2EtU##/ 'e 4 4%1mGY6Y6Y$6}$EM%)%=%A%A%%N%N"0*== 2$.%$6$B1818#5#@ $% % %F5MM %2F5M$($B!)%,,-?@@@())A--2KF./t~CCyCFCCCrM   )NNFFFFT)F)NF)FN)NN)NNN)A__name__
__module____qualname____doc__VOCAB_FILES_NAMESr   r)   r   r^   r   __annotations__rr   propertyboolr   r   r   r   dictrY   r   r   rx   r   r/   r   r   r   rd   r   DecoderFastr   EncodingFastr  r   rw   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r   r   r   r   r   r%  r*  r/  r6  r   PathLikerT  r  __classcell__)r   s   @rK   r(   r(   Q   s        
 
 *@D(4(;#<=DDDz z z z zx     X     X GC G G G XGA4S> A A A A  tCH~       X  nd38n n n n Xn :d3
?&; : : : X:nc3h n n n n$    F F F F F =    X ' ' ' ' X' 1504*/+0',#-( -(-(  (~-(  (~	-(
 $(-( %)-( !%-( -( -( 
tCH~tL11	2-( -( -( -(^UE#x}2D,E U%PSUYZ]U^P^J_ U U U U      7# 7(3- 7 7 7 76 6d5j+A&B 6]` 6 6 6 6? ?d ?s ? ? ? ?, GL d3i(?C	sDI~	   8u uS u uRV umqrumv u u u uI9)I9 0I9 	I9
 I9 %SMI9 smI9 I9 I9 I9` $(,;,F2D2T$($),0&*(,0404*/+0',#%*+Y` Y`"'OT-0$7H2I4PeKff#
Y`
 !Y` *Y` 0Y` SMY` Y` "Y` %SMY` smY` !Y`  (~Y`  (~Y`  $(!Y`" %)#Y`$ !%%Y`& 'Y`( )Y`* #+Y`, 
-Y` Y` Y` Y`| DH#',;,F2D2T$($),0&*)-0404*/+0',#%*); ;I001; E)->">?@; !	;
 *; 0; SM; ; "; %SM; sm; !;  (~;  (~; $(;  %)!;" !%#;$ %;& ';( #);, 
-; ; ; ;z
tCy 
S 
 
 
 
 %*7;	 d3i( " '/tn	 
   8 )-)-/ /c2;.// #s(O/  ~	/
 "#/ 
sCx/ / / /j rD rD rD rD rD rD rD rDrM   r(   )=r  rb   r   r   collectionsr   collections.abcr   typingr   r   r   tokenizers.pre_tokenizerspre_tokenizersr   
tokenizersr   r  r	   rd   tokenizers.decodersr
   r  tokenizers.trainersr   r   r   r   r   integrations.ggmlr   modeling_gguf_pytorch_utilsr   tokenization_utilsr   tokenization_utils_baser   r   r   r   r   r   r   r   r   r   utilsr   r   r    
get_loggerr  loggerrK  SPECIAL_TOKENS_MAP_FILETOKENIZER_CONFIG_FILETIKTOKEN_VOCAB_FILErF  ru  r  r(   rD   rM   rK   <module>r     s   
   				 # # # # # # $ $ $ $ $ $ ' ' ' ' ' ' ' ' ' ' 7 7 7 7 7 7 / / / / / / 1 1 1 1 1 1 6 6 6 6 6 6 ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ : : : : : : 5 5 5 5 5 5 = = = = = = 3 3 3 3 3 3                        @ ? ? ? ? ? ? ? ? ? 
	H	%	% "3 / '  (      !!	   (6EXYY  ,--HD HD HD HD HD5 HD HD .-HD HD HDrM   