
    *`iY              	          d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlZd dlZd dlmZmZ d dlmZmZmZmZ d dlmZ  ej        d	ed
            ej        e          Zdee	z  defdZ  G d de          Z! G d de          Z" G d de          Z# G d de          Z$ G d de          Z%	 dde&e!         de'dz  de(e)e'f         fdZ*dS )    N)cached_property)groupby)Path)	TypedDict)AudioConfigAudioSpectrogramConfig)SpecialTokenPolicySpecialTokens	TokenizerTokenizerVersion)ImageConfigoncez%.*`get_control_token` is deprecated.*)actioncategorymessagepathreturnc                     t          | t                    rt          |           } |                                 od| j        v o
| j        dk    S )z3Check if the given path is a tekken tokenizer file.tekken.json)
isinstancestrr   is_filenamesuffix)r   s    {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/mistral_common/tokens/tokenizers/tekken.py	is_tekkenr       sE    $ Dzz<<>>Nh$)3Nw8NN    c                   8    e Zd ZU dZeed<   eed<   edz  ed<   dS )	TokenInfozToken information in the JSON file.

    Attributes:
        rank: The rank of the token.
        token_bytes: The token in bytes, base64 encoded.
        token_str: The token in string format.
    ranktoken_bytesN	token_str)__name__
__module____qualname____doc__int__annotations__r    r   r   r    r    (   sC           IIITzr   r    c                   2    e Zd ZU dZeed<   eed<   eed<   dS )SpecialTokenInfozSpecial token information in the JSON file.

    Attributes:
        rank: The rank of the token.
        token_str: The token in string format.
        is_control: Whether the token is a control token.
    r!   r#   
is_controlN)r$   r%   r&   r'   r(   r)   r   boolr*   r   r   r,   r,   6   s<           IIINNNr   r,   c                   F    e Zd ZU dZeed<   eed<   eed<   eed<   eed<   dS )TekkenConfigaX  Tekken configuration in the JSON file.

    Attributes:
        pattern: The pattern of the tokenizer.
        num_vocab_tokens: The number of vocabulary tokens.
        default_vocab_size: The default vocabulary size.
        default_num_special_tokens: The default number of special tokens.
        version: The version of the tokenizer.
    patternnum_vocab_tokensdefault_vocab_sizedefault_num_special_tokensversionN)r$   r%   r&   r'   r   r)   r(   r*   r   r   r0   r0   D   sR           LLL ####LLLLLr   r0   c                   x    e Zd ZU dZee         ed<   ee         dz  ed<   eed<   e	ed<   e
ed<   eed<   eed	<   dS )
	ModelDataa2  The data of the tekken tokenizer model.

    Attributes:
        vocab: The vocabulary of the tokenizer.
        config: The configuration of the tokenizer.
        version: The version of the tokenizer.
        type: The type of the tokenizer.
        image: The image configuration of the tokenizer.
    vocabNspecial_tokensconfigr5   typeimageaudio)r$   r%   r&   r'   listr    r)   r,   r0   r(   r   r   r   r*   r   r   r7   r7   V   s{           	?)*T1111LLL
IIIr   r7   c                      e Zd ZdZ edej        d           edej        d           edej        d           edej	        d           edej
        d           ed	ej        d           ed
ej        d           edej        d           edej        d           edej        d           edej        d           edej        d           edej        d           edej        d           edej        d           edej        d           edej        d           edej        d           edej        d           edej        d          fZdZddddddee         dee         deded ed!e d"ed#ee!z  dz  d$e"dz  d%e#dz  fd&Z$e%d'e!fd(            Z&e'd)e(d          d*ee!z  d'd fd+            Z)e%d'e"dz  fd,            Z*e*j+        d-e"d'dfd.            Z*e%d'e#dz  fd/            Z,e,j+        d-e#d'dfd0            Z,e%d'efd1            Z-e%d'efd2            Z.e%d'e fd3            Z/e%d'e0fd4            Z1e1j+        d5e0d'dfd6            Z1e2d'efd7            Z3e2d'efd8            Z4e2d'efd9            Z5e2d'efd:            Z6d'ee         fd;Z7d<ed=e8d>e8d'ee         fd?Z9d@ee         dAe0d'ee         fdBZ:dCed'e8fdDZ;d<ed'efdEZ<dFee=j>        z  ez  d'e8fdGZ?d<ed'efdHZ@dNd@ee         dAe0dz  d'efdIZAd@ee         d'efdJZBd@ee         d'efdKZCdCed'efdLZDdNdCedAe0dz  d'eEfdMZFdS )O
TekkenizerzTekken tokenizer.

    This tokenizer is based on the [tiktoken](https://github.com/openai/tiktoken) library. It fastens the tokenization
    for multiple languages.
    r   Tr!   r#   r-                           	   
                              z<SPECIAL_{id}>
tekkenizerN)r   _pathimage_configaudio_configr8   r9   r1   
vocab_sizenum_special_tokensr5   r   rV   rW   rX   c                    |t          |          |z   k    sJ |t          |          |f            | _        t          t          d |D                                 }t          |          |k    sJ d|             t          |          |k    sJ  fdt          t          |          |          D             }|r8t                              d|d         d          d|d         d                     ||z   }t          t          d	 |D                                 t          |          cxk    r|k    sn J |            ||z
  }t                              d
| d| d           t          ||           _        t          t          |                    t           j                                                  k    sJ | j        f            t          j
        || j        i            _        | _        |	 _        |
 _        | _        d |D              _        d |D              _         fdt          |          D              _        t&          j         _        |t-          |          nd _        dS )a  Initialize the tekken tokenizer.

        Args:
            vocab: The vocabulary of the tokenizer.
            special_tokens: The special tokens of the tokenizer.
            pattern: The pattern of the tokenizer.
            vocab_size: The vocabulary size of the tokenizer.
            num_special_tokens: The number of special tokens of the tokenizer.
            version: The version of the tokenizer.
            name: The name of the tokenizer.
            image_config: The image configuration of the tokenizer.
        c                     g | ]
}|d          S r#   r*   .0ts     r   
<listcomp>z'Tekkenizer.__init__.<locals>.<listcomp>   s    -U-U-Uan-U-U-Ur   zSpecial tokens must be unique: c                 f    g | ]-}t          |j                            |           d          .S ))idTrA   )r,   SPECIAL_TOKEN_TEMPLATEformatr_   iselfs     r   ra   z'Tekkenizer.__init__.<locals>.<listcomp>   sN     
 
 
 !t/J/Q/QUV/Q/W/Wdhiii
 
 
r   zAdding special tokens r   r#   z, ..., c                     g | ]
}|d          S r]   r*   r^   s     r   ra   z'Tekkenizer.__init__.<locals>.<listcomp>   s    ???1+???r   zNon special vocabulary size is z with z special tokens.)	max_vocab)r   pat_strmergeable_ranksr9   c                     h | ]
}|d          S )r!   r*   r^   s     r   	<setcomp>z&Tekkenizer.__init__.<locals>.<setcomp>   s    "E"E"E1V9"E"E"Er   c                 ,    i | ]}|d          |d         S )r#   r!   r*   r^   s     r   
<dictcomp>z'Tekkenizer.__init__.<locals>.<dictcomp>   s"    -`-`-`Aanai-`-`-`r   c                 :    g | ]}                     |          S r*   )id_to_piecerf   s     r   ra   z'Tekkenizer.__init__.<locals>.<listcomp>   s'    FFFqt''**FFFr   N)len_vocab_sizesetrangeloggerinfo_reload_mergeable_ranks_tekken_token2id_nospecialvaluestiktokenEncoding_model_version_image_config_audio_config_all_special_tokens_special_token_ids_special_tokens_reverse_vocab_vocabr	   IGNORE_special_token_policyr   
_file_path)rh   r8   r9   r1   rY   rZ   r5   r   rV   rW   rX   num_defined_special_tokensspecial_fillerinner_vocab_sizes   `             r   __init__zTekkenizer.__init__   s   4 SZZ*<<<<<JJ?
<<<
 & &)-U-Un-U-U-U)V)V%W%W">""&@@@@BtdrBtBt@@@>""&88888
 
 
 
3~..0BCC
 
 
  	KKq):;)GqqP^_aPbcnPoqq   (.83?????@@AASEXEXnnnn\nnnnnn onn &(:: 	r6FrrN`rrrsss*A%Sc*d*d*d'5)**++s43R3Y3Y3[3[/\/\\\\+_
\\\ ' ;	
 
 
  ))#1 "E"En"E"E"E-`-`Q_-`-`-`*FFFFE*4E4EFFF%7%>").):$u+++r   r   c                 <    | j         t          d          | j         S )zThe path to the tokenizer file.Nz)The tokenizer was not loaded from a file.)r   
ValueErrorrh   s    r   	file_pathzTekkenizer.file_path   s#     ?"HIIIr   clsr   c                 2   t          |t                    rt          |          }|                                s
J |            t	          |dd          5 }t          j        |          }ddd           n# 1 swxY w Y   |d                             d          }|t          j	        vr/t          d| d| d	t          t          j	                             |J t          |          }|                    d
d          }|@|t          d          k    rt          d| d          t          t          j                  }nd |D             }||d
<   |                    d          x}r>|t          d          k    rt          d| d|j         d          t          di ||d<   n&|                    d          x}	rt          di |	|d<   |                    d          x}
r2|
                    d          }t#          di |}t%          dd|i|
|d<   |} | |d         ||d         d         |d         d         |d         d         ||j                            dd          |                    d          |                    d          |
  
        S ) zLoad the tekken tokenizer from a file.

        Args:
            path: The path to the tokenizer file.

        Returns:
            The tekken tokenizer.
        rutf-8)encodingNr:   r5   zUnknown version: z in z+. Make sure to use a valid version string: r9   v7zSpecial tokens not found in zL. Please update your tokenizer file and include all special tokens you need.c                     g | ]}|S r*   r*   )r_   tokens     r   ra   z(Tekkenizer.from_file.<locals>.<listcomp>  s    FFFeFFFr   
multimodalv11z-The image config has to be called 'image' in z for tokenizers of version .r<   r=   audio_encoding_configencoding_configr8   r1   r3   r4   r    )
r8   r9   r1   rY   rZ   r5   r   rW   rX   rV   r*   )r   r   r   existsopenjsonloadgetr   __members__r   r>   r@   DEPRECATED_SPECIAL_TOKENSvaluer   popr   r   r   replace)r   r   funtyped_version_strr5   special_tokens_dictsr9   mmr<   r=   r   
model_datas                r   	from_filezTekkenizer.from_file   s`    dC   	::D{{}}""d""}$g... 	#!illG	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# x(,,Y77/;;;aL a ad a a<@AQA]<^<^a a  
 '''"<00>EkkJZ\`>a>a')$//// a4 a a a  
 "&j&J!K!KFF1EFFFN$2 !\***2 		4)%0000 uDuueleruuu    +00R00GGkk'***U 	4*33U33GGKK(((5 	U#ii(?@@O4GGGGO*TT?TeTTGG '
sW%)x(3!(+,@A)(34PQ""7B//#00#00
 
 
 	
s   A66A:=A:c                     | j         S )z)The image configuration of the tokenizer.)r   r   s    r   r<   zTekkenizer.image+  s     !!r   r   c                      t          d          )Nz!Can only set Image config at initr   rh   r   s     r   r<   zTekkenizer.image0      <===r   c                     | j         S )zThe audio configuration of the tokenizer.

        Returns:
             The audio configuration object if it exists, otherwise None.
        )r   r   s    r   r=   zTekkenizer.audio4  s     !!r   c                      t          d          )Nz!Can only set Audio config at initr   r   s     r   r=   zTekkenizer.audio=  r   r   c                 *    t          | j                  S )z.The number of special tokens of the tokenizer.)rt   r   r   s    r   rZ   zTekkenizer.num_special_tokensA  s     4+,,,r   c                     | j         S )z!Vocabulary size of the tokenizer.)ru   r   s    r   n_wordszTekkenizer.n_wordsF  s     r   c                     | j         S )zThe version of the tokenizer.)r   r   s    r   r5   zTekkenizer.versionK  s     }r   c                     | j         S )z'The policy for handling special tokens.)r   r   s    r   special_token_policyzTekkenizer.special_token_policyP  s     ))r   policyc                     t          |t                    s t          dt          |           d          t	          j        dt                     || _        dS )z+Set the policy for handling special tokens.z!Expected SpecialTokenPolicy, got r   zThe attributed `special_token_policy` is deprecated and will be removed in 1.10.0. Please pass a special token policy explicitly to the relevant methods.N)r   r	   r   r;   warningswarnFutureWarningr   )rh   r   s     r   r   zTekkenizer.special_token_policyU  sg     &"455 	RPfPPPQQQY 	
 	
 	
 &,"""r   c                 ,    |                      d          S )z#The beginning of sentence token id.z<s>get_special_tokenr   s    r   bos_idzTekkenizer.bos_ide  s     %%e,,,r   c                 ,    |                      d          S )zThe end of sentence token id.z</s>r   r   s    r   eos_idzTekkenizer.eos_idj  s     %%f---r   c                 ,    |                      d          S )zThe padding token id.z<pad>r   r   s    r   pad_idzTekkenizer.pad_ido       %%g...r   c                 ,    |                      d          S )zThe unknown token id.z<unk>r   r   s    r   unk_idzTekkenizer.unk_idt  r   r   c                     | j         S )a2  All tokens in the vocabulary as strings.

        Note:
           This will collapse all tokens for which we have a decoding error into
           the <?> string. This is bad and results in things like len(set(vocab)) != len(vocab)).

        Returns:
            The vocabulary of the tokenizer.
        )r   r   s    r   r8   zTekkenizer.vocaby  s     {r   sboseosc                       j                             |          } fd|D             }|r
 j        g|}|rg | j        }|S )a  Encode a string into a list of token ids.

        Args:
            s: The string to encode.
            bos: Whether to add the beginning of sentence token.
            eos: Whether to add the end of sentence token.

        Returns:
            The list of token ids.
        c                 $    g | ]}|j         z   S r*   rZ   r_   r`   rh   s     r   ra   z%Tekkenizer.encode.<locals>.<listcomp>  s!    >>>!!d-->>>r   )r   encoder   r   )rh   r   r   r   tokenss   `    r   r   zTekkenizer.encode  sh     !K..q11>>>>v>>> 	,k+F+F 	,+v+t{+Fr   r   r   c                     g }t          | fd          D ]\  }}|rt|t          j        k    r t          dt	          |           d          |t          j        k    r"|                     fd|D                        i|t          j        k    rz{|                     j	        
                     fd|D                                  |S )Nc                     | j         k     S Nr   )r`   rh   s    r   <lambda>z(Tekkenizer._decode_all.<locals>.<lambda>  s    1t?V;V r   z/Decoding `tokens` that contain special tokens (a  ) is not allowed. 
Either make sure `tokens` do not include any special tokens or, if you want to decode `tokens` that includes special tokens, change the tokenizer's special token policy to IGNORE or KEEP: 
```
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.tekken import SpecialTokenPolicy

tokenizer = MistralTokenizer.v3(is_tekken=True)
tekken = tokenizer.instruct_tokenizer.tokenizer
tekken.special_token_policy = SpecialTokenPolicy.IGNORE  # or SpecialTokenPolicy.KEEP
```c              3   >   K   | ]}j         |         d          V  dS )r#   N)r   r   s     r   	<genexpr>z)Tekkenizer._decode_all.<locals>.<genexpr>  s1      "["[PQ4#;A#>{#K"["["["["["[r   c                 $    g | ]}|j         z
  S r*   r   r   s     r   ra   z*Tekkenizer._decode_all.<locals>.<listcomp>  s"    2^2^2^ST1t7N3N2^2^2^r   )r   r	   RAISEr   r>   KEEPextendr   appendr   decode)rh   r   r   decoded
is_specialgroups   `     r   _decode_allzTekkenizer._decode_all  s   !(1V1V1V1V!W!W 	a 	aJ a'+=+CCC$	 $u++ 	  	  	    *-?-DDDNN"["["["[UZ"["["[[[[[)-?-FFF G
 t{112^2^2^2^X]2^2^2^__````r   token_idc                 2    d|| j         z
  cxk    odk     nc S )z$Check if a token id is a byte token.r      r   rh   r   s     r   is_bytezTekkenizer.is_byte  s,    Ht66<<<<<<<<<r   c                 R    || j         v r| j         |         S t          d|           )z$Get the token id of a special token.zUnknown control token )r   r   rh   r   s     r   r   zTekkenizer.get_special_token  s5    2225a889a99:::r   r   c                     t          |t          t          j        f          r	|| j        v S t          |t
                    r	|| j        v S t          dt          |          j	                   )z7Return `True` if the passed `token` is a special token.zExpected int or str, got )
r   r(   npintegerr   r   r   	TypeErrorr;   r$   )rh   r   s     r   r   zTekkenizer.is_special  sk    ec2:.// 	PD333s## 	PD>>>NU8LNNOOOr   c                 `    t          j        dt                     |                     |          S )NzC`get_control_token` is deprecated. Use `get_special_token` instead.)r   r   r   r   r   s     r   get_control_tokenzTekkenizer.get_control_token  s*    []jkkk%%a(((r   c                    |5t          |t                    s t          dt          |           d          |*t	          j        d| j         dt                     | j        }d                    | 	                    ||                    S )a  Decode a list of token ids into a string.

        Args:
            tokens: The list of token ids to decode.
            special_token_policy: The policy for handling special tokens.
                Use the tokenizer's [attribute][mistral_common.tokens.tokenizers.tekken.Tekkenizer.special_token_policy]
                if `None`. Passing `None` is deprecated and will be changed
                to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        NzFExpected `special_token_policy` to be None or SpecialTokenPolicy, got r   ,Using the tokenizer's special token policy () is deprecated. It will be removed in 1.10.0. Please pass a special token policy explicitly. Future default will be SpecialTokenPolicy.IGNORE.r   r   )
r   r	   r   r;   r   r   r   r   joinr   )rh   r   r   s      r   r   zTekkenizer.decode  s      +J?SUg4h4h+vY]^rYsYsvvv    'MH4C] H H H
    $(#= wwt''EY'ZZ[[[r   c                 `    t          j        dt                     |                     |          S )z[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

        Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

        This is a convenient method for debugging.
        z`to_string` is deprecated and will be removed in 1.10.0. Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.)r   r   r   
_to_stringrh   r   s     r   	to_stringzTekkenizer.to_string  s7     	\ 	
 	
 	
 v&&&r   c                 D    |                      |t          j                  S )Nr   r   r	   r   r   s     r   r   zTekkenizer._to_string  s    {{68J8O{PPPr   c                 F    |                      |gt          j                  S )z0Convert a token id to its string representation.r   r   r   s     r   rs   zTekkenizer.id_to_piece  s    {{H:<N<S{TTTr   c                    |*t          j        d| j         dt                     | j        }|| j        k     r||t
          j        k    r&| j        |         d                             d          S |t
          j	        k    rt          | d          |t
          j        k    rdS t          d|           | j                            || j        z
            S )	a  Convert a token id to its byte representation.

        Args:
            token_id: The token id to convert.
            special_token_policy: The policy for handling special tokens.
                Use the tokenizer's [attribute][mistral_common.tokens.tokenizers.tekken.Tekkenizer.special_token_policy]
                if `None`. Passing `None` is deprecated and will be changed
                to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The byte representation of the token.
        Nr   r   r#   r   z is a special tokenr   zUnknown special token policy )r   r   r   r   rZ   r	   r   r   r   r   r   r   r   decode_single_token_bytes)rh   r   r   s      r   id_to_byte_piecezTekkenizer.id_to_byte_piece  s      'MH4C] H H H
    $(#= d---#'9'>>>/9+FMMgVVV%);)AAA H!A!A!ABBB%);)BBBs !WAU!W!WXXX{44X@W5WXXXr   r   )Gr$   r%   r&   r'   r,   r
   unkr   r   
begin_instend_instbegin_tools	end_toolsbegin_tool_resultsend_tool_results
tool_callsimgpad	img_breakimg_endprefixmiddler   begin_system
end_systembegin_tool_contentr   rd   r>   r    r   r(   r   r   r   r   r   propertyr   classmethodr;   r   r<   setterr=   rZ   r   r5   r	   r   r   r   r   r   r   r8   r.   r   r   r   r   r   r   r   r   r   r   r   rs   bytesr  r*   r   r   r@   r@   j   s         	a=+<NNNa=+<NNNa=+<NNNa=+CPTUUUa=+AdSSSa=+DQUVVVa=+BtTTTa=+KX\]]]a=+IVZ[[[a=+CPTUUUbM,=$OOObM,=$OOObM,CPTUUUbM,AdSSSbM,@TRRRbM,@TRRRbM,@TRRRbM,FSWXXXbM,DQUVVVbM,LY]^^^)!. . !#'+/+/NE NE NEINE -.NE 	NE
 NE  NE "NE NE TzD NE "D(NE "D(NE NE NE NE` 4    X E
tL) E
t E
 E
 E
 E
 [E
N "{T) " " " X" \>; >4 > > > \> "{T) " " " X" \>; >4 > > > \> -C - - - X-         X  )    X *&8 * * * X*  ,+= ,$ , , , ! , - - - - _- . . . . _. / / / / _/ / / / / _/tCy    " $ T d3i    &$s) CU Z^_bZc    8= = = = = =;3 ;3 ; ; ; ;Pbj 03 6 P4 P P P P)3 )3 ) ) ) )\ \T#Y \>PSW>W \cf \ \ \ \@'S	 'c ' ' ' ' Qc Qs Q Q Q QUC UC U U U U#Y #Y #YDVY]D] #Yin #Y #Y #Y #Y #Y #Yr   r@   r8   rk   c                    |tt          |           |k    sJ t          |           |f            t          |           |k    r5| d|         } t                              dt          |            d           i }t          |           D ]z\  }}|                                h dk    sJ |d         |k    sJ t          j        |d                   }|dk    s |t          |g          k    sJ ||f            |d         ||<   {t          |          t          |           k    sJ t          |	                                          t          t          t          |                              k    sJ |S )zAReload our tokenizer JSON file and convert it to Tiktoken format.Nz(Cutting non special vocabulary to first z tokens.>   r!   r#   r"   r!   r"   r   )rt   rx   ry   	enumeratekeysbase64	b64decoder  rv   r|   rw   )r8   rk   ranksrg   xmerges         r   rz   rz   .  sv   
 5zzY&&&UY(?&&&u::	!!*9*%EKKW3u::WWWXXX !E%   ! !1vvxx???????yA~~~~ =!122Cxx5E1#JJ...E
...ye u::U####u||~~#eCJJ&7&7"8"88888Lr   r   )+r  r   loggingr   	functoolsr   	itertoolsr   pathlibr   typingr   numpyr   r}   &mistral_common.tokens.tokenizers.audior   r   %mistral_common.tokens.tokenizers.baser	   r
   r   r   &mistral_common.tokens.tokenizers.imager   filterwarningsr   	getLoggerr$   rx   r   r.   r   r    r,   r0   r7   r@   r>   r(   dictr  rz   r*   r   r   <module>r-     s       % % % % % %                        V V V V V V V V            ? > > > > >  4    
	8	$	$OC$J O4 O O O O    	       y       9   $    	   (AY AY AY AY AY AY AY AYL ! 	?Tz 
%*     r   