
    *`i'                     T   d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dl	m
Z
 d dlmZmZ d dlmZmZmZ d dlmZmZ  ej        ded	
            ej        ded
            e            rd dlmZ deez  defdZddeez  dedefdZdeez  dedz  fdZ G d de          ZdS )    N)cached_property)Path)TokenizerException)assert_sentencepiece_installedis_sentencepiece_installed)SpecialTokenPolicy	TokenizerTokenizerVersion)ImageConfigMultiModalVersiononcez%.*`get_control_token` is deprecated.*)actioncategorymessagez#.*`_control_tokens` is deprecated.*)SentencePieceProcessorpathreturnc                 <    t           t                    rt                      t          t          j                  }t          t          j                  dgz   fd|D             dgz   }                                 ot           fd|D                       S )z1Check if the given path is a SentencePiece model. c                 (    g | ]}D ]	}d | | 
S )z.model. ).0vmmm_versionss      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/mistral_common/tokens/tokenizers/sentencepiece.py
<listcomp>z$is_sentencepiece.<locals>.<listcomp>)   s4    QQQQ[QQ ! Q  QQQQ    z.modelc              3   L   K   | ]}j                             |          V  d S N)nameendswith)r   suffixr   s     r   	<genexpr>z#is_sentencepiece.<locals>.<genexpr>+   s3      !T!T$)"4"4V"<"<!T!T!T!T!T!Tr   )	
isinstancestrr   listr
   __members__r   is_fileany)r   instruct_versionssuffixesr   s   `  @r   is_sentencepiecer-   "   s    $ Dzz-9::(455<KQQQQ*;QQQU]T^^H<<>>Tc!T!T!T!T8!T!T!TTTTr   Ftokenizer_filenameraise_deprecatedc                 J   t          |           } |                     d          d         }|dk    r|                    d          d         }|dk    r$|rt          d|  d          t          d          S |t          j        vrt          d	|            t          |          S )
z3Get the version of the tokenizer from the filename..modelr   r   z4Make sure to rename your tokenizer file to end with z.v1.v1!Unrecognized tokenizer filename: )r&   splitr   r
   r(   )r.   r/   _version_strs      r   get_spm_versionr8   .   s    /00%++C004Lw#))#..q1w 	v$%t\n%t%t%tuuu  %%%+777 !YEW!Y!YZZZL)))r   c                    t          |           } |                     d          d         }|dk    sd|vrdS d|                    d          d         z   }|t          j        vrt	          d|            t          |          j        S )z1Get the image config from the tokenizer filename.r1   r2   r3   r   Nr5   )r&   r6   r   r(   r   config)r.   r7   _mm_version_strs      r   get_image_configr<   C   s    /00%++C004Lw#\"9"9tL..s33B77O/;;; !YEW!Y!YZZZ_--44r   c            	       <    e Zd ZdZd deez  dedz  ddf fdZedefd            Z	edefd            Z
d	edefd
Zd	edefdZedefd            Zdee         fdZedefd            Zedefd            Zdeej        z  ez  defdZedee         fd            Zd	edededee         fdZd dee         dedz  defdZdedefdZdee         dedefdZdee         defdZdee         defdZ edefd            Z!edefd            Z" xZ#S )!SentencePieceTokenizerzC[SentencePiece](https://github.com/google/sentencepiece) tokenizer.N
model_pathtokenizer_versionr   c                     t                       t          j         j        j                   _        t          j                            |          s
J |            t          t          |t                    r|n|                                           _         j                                         j                                        k    sJ  fdt!           j                  D              _        |pt'          |d           _        t+          |           _        t/                                                       dS )zInitialize the `SentencePieceTokenizer`.

        Args:
            model_path: The path to the `SentencePiece` model.
            tokenizer_version: The version of the tokenizer. If not provided, it will be inferred from the model path.
        )
model_filec                 D    g | ]}j                             |          S r   _modelid_to_piece)r   iselfs     r   r   z3SentencePieceTokenizer.__init__.<locals>.<listcomp>g   s)    OOOat{..q11OOOr   F)r/   N)r   logging	getLogger	__class____name___loggerosr   isfiler   r%   r&   as_posixrE   
vocab_sizeget_piece_sizerangen_words_vocabr8   _versionr   
_file_pathsuper__init__)rH   r?   r@   rK   s   `  r   rY   zSentencePieceTokenizer.__init__V   s    	'(((()@AAw~~j))55:55),%/
C%@%@[zzjFYFYF[F[
 
 
 {%%''4;+E+E+G+GGGGGOOOO5;N;NOOO*;*rzlq?r?r?rz**r   c                     | j         S )z The path to the tokenizer model.)rW   rH   s    r   	file_pathz SentencePieceTokenizer.file_pathn   s     r   c                     | j         S )zThe version of the tokenizer.)rV   r[   s    r   versionzSentencePieceTokenizer.versions   s     }r   sc                 6    | j                             |          S )z+Get the special token for the given string.)rE   piece_to_idrH   r_   s     r   get_special_tokenz(SentencePieceTokenizer.get_special_tokenx   s    {&&q)))r   c                 `    t          j        dt                     |                     |          S )NzC`get_control_token` is deprecated. Use `get_special_token` instead.)warningswarnFutureWarningrc   rb   s     r   get_control_tokenz(SentencePieceTokenizer.get_control_token|   s*    []jkkk%%a(((r   c                 4    | j                                         S )z!Vocabulary size of the tokenizer.)rE   rQ   r[   s    r   rT   zSentencePieceTokenizer.n_words   s     {%%'''r   c                     | j         S )z(All tokens in the vocabulary as strings.)rU   r[   s    r   vocabzSentencePieceTokenizer.vocab   s
    {r   c                 4    | j                                         S )z#The beginning of sentence token id.)rE   bos_idr[   s    r   rm   zSentencePieceTokenizer.bos_id        {!!###r   c                 4    | j                                         S )zThe end of sentence token id.)rE   eos_idr[   s    r   rp   zSentencePieceTokenizer.eos_id   rn   r   tokenc                 l   t          |t          t          j        f          r'| j                            t          |                    S t          |t                    r4| j                            |          }| j                            |          S t          dt          |          j
                   )z7Return `True` if the passed `token` is a special token.zExpected int or str, got )r%   intnpintegerrE   	IsControlr&   ra   	TypeErrortyperL   )rH   rq   	token_ints      r   
is_specialz!SentencePieceTokenizer.is_special   s    ec2:.// 	P;((U444s## 	P//66I;((333NU8LNNOOOr   c                 x     t          j        dt                      fdt           j                  D             S )NzB`_control_tokens` is deprecated. Make use of `is_special` instead.c                 H    h | ]}j                             |          |S r   )rE   rv   r   tokrH   s     r   	<setcomp>z9SentencePieceTokenizer._control_tokens.<locals>.<setcomp>   s.    QQQdk6K6KC6P6PQQQQr   )re   rf   rg   rS   rT   r[   s   `r   _control_tokensz&SentencePieceTokenizer._control_tokens   s;    Z\ijjjQQQQuT\22QQQQr   boseosc                     t          |t                    sJ | j                            |          }|r
| j        g|}|rg || j        }|S )a  Encode the given string into a list of token ids.

        Args:
            s: The string to encode.
            bos: Whether to add the beginning of sentence token.
            eos: Whether to add the end of sentence token.

        Returns:
            The list of token ids.
        )r%   r&   rE   encoderm   rp   )rH   r_   r   r   ts        r   r   zSentencePieceTokenizer.encode   sd     !S!!!!!{))!,, 	"!q!A 	"!!!T[!Ar   tokensspecial_token_policyc                 T   |5t          |t                    s t          dt          |           d          |&t	          j        dt                     t          j        }|t          j        t          j	        fv r| 
                    ||          S | j                            |          S )aa  Decode the given list of token ids into a string.

        Note:
            Using `special_token_policy=SpecialTokenPolicy.KEEP` will keep the special tokens and the normal tokens as
            SentencePiece pieces.

        Args:
            tokens: The list of token ids.
            special_token_policy: The policy to use for special tokens. If `None`, the default policy
                is `SpecialTokenPolicy.IGNORE`.  Passing `None` is deprecated and will be changed
                to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        NzFExpected `special_token_policy` to be None or SpecialTokenPolicy, got r1   zUsing the tokenizer's special token policy `None` is deprecated. It will be removed in 1.10.0. Please pass a special token policy explicitly. Future default will be SpecialTokenPolicy.IGNORE.)r%   r   
ValueErrorrx   re   rf   rg   IGNOREKEEPRAISE_decode_with_special_tokensrE   decode)rH   r   r   s      r   r   zSentencePieceTokenizer.decode   s       +J?SUg4h4h+vY]^rYsYsvvv    'MH
    $6#< $6$;=O=U#VVV33F<PQQQ{!!&)))r   token_idc                 6    | j                             |          S )z,Convert the given token id to a token piece.rD   )rH   r   s     r   rF   z"SentencePieceTokenizer.id_to_piece   s    {&&x000r   c                     g }g }|D ]}                      |          rm|t          j        k    rt          d          |r#|                     fd|D                        g }|                                         |                     |                    |           |r!|                     fd|D                        d                    |          S )NzNDecoding `tokens` that contain special tokens with special_token_policy=RAISE.c                 :    g | ]}                     |          S r   rF   r}   s     r   r   zFSentencePieceTokenizer._decode_with_special_tokens.<locals>.<listcomp>   s'    %S%S%Sd&6&6s&;&;%S%S%Sr   c                 :    g | ]}                     |          S r   r   r}   s     r   r   zFSentencePieceTokenizer._decode_with_special_tokens.<locals>.<listcomp>   s'    KKKd..s33KKKr   r   )rz   r   r   r   extendappendrF   join)rH   r   r   	text_listcurr_tokensr~   s   `     r   r   z2SentencePieceTokenizer._decode_with_special_tokens   s   	!# 	( 	(Cs## 
('+=+CCC$%uvvv %$$%S%S%S%S{%S%S%STTT"$K  !1!1#!6!67777 ""3'''' 	MKKKK{KKKLLLwwy!!!r   c                 `    t          j        dt                     |                     |          S )z[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

        Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

        This is a convenient method for debugging.
        z`to_string` is deprecated and will be removed in 1.10.0. Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.)re   rf   rg   
_to_stringrH   r   s     r   	to_stringz SentencePieceTokenizer.to_string   s7     	\ 	
 	
 	
 v&&&r   c                 D    |                      |t          j                  S )N)r   )r   r   r   r   s     r   r   z!SentencePieceTokenizer._to_string  s    {{68J8O{PPPr   c                 4    | j                                         S )zThe padding token id.)rE   pad_idr[   s    r   r   zSentencePieceTokenizer.pad_id  rn   r   c                 4    | j                                         S )zThe unknown token id.)rE   unk_idr[   s    r   r   zSentencePieceTokenizer.unk_id  rn   r   r    )$rL   
__module____qualname____doc__r&   r   r
   rY   propertyr\   r^   rs   rc   rh   rT   r'   rk   r   rm   rp   rt   ru   boolrz   setr   r   r   r   rF   r   r   r   r   r   __classcell__)rK   s   @r   r>   r>   S   sT       NN 3: BRUYBY ei      0 4    X )    X*3 *3 * * * *)3 )3 ) ) ) ) ( ( ( ( X(tCy     $ $ $ $ _$ $ $ $ $ _$Pbj 03 6 P4 P P P P RS R R R _R $ T d3i    &$* $*T#Y $*>PSW>W $*cf $* $* $* $*L1C 1C 1 1 1 1"$s) "Se "jm " " " "*'S	 'c ' ' ' ' Qc Qs Q Q Q Q $ $ $ $ X$ $ $ $ $ X$ $ $ $ $r   r>   )F)rI   rN   re   	functoolsr   pathlibr   numpyrt   mistral_common.exceptionsr   mistral_common.importsr   r   %mistral_common.tokens.tokenizers.baser   r	   r
   &mistral_common.tokens.tokenizers.imager   r   filterwarningsrg   sentencepiecer   r&   r   r-   r8   r<   r>   r   r   r   <module>r      s     				  % % % % % %           8 8 8 8 8 8 ] ] ] ] ] ] ] ]         
 R Q Q Q Q Q Q Q  4   
  2     5444444	U3: 	U$ 	U 	U 	U 	U* *d
 *d *Wg * * * **5t 5d8J 5 5 5 5 |$ |$ |$ |$ |$Y |$ |$ |$ |$ |$r   