
    *`iH                     ,   U d dl Z d dlmZ d dlmZmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZmZ d dlmZ d d	lmZmZmZmZmZ d d
lmZ d dlmZm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= de+de:e6z  de,fdZ>dede:de fdZ? G d deeeeee(f                   Z@i dd de@jA        dd de@jB        d e@jA        d!e@jB        d"e@jA        d#d$ d%e@jC        d&e@jA        d'e@jB        d(e@jD        d)d* d+e@jB        d,e@jB        d-d. d/e@jC        e@jC        e@jC        e@jA        e@jA        e@jB        e@jB        e@jB        e@jB        d0 d1 d2 d3ZEeFeGeg e@f         f         eHd4<   dS )5    N)Path)AnyCallableGeneric)TokenizerException)
FIMRequest)UATSAssistantMessageTypeSystemMessageTypeToolMessageTypeUserMessageType)InstructRequestNormalizer normalizer_for_tokenizer_version)ChatCompletionRequest)MistralRequestValidatorMistralRequestValidatorV3MistralRequestValidatorV5MistralRequestValidatorV13ValidationMode)TranscriptionRequest)AudioConfigAudioEncoderSpecialAudioIDs)InstructRequestInstructRequestTypeInstructTokenizerSpecialTokenPolicySpecialTokensTokenizedTypeTokenizerVersion)ImageConfigImageEncoderSpecialImageIDs)InstructTokenizerV1InstructTokenizerV2InstructTokenizerV3InstructTokenizerV7InstructTokenizerV11InstructTokenizerV13)SentencePieceTokenizerget_image_configis_sentencepiece)
Tekkenizer	is_tekken)download_tokenizer_from_hf_hubimage_config	tokenizerreturnc                    t          |                    t          j        j                  |                    t          j        j                  |                    t          j        j                            }t          | |          S )zLoad a image encoder from a config and a tokenizer.

    Args:
        image_config: The image config.
        tokenizer: The tokenizer.

    Returns:
        The image encoder.
    )img	img_breakimg_end)r#   get_special_tokenr   r4   valuer5   r6   r"   )r0   r1   special_idss      |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/mistral_common/tokens/tokenizers/mistral.pyload_image_encoderr;   :   so     "''(9(?@@--m.E.KLL++M,A,GHH  K
 k222    audio_configc                    dt           dt          dz  ffd}t           |t          j        j                   |t          j        j                   |t          j        j                            }t          | |          S )zLoad a audio encoder from a config and a tokenizer.

    Args:
        audio_config: The audio config.
        tokenizer: The tokenizer.

    Returns:
        The audio encoder.
    tokenr2   Nc                 \                         |           sd S                     |           S N)
is_specialr7   )r?   r1   s    r:   get_special_token_or_nonez5load_audio_encoder.<locals>.get_special_token_or_noneW   s2    ##E** 	4**5111r<   )audiobegin_audiostreaming_pad)	strintr   r   rD   r8   rE   rF   r   )r=   r1   rC   r9   s    `  r:   load_audio_encoderrI   L   s    2 2t 2 2 2 2 2 2 "''(;(ABB--m.G.MNN//0K0QRR  K
 k222r<   c                   l   e Zd ZdZdeeeeef         de	e
eeef         dee
eeeef         fdZdeeeedf         f         fdZedefd	            Zed'd
            Zed'd            Zed(dededd fd            Zed)dedd fd            Zed)dededd fd            Zeddddej         fdedeez  dz  dedz  dedededd fd            Z!eej         fdeez  dedd fd            Z"	 d*de#e$         de%dz  defd Z&de'defd!Z(dedefd"Z)d*d#e*e%         d$e+dz  defd%Z,d#e*e%         defd&Z-dS )+MistralTokenizerag  Mistral tokenizer.

    This class is a wrapper around a [InstructTokenizer][mistral_common.tokens.tokenizers.base.InstructTokenizer],
    a [MistralRequestValidator][mistral_common.protocol.instruct.validator.MistralRequestValidator] and a
    [InstructRequestNormalizer][mistral_common.protocol.instruct.normalize.InstructRequestNormalizer].

    It provides a convenient interface to tokenize, validate ad normalize Mistral requests.

    Attributes:
        instruct_tokenizer: The instruct tokenizer to use. See
            [InstructTokenizer][mistral_common.tokens.tokenizers.instruct.InstructTokenizer].
    instruct_tokenizer	validatorrequest_normalizerc                 0    || _         || _        || _        dS )zInitializes a `MistralTokenizer`.

        Args:
            instruct_tokenizer: The instruct tokenizer to use.
            validator: The request validator to use.
            request_normalizer: The request normalizer to use.
        N)"_chat_completion_request_validator_instruct_request_normalizerrL   )selfrL   rM   rN   s       r:   __init__zMistralTokenizer.__init__u   s%     3</,>) 	r<   r2   .c                 T    t           j        | j        j        j        | j        j        ffS )z
        Provides a recipe for pickling (serializing) this object, which is necessary for use with multiprocessing.

        Returns:
            A tuple of the factory function and the arguments to reconstruct the object from its source file.
        )rK   	from_filerL   r1   	file_pathrP   _mode)rR   s    r:   
__reduce__zMistralTokenizer.__reduce__   s.      )#-739,
 
 	
r<   c                 F    t          t                    j        d         dz  S )N   data)r   __file__parentsclss    r:   
_data_pathzMistralTokenizer._data_path   s    H~~%a(611r<   c                     |                      t          |                                 dz            t          j                  S )zGet the Mistral tokenizer v1.ztokenizer.model.v1moderU   rG   r`   r   testr^   s    r:   v1zMistralTokenizer.v1   s6     }}S!1!14H!HIIP^Pc}dddr<   c                     |                      t          |                                 dz            t          j                  S )zGet the Mistral tokenizer v2.z*mistral_instruct_tokenizer_240216.model.v2rb   rd   r^   s    r:   v2zMistralTokenizer.v2   s@     }}  #OOPPWeWj  
 
 	
r<   Fr.   is_mmc                     |r|rd}n|r|sd}n|s|rt          d          d}|                     t          |                                 |z            t          j                  S )a;  Get the Mistral tokenizer v3.

        Args:
            is_tekken: Whether the tokenizer is a tekken tokenizer. See
                [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer].
            is_mm: Whether to load image tokenizer.

        Returns:
            The Mistral tokenizer v3.
        ztekken_240911.jsonztekken_240718.jsonz;Multimodal tokenizer is currently only supported for tekkenz*mistral_instruct_tokenizer_240323.model.v3rb   )
ValueErrorrU   rG   r`   r   re   )r_   r.   ri   tokenizer_names       r:   v3zMistralTokenizer.v3   s      	J 	J1NN 	Ju 	J1NN 	Ju 	JZ[[[IN}}S!1!1N!BCC.J]}^^^r<   c                    |rC|                      t          |                                 dz            t          j                  S |                      t          |                                 dz            t          j                  S )zGet the Mistral tokenizer v7.

        Args:
            is_mm: Whether to load the image tokenizer.

        Returns:
            The Mistral tokenizer v7.
        z,mistral_instruct_tokenizer_241114.model.v7m1rb   z*mistral_instruct_tokenizer_241114.model.v7rd   )r_   ri   s     r:   v7zMistralTokenizer.v7   s      	==CNN$$'UUVV]k]p !    ==CNN$$'SSTT[i[n !   r<   modelstrictc                    |s[t          j        dt                     t                                          D ]'\  }}||                                v r |            c S (|t          vrt          d|           t          |                     S )ax  Get the Mistral tokenizer for a given model.

        Args:
            model: The model name.
            strict: Whether to use strict model name matching. If `False`, the model name is matched as a substring.
                This is deprecated and will be removed in `mistral_common=1.10.0`.

        Returns:
            The Mistral tokenizer for the given model.
        a  Calling `MistralTokenizer.from_model(..., strict=False)` is deprecated as it can lead to incorrect tokenizers. It is strongly recommended to use MistralTokenizer.from_model(..., strict=True)` which will become the default in `mistral_common=1.10.0`.If you are using `mistral_common` for open-sourced model weights, we recommend using `MistralTokenizer.from_file('<path/to/tokenizer/file>')` instead.zUnrecognized model: )warningswarnFutureWarningMODEL_NAME_TO_TOKENIZER_CLSitemslowerr   )r_   rp   rq   
model_nametokenizer_clss        r:   
from_modelzMistralTokenizer.from_model   s      	+MT
    .I-N-N-P-P + +)
M..(=??*** / 333$%CE%C%CDDD*51333r<   Nrepo_idr?   revisionforce_downloadlocal_files_onlyrc   c                 b    t          | ||||          }t                              ||          S )aO  Download the Mistral tokenizer for a given Hugging Face repository ID.

        See [here](https://huggingface.co/mistralai/models) for a list of our OSS models.

        Args:
            repo_id: The Hugging Face repo ID.
            token: The Hugging Face token to use to download the tokenizer.
            revision: The revision of the model to use. If `None`, the latest revision will be used.
            mode: The validation mode to use.
            force_download: Whether to force the download of the tokenizer. If `True`, the tokenizer will be downloaded
                even if it is already cached.
            local_files_only: Whether to only use local files. If `True`, the tokenizer will be downloaded only if it is
                already cached.

        Returns:
            The Mistral tokenizer for the given model.
        )r|   r?   r}   r~   r   rb   )r/   rK   rU   )r|   r?   r}   r~   r   rc   tokenizer_paths          r:   from_hf_hubzMistralTokenizer.from_hf_hub   sB    4 8)-
 
 
  )).t)DDDr<   tokenizer_filenamec                 j   t          |          r#t          j        |          }|j        }|j        }nBt          |          r!t          |          }t          |          }d}nt          d|           |t          ||          nd}d}|/t          |t                    s
J d            t          ||          }t          |j                  }|j        t          j        k    rE|
J d            |
J d            t!          t#          |          t%          |          |          S |j        t          j        k    rE|
J d            |
J d            t!          t)          |          t%          |          |          S |j        t          j        k    r;|
J d            t!          t-          ||          t/          |          |          S |j        t          j        k    r0t!          t3          |||	          t5          |          |          S |j        t          j        k    r0t!          t9          |||	          t5          |          |          S |j        t          j        k    r0t!          t=          |||	          t?          |          |          S t          d
|           )zLoads a tokenizer from a file.

        Args:
            tokenizer_filename: The path to the tokenizer file.
            mode: The validation mode to use.

        Returns:
            The loaded tokenizer.
        NzUnrecognized tokenizer file: z-Audio is only supported for tekken tokenizersz#Tokenizer version needs to be >= v3z#Tokenizer version needs to be >= v7rb   )rM   rN   )image_encoder)r   audio_encoderz!Unrecognized tokenizer filename: ) r.   r-   rU   imagerD   r,   r*   r+   r   r;   
isinstancerI   r   versionr    rf   rK   r$   r   rh   r%   rm   r&   r   ro   r'   r   v11r(   v13r)   r   )	r_   r   rc   r1   r0   r=   r   r   rN   s	            r:   rU   zMistralTokenizer.from_file  s1   " '(( 
	[",-?@@I$?L$?LL011 	[./ABBI+,>??LLL$%YEW%Y%YZZZGSG_*<CCCei#i44ee6eee4.|YGGM=i>OPP 0 333 ((*O((( ((*O(((##I..1t<<<#5   
 "2"555 ((*O((( ((*O(((##I..1t<<<#5   
 "2"555 ((*O(((##I]KKK3>>>#5   
 "2"555##I]Zghhh3>>>#5   
 "2"666#$Ym[hiii3>>>#5   
 "2"666#$Ym[hiii4$???#5    !!YEW!Y!YZZZr<   requestmax_model_input_lenc                     | j                             |          }||j        rt          d          | j                            |          }|j        r||_        | j                            |          S )aD  Encodes a chat completion request.

        Args:
            request: The chat completion request to encode.
            max_model_input_len: The maximum length of the input to the model.
                If `None`, the input will not be truncated.

        Returns:
            The encoded chat completion request.
        NzUencoding a chat completion request with truncation, but no max model len was provided)	rP   validate_requesttruncate_for_context_lengthr   rQ   from_chat_completion_requesttruncate_at_max_tokensrL   encode_instruct)rR   r   r   validated_requestinstruct_requests        r:   encode_chat_completionz'MistralTokenizer.encode_chat_completionf  s     !CTTU\]]&7+N& %g    <YYZkll. 	J6I3&667GHHHr<   c                 6    | j                             |          S )zEncodes a transcription request.

        Args:
            request: The transcription request to encode.

        Returns:
            The encoded transcription request.
        )rL   encode_transcriptionrR   r   s     r:   r   z%MistralTokenizer.encode_transcription  s     &;;GDDDr<   c                 6    | j                             |          S )zEncodes a fill in the middle request.

        Args:
            request: The fill in the middle request to encode.

        Returns:
            The encoded fill in the middle request.
        )rL   
encode_fimr   s     r:   r   zMistralTokenizer.encode_fim  s     &11':::r<   tokensspecial_token_policyc                 :    | j                             ||          S )a_  Decodes a list of tokens into a string.

        Args:
            tokens: The tokens to decode.
            special_token_policy: The policy to use for special tokens. Passing `None` is deprecated and will be changed
                to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        )r   )rL   decode)rR   r   r   s      r:   r   zMistralTokenizer.decode  s!     &--fK_-```r<   c                 6    | j                             |          S rA   )rL   
_to_string)rR   r   s     r:   r   zMistralTokenizer._to_string  s    &11&999r<   )r2   rK   )FF)FrA   ).__name__
__module____qualname____doc__r   r   r   r   r
   r   r   r   r   r   r   rS   tupler   r   rX   classmethodr   r`   rf   rh   boolrm   ro   rG   r{   staticmethodr   re   r   rU   r   r	   rH   r   r   r   r   listr   r   r    r<   r:   rK   rK   e   s        
-oz=Zn.no
 +?<PRact+tu
 61?DUWjj
	
 
 
 
*

E(E#s(O";< 

 

 

 

 24 2 2 2 [2 e e e [e 
 
 
 [
 _ _4 _ _AS _ _ _ [_,  t (:    [$ 4 4s 4D 4=O 4 4 4 [4@  $(#$!&-2 E  E EczD  E * E 	 E
  E  E 
 E  E  E \ ED   .2O[ O[$JO[ O[ 
	O[ O[ O[ [O[d W[I I,T2IILtI	I I I I>	E,@ 	E] 	E 	E 	E 	E	;* 	; 	; 	; 	; 	;a aT#Y a>PSW>W acf a a a a:c :s : : : : : :r<   rK   zministral-8b-2410c                  8    t                               d          S NT)r.   rK   rm   r   r<   r:   <lambda>r     s    !1!4!4t!4!D!D r<   zmistral-tiny-2312zopen-mistral-nemo-2407c                  8    t                               d          S r   r   r   r<   r:   r   r     s    &6&9&9D&9&I&I r<   zmistral-tiny-2407zmistral-small-2312zopen-mixtral-8x22b-2404zmistral-small-2402zmistral-small-2409c                  8    t                               d          S r   r   r   r<   r:   r   r     s    "2"5"5"5"E"E r<   zmistral-medium-2312zmistral-large-2402zmistral-large-2407zmistral-large-2411zpixtral-large-2411c                  8    t                               d          S NT)ri   rK   ro   r   r<   r:   r   r     s    "2"5"5D"5"A"A r<   zcodestral-2405zcodestral-mamba-2407zpixtral-12b-2409c                  :    t                               dd          S NT)r.   ri   r   r   r<   r:   r   r     s     0 3 3d$ 3 O O r<   zopen-mistral-7bc                  8    t                               d          S r   r   r   r<   r:   r   r     s    ,//$/?? r<   c                  :    t                               dd          S r   r   r   r<   r:   r   r     s    '**T*FF r<   c                  8    t                               d          S r   r   r   r<   r:   r   r     s    -00t0<< r<   )zopen-mixtral-8x7bzmistral-embedzmistral-small-v1zmistral-large-v1zmistral-smallzmistral-largezopen-mixtral-8x22bzcodestral-22bzmistral-nemopixtralzpixtral-largerv   )Irs   pathlibr   typingr   r   r   mistral_common.exceptionsr   #mistral_common.protocol.fim.requestr   )mistral_common.protocol.instruct.messagesr	   r
   r   r   r   *mistral_common.protocol.instruct.normalizer   r   (mistral_common.protocol.instruct.requestr   *mistral_common.protocol.instruct.validatorr   r   r   r   r   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   r   r   %mistral_common.tokens.tokenizers.baser   r   r   r   r   r   r    &mistral_common.tokens.tokenizers.imager!   r"   r#   )mistral_common.tokens.tokenizers.instructr$   r%   r&   r'   r(   r)   .mistral_common.tokens.tokenizers.sentencepiecer*   r+   r,   'mistral_common.tokens.tokenizers.tekkenr-   r.   &mistral_common.tokens.tokenizers.utilsr/   r;   rI   rK   rh   rm   rf   ro   rv   dictrG   __annotations__r   r<   r:   <module>r      s          ) ) ) ) ) ) ) ) ) )      ; : : : : :              s r r r r r r r J J J J J J              O N N N N N ] ] ] ] ] ] ] ] ] ]                          
                        
 J I I I I I I I Q Q Q Q Q Q3[ 3ZJ`=` 3eq 3 3 3 3$3[ 3Z 3L 3 3 3 32D: D: D: D: D:O1?DUWddeD: D: D:N
JDDJ),J IIJ ),	J
 *-J /2J *-J EEJ +.J *-J *-J *-J AAJ &)J ,/J  OO!J$ '*%J& *,%((+(+%(%(*-%(??FF<<;J J J T#x4D0D'E"EF     r<   