
    *`i5              	          d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlZd dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$  G d de%e          Z& G d de%e          Z' G d de(e          Z) G d de%e          Z* G d de          Z+ G d de          Z, e
de          Z- e
de          Z. e
d e+          Z/ G d! d"e	e-e.e/ef                   Z0dS )#    N)ABCabstractmethod)Enum)Path)GenericTypeVar)
ConfigDictField)Audio)MistralBase)
FIMRequest)UserContentChunk)AssistantMessageTypeUserMessage)InstructRequest)Tool)TranscriptionRequest)AudioEncoder)ImageEncoderc                       e Zd ZdZdZdZdS )UserMessagePositionzWhere to encode available toolsfirstlastN)__name__
__module____qualname____doc__r   r        y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/mistral_common/tokens/tokenizers/base.pyr   r      s        ))EDDDr   r   c                       e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZ dS ) SpecialTokensa(  Enum of special tokens used in the tokenizer.

    Attributes:
        unk: The unknown token.
        bos: The beginning of string token.
        eos: The end of string token.
        begin_inst: The beginning of instruction token.
        end_inst: The end of instruction token.
        begin_tools: The beginning of tools token.
        end_tools: The end of tools token.
        begin_tool_results: The beginning of tool results token.
        end_tool_results: The end of tool results token.
        tool_calls: The tool calls token.
        img: The image token.
        pad: The pad token.
        img_break: The image break token.
        img_end: The image end token.
        prefix: The prefix token for FIM.
        middle: The middle token for FIM.
        suffix: The suffix token for FIM.
        begin_system: The beginning of system prompt token.
        end_system: The end of system prompt token.
        begin_tool_content: The beginning of tool content token.
        args: The args token.
        call_id: The call id token.
        audio: The audio token.
        begin_audio: The beginning of audio token.
        transcribe: The transcribe token.
        begin_think: The beginning of think token.
        end_think: The end of think token.

    Examples:
        >>> unk = SpecialTokens.unk
    z<unk>z<s>z</s>z[INST]z[/INST]z[AVAILABLE_TOOLS]z[/AVAILABLE_TOOLS]z[TOOL_RESULTS]z[/TOOL_RESULTS]z[TOOL_CALLS]z[IMG]z<pad>z[IMG_BREAK]z	[IMG_END]z[PREFIX]z[MIDDLE]z[SUFFIX]z[SYSTEM_PROMPT]z[/SYSTEM_PROMPT]z[TOOL_CONTENT]z[ARGS]z	[CALL_ID]z[AUDIO]z[BEGIN_AUDIO]z[TRANSCRIBE]z[THINK]z[/THINK]z[STREAMING_PAD]z[STREAMING_WORD]N)!r   r   r   r   unkboseos
begin_instend_instbegin_tools	end_toolsbegin_tool_resultsend_tool_results
tool_callsimgpad	img_breakimg_endprefixmiddlesuffixbegin_system
end_systembegin_tool_contentargscall_idaudiobegin_audio
transcribebegin_think	end_thinkstreaming_padstreaming_wordr   r   r    r"   r"       s        ! !F C
C
CJH%K$I)(J
C
CIGFFF$L#J)DGE!KJKI%M'NNNr   r"   c                       e Zd ZdZdZdZdZdS )SpecialTokenPolicyzWhat to do with special tokens when encoding/decoding.

    Attributes:
        IGNORE: Ignore special tokens.
        KEEP: Keep special tokens.
        RAISE: Raise an error if special tokens are found.
    r         N)r   r   r   r   IGNOREKEEPRAISEr   r   r    rA   rA   c   s)          FDEEEr   rA   c                       e Zd ZdZdedd fdZedefd            Zddde	fdZ
ddde	fd	Zddde	fd
Zddde	fdZdZdZdZdZdZdZdS )TokenizerVersiona  Enum of tokenizer versions.

    Allow to distinguish between different versions of the tokenizer and maintain backward compatibility.

    Attributes:
        v1: The first version of the tokenizer.
        v2: The second version of the tokenizer that includes special control tokens [INST], [\INST].
        v3: The third version of the tokenizer that includes improved function calling.
        v7: The seventh version of the tokenizer that includes improved system prompt and function calling.
        v11: The eleventh version of the tokenizer that includes improved function calling.
        v13: The thirteenth version of the tokenizer that includes no call id tokenization and better prompt caching.

    Examples:
        >>> version = TokenizerVersion.v1
    valuereturnc                     t          j        d|          st          d| d          t                              | |          }||_        |S )Nz^v\d+$zInvalid version format: z#. Must be 'v' followed by a number.)rematch
ValueErrorstr__new___value_)clsrI   objs      r    rP   zTokenizerVersion.__new__   sP    x	5)) 	dbbbbccckk#u%%
r   c                 :    t          | j        dd                    S )NrB   )intrI   selfs    r    _version_numzTokenizerVersion._version_num   s    4:abb>"""r   otherzstr | TokenizerVersionc                 j    t          |t                    rt          |          }| j        |j        k     S N
isinstancerO   rH   rX   rW   rY   s     r    __lt__zTokenizerVersion.__lt__   s2    eS!! 	,$U++E 5#555r   c                 n    t          |t                    rt          |          }| j        |j        k    S d S r[   r\   r^   s     r    __le__zTokenizerVersion.__le__   <    eS!! 	;$U++E$(:::	; 	;r   c                 n    t          |t                    rt          |          }| j        |j        k    S d S r[   r\   r^   s     r    __gt__zTokenizerVersion.__gt__   s<    eS!! 	:$U++E$u'999	: 	:r   c                 n    t          |t                    rt          |          }| j        |j        k    S d S r[   r\   r^   s     r    __ge__zTokenizerVersion.__ge__   rb   r   v1v2v3v7v11v13N)r   r   r   r   rO   rP   propertyrU   rX   boolr_   ra   rd   rf   rg   rh   ri   rj   rk   rl   r   r   r    rH   rH   q   s         C $6     #c # # # X#64 6 6 6 6 6
;4 ; ; ; ; ;
:4 : : : : :
;4 ; ; ; ; ;
 
B	B	B	B
C
CCCr   rH   c                       e Zd ZU dZ ed          Zee         ed<   dZ	e
dz  ed<   dZee         dz  ed<    ee          Zeej                 ed	<    ee          Zee         ed
<   dS )	Tokenizeda  A tokenized [`InstructRequest`][mistral_common.tokens.instruct.request].

    Attributes:
        tokens: The token ids.
        text: The text representation of the tokens.
        prefix_ids: The prefix ids for FIM.
        images: The loaded images associated with the tokens.

    Examples:
        >>> tokenized = Tokenized(tokens=[1, 2, 3], text="Hello world", prefix_ids=[1], images=[])
    T)arbitrary_types_allowedtokensNtext
prefix_ids)default_factoryimagesaudios)r   r   r   r   r	   model_configlistrU   __annotations__rs   rO   rt   r
   rv   npndarrayrw   r   r   r   r    rp   rp      s         
 
 :d;;;LID#*#'JS	D '''$uT:::FD:::%555FDK55555r   rp   c            
          e Zd Zeedefd                        Zedee         fd            Z	ededefd            Z
eedefd                        Zeedefd                        Zeedefd                        Zeedefd	                        Zed
edededee         fd            Zeddee         dedz  defd            Zed
edefd            Zedeej        z  ez  defd            Zeedefd                        Zedee         defd            Zedee         defd            Zeedefd                        ZdS )	TokenizerrJ   c                     dS )z!Vocabulary size of the tokenizer.Nr   rV   s    r    n_wordszTokenizer.n_words         r   c                     dS )z(All tokens in the vocabulary as strings.Nr   rV   s    r    vocabzTokenizer.vocab   r   r   token_idc                     dS )z$Convert a token id to the token str.Nr   )rW   r   s     r    id_to_piecezTokenizer.id_to_piece   r   r   c                     dS )z$id of the Beginning of String token.Nr   rV   s    r    bos_idzTokenizer.bos_id   r   r   c                     dS )zid of the End of String token.Nr   rV   s    r    eos_idzTokenizer.eos_id   r   r   c                     dS )zid of the Pad token.Nr   rV   s    r    pad_idzTokenizer.pad_id   r   r   c                     dS )zid of the Unk token.Nr   rV   s    r    unk_idzTokenizer.unk_id   r   r   sr$   r%   c                     dS )z(Convert a string to a list of token ids.Nr   )rW   r   r$   r%   s       r    encodezTokenizer.encode   r   r   Nrr   special_token_policyc                     dS )a  Decode the token ids to a string.

        Args:
            tokens: The token ids to decode.
            special_token_policy: The policy to use for special tokens.
                Passing `None` will default to `self._special_token_policy` for
                [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
                for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
                Note that passing `None` will be deprecated and `special_token_policy` will default to
                `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        Nr   rW   rr   r   s      r    decodezTokenizer.decode   r   r   c                     dS )zGet the id of a control token.Nr   )rW   r   s     r    get_special_tokenzTokenizer.get_special_token   r   r   tokenc                     dS )z2Check if token id or token str is a special token.Nr   )rW   r   s     r    
is_specialzTokenizer.is_special   r   r   c                     dS )z!Get the version of the tokenizer.Nr   rV   s    r    versionzTokenizer.version   r   r   c                     dS )z[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

        Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

        This is a convenient method for debugging.
        Nr   rW   rr   s     r    	to_stringzTokenizer.to_string  s	     	r   c                     d S r[   r   r   s     r    
_to_stringzTokenizer._to_string      47Cr   c                     dS )zThe file path of the tokenizer.Nr   rV   s    r    	file_pathzTokenizer.file_path  s	     	r   r[   )r   r   r   rm   r   rU   r   ry   rO   r   r   r   r   r   r   rn   r   rA   r   r   r{   integerr   rH   r   r   r   r   r   r   r   r    r~   r~      s       1 1 1 1 ^ X1 8tCy 8 8 8 ^8 4C 4C 4 4 4 ^4 4 4 4 4 ^ X4 . . . . ^ X. $ $ $ $ ^ X$ $ $ $ $ ^ X$ 7 7$ 7T 7d3i 7 7 7 ^7  T#Y >PSW>W cf    ^  .3 .3 . . . ^. Bbj 03 6 B4 B B B ^B 1) 1 1 1 ^ X1 S	 c    ^ 7c7s777 ^74    ^ X  r   r~   InstructRequestType)boundFIMRequestTypeTokenizedTypec                   B   e Zd ZU dZeed<   edz  ed<   edz  ed<   dededz  dedz  ddfdZe	de
defd	            Ze	dedefd
            Ze	ddee         dedz  defd            Ze	dedefd            Ze		 	 ddedee         dz  dedededz  dedeee         eej                 ee         f         fd            Ze		 	 ddeee         z  dededz  dedeee         eej                 ee         f         f
d            Ze	dee         defd            Z dS )InstructTokenizerzBase class for instruct tokenizers.

    Attributes:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use if any.
    	tokenizerNimage_encoderaudio_encoderrJ   c                     dS )zInitialize the instruct tokenizer.

        Args:
            tokenizer: The tokenizer to use.
            image_encoder: The image encoder to use if any.
            audio_encoder: The audio encoder to use if any.
        Nr   )rW   r   r   r   s       r    __init__zInstructTokenizer.__init__'  r   r   requestc                     dS )zInstruct request to Tokenized object

        Args:
            request: The instruct request to encode.

        Returns:
            The tokenized instruct request.
        Nr   rW   r   s     r    encode_instructz!InstructTokenizer.encode_instruct2  r   r   c                     dS )a  
        Encodes an audio transcription request into a tokenized format.

        This method processes a transcription request containing audio data,
        encodes the user message, and returns the tokenized output.

        Args:
            request: The transcription request object containing
                the audio data to be encoded.

        Returns:
            Tokenized: The tokenized representation of the audio data, including processed audio and tokens
        Nr   r   s     r    encode_transcriptionz&InstructTokenizer.encode_transcription=  s	     	r   rr   r   c                     dS )a  Convert token ids to string

        Args:
            tokens: The token ids to decode.
            special_token_policy: The policy to use for special tokens.
                Passing `None` will default to `self._special_token_policy` for
                [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
                for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
                Note that passing `None` will be deprecated and `special_token_policy` will default to
                `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        Nr   r   s      r    r   zInstructTokenizer.decodeN  r   r   c                     dS )zFIM request to Tokenized object

        Args:
            request: The FIM request to encode.

        Returns:
            The tokenized FIM request.
        Nr   r   s     r    
encode_fimzInstructTokenizer.encode_fim_  r   r   Fmessageavailable_toolsis_lastis_firstsystem_promptforce_img_firstc                     dS )a  Encode a user message.

        Args:
            message: The user message to encode.
            available_tools: The available tools.
            is_last: Whether the message is the last one.
            is_first: Whether the message is the first one.
            system_prompt: The system prompt.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and images.
        Nr   )rW   r   r   r   r   r   r   s          r    encode_user_messagez%InstructTokenizer.encode_user_messagej  s	    . 	r   contentc                     dS )aI  Encode a user content.

        Args:
            content: The user content to encode.
            is_last: Whether the content is the last one.
            system_prompt: The system prompt.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and images.
        Nr   )rW   r   r   r   r   s        r    encode_user_contentz%InstructTokenizer.encode_user_content  s	    & 	r   c                     d S r[   r   r   s     r    r   zInstructTokenizer._to_string  r   r   r[   )NF)!r   r   r   r   r~   rz   r   r   r   r   r   r   r   r   r   ry   rU   rA   rO   r   r   r   r   r   rn   tupler{   r|   r   r   r   r   r   r   r   r    r   r     s          $&&&&$&&&&	"	3?$3F	WcfjWj			 	 	 	 ': }    ^ ,@ ]    ^   T#Y >PSW>W cf    ^  . ]    ^  %) %  dd* 	
  Tz  
tCy$rz*DK7	8   ^0 
 %) % t,--  Tz	
  
tCy$rz*DK7	8   ^( 7c7s777 ^777r   r   )1rL   abcr   r   enumr   pathlibr   typingr   r   numpyr{   pydanticr	   r
   mistral_common.audior   mistral_common.baser   #mistral_common.protocol.fim.requestr   &mistral_common.protocol.instruct.chunkr   )mistral_common.protocol.instruct.messagesr   r   (mistral_common.protocol.instruct.requestr   +mistral_common.protocol.instruct.tool_callsr   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   &mistral_common.tokens.tokenizers.imager   rO   r   r"   rU   rA   rH   rp   r~   r   r   r   r   r   r   r    <module>r      s   				 # # # # # # # #             # # # # # # # #     & & & & & & & & & & & & & & + + + + + + : : : : : : C C C C C C        E D D D D D < < < < < < N N N N N N ? ? ? ? ? ? ? ? ? ? ? ?    #t   @( @( @( @( @(C @( @( @(F    d   5 5 5 5 5sD 5 5 5p6 6 6 6 6 6 6 6*U U U U U U U Up g3?KKK )<<<y999~8 ~8 ~8 ~8 ~8 3^]Th hi ~8 ~8 ~8 ~8 ~8r   