
    *`iD              
       N   d dl Z d dlmZ d dlmZmZmZmZ d dlZ	d dl
mZ d dlmZmZmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZ d dlmZmZmZm Z m!Z!m"Z" d d	l#m$Z$ d d
l%m&Z&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z;  G d de1ee0e/e5ef                   Z< G d de<ee0e/e5ef                   Z= G d de=ee0e/e5ef                   Z> G d de>ee0e/e5ef                   Z? G d de?          Z@ G d de@          ZA G d deA          ZBdS )    N)abstractmethod)AnyGenericSequenceoverload)Audio) InvalidAssistantMessageException InvalidMessageStructureExceptionInvalidRequestExceptionTokenizerException)
FIMRequest)
AudioChunkAudioURLChunkContentChunk
ImageChunkImageURLChunk	TextChunk
ThinkChunkUserContentChunk)UATSAssistantMessageAssistantMessageTypeSystemMessageToolMessageUserMessage)InstructRequest)ToolToolCall)StreamingModeTranscriptionRequest)AudioEncoderTranscriptionFormat)	FIMRequestTypeInstructRequestTypeInstructTokenizerSpecialTokenPolicySpecialTokens	TokenizedTokenizedType	TokenizerUserMessagePosition)ImageEncoder)
Tekkenizerc            
           e Zd ZdZ	 	 ddededz  dedz  f fdZededz  fd            Z	de
e         fd	Zed
edeeef         fd            Zededede
e         fd            Zedededede
e         fd            Zedede
e         fd            Zde
e
e         dz           de
e         dededdf
dZede
e         ddfd            Zd
eeef         defdZdde
e         de dz  de!fdZ"de
e         de!fdZ# xZ$S ) InstructTokenizerBasezBase instruct tokenizer.N	tokenizerimage_encoderaudio_encoderc                 x    || _         || _        || _        t                                          |||           dS )zInitialize the instruct tokenizer.

        Args:
            tokenizer: The tokenizer to use.
            image_encoder: The image encoder to use if any.
            audio_encoder: The audio encoder to use.
        N)r0   r1   r2   super__init__selfr0   r1   r2   	__class__s       }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/mistral_common/tokens/tokenizers/instruct.pyr5   zInstructTokenizerBase.__init__9   s>     #**M=AAAAA    returnc                     | j         S N)r1   r7   s    r9   
mm_encoderz InstructTokenizerBase.mm_encoderK   s    
 !!r:   c                     | j         j        gS )zReturn the start tokens.)r0   bos_idr>   s    r9   startzInstructTokenizerBase.startR   s    %&&r:   requestc                     d}d}t          t          | j                            D ]$\  }}t          |t                    r
|dk    r|}|}%||fS )zFind the first and last user message in the request.

        Args:
            request: The request to search for user messages.

        Returns:
            The index of the first and last user message.
        )list	enumeratemessages
isinstancer   )rC   last_user_idxfirst_user_idximsgs        r9   find_first_last_userz*InstructTokenizerBase.find_first_last_userV   sg     9W%56677 	" 	"FAs#{++ "!R''%&N !},,r:   messageis_before_last_user_messagec                      t          d          )zEncode a tool message.

        Raises:
            NotImplementedError: The tool message is not implemented for the base tokenizer.
        zTool message not implementedNotImplementedErrorr7   rO   rP   s      r9   encode_tool_messagez)InstructTokenizerBase.encode_tool_messagei   s     ""@AAAr:   continue_messagec                      t          d          )zEncode an assistant message.

        Raises:
            NotImplementedError: The assistant message is not implemented for the base tokenizer.
        z!Assistant message not implementedrR   )r7   rO   rP   rV   s       r9   encode_assistant_messagez.InstructTokenizerBase.encode_assistant_messager   s     ""EFFFr:   chunkc                      t          d          )zEncode a think chunk.

        Raises:
            NotImplementedError: The think chunk is not implemented for the base tokenizer.
        zThink chunk not implementedrR   r7   rY   s     r9   encode_thinkz"InstructTokenizerBase.encode_think}   s     ""?@@@r:   	tokenizedrH   
max_tokenslast_user_message_indexc                     d S r=    )r7   r]   rH   r^   r_   s        r9   _truncate_for_max_tokensz.InstructTokenizerBase._truncate_for_max_tokens   s	     	r:   c                     d S r=   ra   clsrH   s     r9   validate_messagesz'InstructTokenizerBase.validate_messages   s	     	r:   c           	      4   g }g }d}g }|                      |j                   |                     |          \  }}t          |j                  D ]\  }}	|j        r?|t          |j                  dz
  k    r$t          |	t                    st          d          t          |	t                    r\| 
                    |	|j        ||k    ||k    |j        d          \  }
}}|                    |           |                    |           nt          |	t                    r|                     |	||k               }
nt          |	t                    r\|j        o|t          |j                  dz
  k    }|                     |	||k     |          }
|t          |j                  dz
  k    r|
}nJt          |	t"                    r|                     |	          }
nt'          dt)          |	                     |                    |
           |j        "|                     ||j        |j        |           |                                 }|D ]}||                    |           t3          ||                     |t6          j                  |||	          S )
zEncode an instruct request.

        Args:
            request: The request to encode.

        Returns:
            The encoded tokens.
        N   z?Cannot continue final message if it is not an assistant messageT)system_promptforce_img_first)rV   zUnknown message type special_token_policy)tokenstext
prefix_idsimagesaudios)rf   rH   rN   rG   continue_final_messagelenrI   r   r
   r   encode_user_messageavailable_toolsri   extendr   rU   rX   r   encode_system_messager   typeappendtruncate_at_max_tokensrb   rB   r(   decoder&   KEEP)r7   rC   rp   rq   ro   tokens_listrK   rJ   msg_idxrM   
new_tokens
new_images
new_audiosrV   rm   toks                   r9   encode_instructz%InstructTokenizerBase.encode_instruct   s    $& '+
.0 	w/000 )-(A(A'(J(J%%g&677 #	+ #	+LGS.G$4 5 5 999"3(899 : 7U   #{++ N595M5M+},~-")"7$( 6N 6 62
J
 j)))j))))C-- N!55c7];RSS

C!122 N#*#A#lwRUV]VfRgRgjkRkGk !::=0CS ;  
 c'"233a777!+JC// N!77<<

()Lc)L)LMMMz****)5)) .	    	# 	#Cc"""V:L:QRR!
 
 
 	
r:   rm   rl   c                 :    | j                             ||          S )a  Decode tokens to a string.

        Args:
            tokens: The tokens to decode.
            special_token_policy: The policy to use for special tokens.
                Passing `None` will default to `self._special_token_policy` for
                [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
                for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
                Note that passing `None` will be deprecated and `special_token_policy` will default to
                `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

        Returns:
            The decoded string.
        rk   )r0   r{   )r7   rm   rl   s      r9   r{   zInstructTokenizerBase.decode   s      ~$$VBV$WWWr:   c                 6    | j                             |          S r=   )r0   
_to_string)r7   rm   s     r9   r   z InstructTokenizerBase._to_string   s    ~((000r:   NNr=   )%__name__
__module____qualname____doc__r*   r,   r!   r5   propertyr?   rF   intrB   staticmethodr   tuplerN   r   r   boolrU   r   rX   r   r\   rb   classmethodr   rf   r   r(   r   r&   strr{   r   __classcell__r8   s   @r9   r/   r/   4   s        $#
 .2-1	B BB $d*B $d*	B B B B B B$ "L4/ " " " X"'tCy ' ' ' ' -o -%S/ - - - \-$ B; BUY B^bcf^g B B B ^B G+GJNGbfG	cG G G ^G A* Ac A A A ^AS	D() +, 	
 "% 
    d     [O
 !5t!;<O
 
O
 O
 O
 O
bX XT#Y X>PSW>W Xcf X X X X"1c 1s 1 1 1 1 1 1 1 1r:   r/   c                      e Zd ZdZ	 	 ddedee         dz  dedededz  d	ed
e	ee
         eej                 ee         f         fdZded
ee
         fdZ	 	 ddeee         z  dededz  d	ed
e	ee
         eej                 ee         f         f
dZdeded
ee
         fdZdededed
ee
         fdZded
ee
         fdZded
efdZded
efdZdS )InstructTokenizerV1zrInstruct tokenizer V1.

    This tokenizer has basic for messages. It does not support tools or image inputs.
    NFrO   ru   is_lastis_firstri   rj   r;   c                     t          |j        t                    s
J d            | j        
J d            d}|r|r|dz   |j        z   }n|j        }d| d}|                     |dd	          \  }	}
}|	|
|fS )
ar  Encode a user message.

        Args:
            message: The message to encode.
            available_tools: Not used.
            is_last: Not used.
            is_first: Whether the message is the first one.
            system_prompt: The system prompt.
            force_img_first: Not used.

        Returns:
            The encoded tokens and empty list.
        "Message content must be normalizedNz(InstructTokenizerV1 cannot encode images 

z[INST] z [/INST]F)contentr   ri   )rI   r   r   r1   encode_user_content)r7   rO   ru   r   r   ri   rj   r   message_txtcurr_tokensimageaudios               r9   rt   z'InstructTokenizerV1.encode_user_message  s    , '/3//UU1UUU/!))+U))) 	& 	&#f,w>GGoG1111$($<$<[Z_os$<$t$t!UEE5((r:   c                 :    t          d| j        j                   )Nz,System message encoding not implemented for )rS   r8   r   r7   rO   s     r9   rw   z)InstructTokenizerV1.encode_system_message&  s     !"jQUQ_Qh"j"jkkkr:   r   c                     t          |t                    sJ |r
|r|dz   |z   }| j                            |dd          }|g g fS )a*  Encode a user content.

        Args:
            content: The content to encode.
            is_last: Whether the message is the last one.
            system_prompt: The system prompt.
            force_img_first: Not used.

        Returns:
            The encoded tokens and empty list.
        r   Fboseos)rI   r   r0   encode)r7   r   r   ri   rj   rm   s         r9   r   z'InstructTokenizerV1.encode_user_content)  s`    $ '3''''' 	7} 	7#f,w6G&&wEu&EEr2~r:   rP   c                      t          d          )zEncode a tool message.

        Raises:
            TokenizerException: The tool message is not implemented for this version.
        &Tools not implemented for tokenizer V1r   rT   s      r9   rU   z'InstructTokenizerV1.encode_tool_messageC  s     !!IJJJr:   rV   c                    t          |t                    s
J |            |j        't          |j                  dk    rt	          d          |r|j        rt          d          |j        rGt          |j        t                    s
J d            | j	        
                    |j        dd          }nt	          |j         d|j                   |j        s!|s|                    | j	        j                   |S )	[  Encode an assistant message.

        Args:
            message: The message to encode.
            is_before_last_user_message: Not used.
            continue_message: Whether to continue the message generation.
                Only use this if the assistant message is the last message.

        Returns:
            The encoded tokens.
        Nr   r   U`continue_message` is only supported for assistant messages that have `prefix=False`.z4Message content must be a string for tokenizer < V13Fr   z // )rI   r   
tool_callsrs   r   prefixr	   r   r   r0   r   ry   eos_idr7   rO   rP   rV   r   s        r9   rX   z,InstructTokenizerV1.encode_assistant_messageK  s    '#344==g==4)c'2D.E.E.I.I$%MNNN 	S 	S2g   _ 	Sgos33kk5kkk3.//UPU/VVKK$%Q%QW=O%Q%QRRR~ 	6&6 	6t~4555r:   rY   c                      t          d          )zEncode a think chunk.

        Raises:
            TokenizerException: The think chunk is not implemented for this version.
        z*Think not implemented for tokenizer < V13.r   r[   s     r9   r\   z InstructTokenizerV1.encode_thinki  s     !!MNNNr:   rC   c                 :    t          d| j        j                   )zEncode a FIM request.

        Raises:
           TokenizerException: The FIM request is not implemented for this version.
        zFIM not available for r   r0   versionr7   rC   s     r9   
encode_fimzInstructTokenizerV1.encode_fimq  s      !!R$.:P!R!RSSSr:   c                 :    t          d| j        j                   )Nz Transcription not available for r   r   s     r9   encode_transcriptionz(InstructTokenizerV1.encode_transcriptiony  s     !\DNDZ!\!\]]]r:   NF)r   r   r   r   r   rF   r   r   r   r   r   npndarrayr   rt   r   rw   r   r   r   rU   r   rX   r   r\   r   r(   r   r    r   ra   r:   r9   r   r      sB         %) %!) !)!) dd*!) 	!)
 !) Tz!) !) 
tCy$rz*DK7	8!) !) !) !)Fl] ltCy l l l l %) % t,--  Tz	
  
tCy$rz*DK7	8   4K; KUY K^bcf^g K K K K+JNbf	c   <O* Oc O O O OT* T T T T T^,@ ^Y ^ ^ ^ ^ ^ ^r:   r   c                       e Zd ZdZej        Z	 	 d!dededz  de	dz  f fdZ
	 	 d"ded	ee         dz  d
edededz  dedeee         eej                 ee         f         fdZdedefdZdeee         z  defdZdedeeef         fdZdededee         fdZdedeeef         fdZde dee         fdZ!de dee         fdZ"de dededee         fdZ#dedee         fdZ$de%de&fd Z' xZ(S )#InstructTokenizerV2z`Instruct tokenizer V2.

    This tokenizer adds supports to images, tools and FIM requests.
    Nr0   r1   r2   c                    t                                          |||           | j                            t          j        j                  | _        | j                            t          j        j                  | _	        | j                            t          j
        j                  | _        | j                            t          j        j                  | _        | j                            t          j        j                  | _        | j                            t          j        j                  | _        | j                            t          j        j                  | _        | j                            t          j        j                  | _        | j                            t          j        j                  | _        | j                            t          j        j                  | _        dS Initialize the tokenizer.

        Args:
            tokenizer: The tokenizer to use.
            image_encoder: The image encoder to use.
            audio_encoder: The audio encoder to use.
        N)r4   r5   r0   get_special_tokenr'   
begin_instvalue
BEGIN_INSTend_instEND_INSTbegin_toolsBEGIN_AVAILABLE_TOOLS	end_toolsEND_AVAILABLE_TOOLSbegin_tool_resultsBEGIN_TOOL_RESULTSend_tool_resultsEND_TOOL_RESULTSr   
TOOL_CALLSr   BOSr   PREFIXsuffixSUFFIXr6   s       r9   r5   zInstructTokenizerV2.__init__  sK    	M=AAA.::=;S;YZZ889O9UVV%)^%E%EmF_Fe%f%f"#'>#C#CMD[Da#b#b "&."B"B=CcCi"j"j $ @ @A_Ae f f.::=;S;YZZ>33M4E4KLLn66}7K7QRRn66}7K7QRRr:   FrO   ru   r   r   ri   rj   r;   c                    d}||o| j         t          j        k    z  }||o| j         t          j        k    z  }g }|rP|rNd |D             }	| j                            t          j        |	d          dd          }
| j        g|
| j	        }| 
                    |j        |||          \  }}}g || j        }| j        g}||z   |z   }|||fS )a  Encode a user message.

        Args:
            message: The message to encode.
            available_tools: The list of available tools if any.
            is_last: Whether the message is the last one.
            is_first: Not used.
            system_prompt: The system prompt.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and the list of images.
        Fc                 6    g | ]}|                                 S ra   )
model_dump).0tools     r9   
<listcomp>z;InstructTokenizerV2.encode_user_message.<locals>.<listcomp>  s"    CCC4T__&&CCCr:   ensure_asciir   )r   r   ri   rj   )&_user_message_position_to_encode_toolsr+   firstlastr0   r   jsondumpsr   r   r   r   r   r   )r7   rO   ru   r   r   ri   rj   do_encode_toolstools_tokenstoolstools_json_tokensrm   r   r   prefix_tokenssuffix_tokensr   s                    r9   rt   z'InstructTokenizerV2.encode_user_message  s    ,  8r)TXkXq)qr7p(SWjWo(op"$ 	 	CC?CCCE $ 5 5djUZ6[6[6[aflq 5 r r*" (L  $77O'+	  8  
  
u 9,88#f,}<E5((r:   r   c                 Z    	 t          j        |          S # t           j        $ r |cY S w xY wr=   )r   loadsJSONDecodeErrorr7   r   s     r9   _parse_json_contentz'InstructTokenizerV2._parse_json_content  s>    	:g&&&# 	 	 	NNN	s    **c                     t          |t                    rd                    d |D                       }|                     |          S )Nr   c              3   $   K   | ]}|j         V  d S r=   rn   r   rY   s     r9   	<genexpr>z:InstructTokenizerV2._parse_tool_content.<locals>.<genexpr>  $      >>Uej>>>>>>r:   )rI   rF   joinr   r   s     r9   _parse_tool_contentz'InstructTokenizerV2._parse_tool_content  sH    gt$$ 	?gg>>g>>>>>G''000r:   tool_messagec                 F    |j         |                     |j                  dS )z8Bit of a hack due to the way tool results are tokenized.)namer   )r   r   r   r7   r   s     r9   _prepare_tool_resultz(InstructTokenizerV2._prepare_tool_result  s-     !%//0DEE
 
 	
r:   rP   c                     |rg S t          j        |                     |          gd          }| j        g| j                            |dd          | j        }|S )a  Encode a tool message.

        Args:
            message: The message to encode.
            is_before_last_user_message: Whether the message is before the last user message. If true, the message is
                not encoded.

        Returns:
            The encoded tokens.
        Fr   r   r   r   r   r   r0   r   r   r7   rO   rP   tool_result_strr   s        r9   rU   z'InstructTokenizerV2.encode_tool_message  sz     ' 	I *d&?&?&H&H%IX]^^^#
^""?5"II
 !

 r:   	tool_callc                 Z    |j         j        |                     |j         j                  dS )z:Bit of a hack due to the way function calls are tokenized.r   	arguments)functionr   r   r  )r7   r   s     r9   _prepare_function_callz*InstructTokenizerV2._prepare_function_call  s3     &+11)2D2NOO
 
 	
r:   c                     |j         sJ d|             t          |j         t                    s
J d            | j                            |j                             d          dd          S )Nz)Assistant message must have content. Got 3Message content must be a string for tokenizer < V7 Fr   )r   rI   r   r0   r   rstripr   s     r9   (_encode_normal_content_assistant_messagez<InstructTokenizerV2._encode_normal_content_assistant_message  so    UU UG U UUU'/3//ff1fff/~$$W_%;%;C%@%@eQV$WWWr:   c                    |j         sJ d|             g }|j         D ]*}|                    |                     |                     +t          j        |d          }| j        g| j                            |dd          }|S )N,Assistant message must have tool calls. Got Fr   r   )r   ry   r  r   r   r   r0   r   )r7   rO   prepared_tool_callsr   tool_call_strr   s         r9   '_encode_tool_calls_in_assistant_messagez;InstructTokenizerV2._encode_tool_calls_in_assistant_message  s    ![[#[RY#[#[[[!  + 	O 	OI&&t'B'B9'M'MNNNN
#6UKKKO
^""=e"GG
 r:   rV   c                    |j         r|j        rt          d|           |r|j        rt	          d          |j         r|rg S |                     |          }nX|j        r:t          |j        t                    s
J d            |                     |          }nt          d|j                   |j        s!|s|
                    | j        j                   |S )a  Encode an assistant message.

        Args:
            message: The message to encode.
            is_before_last_user_message: Whether the message is before the last user message. If has tools and true, the
                message is not encoded.
            continue_message: Whether to continue the message generation.
                Only use this if the assistant message is the last message.

        Returns:
            The encoded tokens.
        zICannot have tool calls and content defined in the same assistant message r   r  Invalid assistant message: )r   r   
ValueErrorr   r	   r  rI   r   r
  r   ry   r0   r   r   s        r9   rX   z,InstructTokenizerV2.encode_assistant_message  s     	t'/ 	triprrsss 	 	2g    		V* 	FFwOOKK_ 	Vgos33jj5jjj3GGPPKK$%T7?%T%TUUU~ 	6&6 	6t~4555r:   rn   c                 R    | j                             d|z   dd          dd         S )z;Remove prefix space in the case of SentencePieceTokenizers.u   ☺Fr      N)r0   r   )r7   rn   s     r9   _encode_infillingz%InstructTokenizerV2._encode_infilling8  s.     ~$$UT\u%$HHLLr:   rC   c                    | j                             |j        dd          }|j        r|                     |j                  ng }| j        | j        g|| j        |}t          || 	                    |t          j                            S )zEncode a FIM request.

        Args:
            request: The request to encode.

        Returns:
            The encoded tokens.
        Fr   rk   )rm   rn   )r0   r   promptr   r  r   r   r   r(   r{   r&   r|   )r7   rC   r   r   rm   s        r9   r   zInstructTokenizerV2.encode_fim=  s     --gn%U-SSBI.X..w~>>>VXHK
 
 K	

 
 T[[VhVm[-n-noooor:   r   r   ))r   r   r   r   r+   r   r   r*   r,   r!   r5   r   rF   r   r   r   r   r   r   r   r   rt   r   r   r   r   r   dictr   rU   r   r  r   r
  r  rX   r  r   r(   r   r   r   s   @r9   r   r   }  s        
 .A-E*
 .2-1	S SS $d*S $d*	S S S S S S> %) %0) 0)0) dd*0) 	0)
 0) Tz0) 0) 
tCy$rz*DK7	80) 0) 0) 0)d3 3    13i+@ 1S 1 1 1 1

 
c3h 
 
 
 
; UY ^bcf^g    0
 
T#s(^ 
 
 
 
X@T XY]^aYb X X X X

?S 
X\]`Xa 
 
 
 
"+"JN"bf"	c" " " "HMc Md3i M M M M
p* p p p p p p p p pr:   r   c                       e Zd ZdZ	 	 ddededz  dedz  f fdZdede	e
ef         fd	Zd
ede	e
ef         fdZdededee         fdZdedededee         f fdZede
ez  ez  deee         ddf         fd            Zedeez  deee         ej        df         fd            Zedeez  deee         de f         fd            Zde
e!z  deee         ej        dz  e dz  f         fdZde"e!         deee         eej                 ee          f         fdZ#	 	 dde
ee$         z  dede
dz  dedeee         eej                 ee          f         f
 fdZ% xZ&S )InstructTokenizerV3zxInstruct tokenizer V3.

    The only difference with V2 tokenizer is that it encodes the tool messages differently.
    Nr0   r1   r2   c                 P    t                                          |||           dS )r   )r1   r2   N)r4   r5   r6   s       r9   r5   zInstructTokenizerV3.__init__Z  s*     	-}]]]]]r:   r   r;   c                     |j         j        |                     |j         j                  d}|j        r|j        dk    r
|j        |d<   |S )Nr  nullid)r  r   r   r  r  )r7   r   function_calls      r9   r  z*InstructTokenizerV3._prepare_function_calli  sX    &+11)2D2NOO
 

 < 	/ILF22"+,M$r:   r   c                 h    |j         
J d            |                     |j                  |j         dS )Nz7Tool message has to have the tool call id defined in v3)r   call_id)tool_call_idr   r   r   s     r9   r   z(InstructTokenizerV3._prepare_tool_resultt  sE    (446o444 //0DEE#0
 
 	
r:   rO   rP   c                     t          j        |                     |          d          }| j        g| j                            |dd          | j        }|S )a  Encode a tool message.

        Note:
            Same as [V2][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV2.encode_tool_message] but tools
            are not wrapped in a list and the history is also tokenized.

        Args:
            message: The message to encode.
            is_before_last_user_message: Whether the message is before the last user message. If true, the message is
                not encoded.

        Returns:
            The encoded tokens.
        Fr   r   r   r   s        r9   rU   z'InstructTokenizerV3.encode_tool_message|  sh     *T%>%>w%G%GV[\\\#
^""?5"II
 !

 r:   rV   c                 J    t                                          |d|          S )a  Encode an assistant message.

        Note:
            Same as [V2][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV2.encode_assistant_message] but
            always encode the tool history.
            continue_message: Whether to continue the message generation.
                Only use this if the assistant message is the last message.

        Args:
            message: The message to encode.
            is_before_last_user_message: Not used.

        Returns:
            The encoded tokens.
        F)r4   rX   )r7   rO   rP   rV   r8   s       r9   rX   z,InstructTokenizerV3.encode_assistant_message  s"    $ ww//@PQQQr:   rY   c                     d S r=   ra   r[   s     r9   _encode_content_chunkz)InstructTokenizerV3._encode_content_chunk  s    jmjmr:   c                     d S r=   ra   r[   s     r9   r&  z)InstructTokenizerV3._encode_content_chunk  s    nqnqr:   c                     d S r=   ra   r[   s     r9   r&  z)InstructTokenizerV3._encode_content_chunk  s    ililr:   c                    t          |t                    r | j                            |dd          d d fS t          |t                    r%| j                            |j        dd          d d fS t          |t                    r|                     |          d d fS t          |t          t          f          r5| j
        
J d            | 
                    |          }|j        |j        d fS t          |t          t          f          r5| j        
J d            |                     |          }|j        d |j        fS t#          d|           )NFr   z+Make sure to define a image encoder at initz+Make sure to define a audio encoder at initzUnknown chunk type: )rI   r   r0   r   r   rn   r   r\   r   r   r1   rm   r   r   r   r2   r   r  )r7   rY   img_encodingaudio_encodings       r9   r&  z)InstructTokenizerV3._encode_content_chunk  s_   eS!! 	=>((Eu(EEtTQQy)) 	=>((E(JJDRVVVz** 	=$$U++T477
M:;; 	=%113`111--e44L&(:D@@
M:;; 	=%113`111!//66N!($0DDD;E;;<<<r:   r   c                     g }g }g }|D ]^}|                      |          \  }}}|                    |           ||                    |           ||                    |           _|||fS r=   )r&  rv   ry   )	r7   r   rm   rp   r   rY   chunk_tokensmaybe_imagemaybe_audios	            r9   _encode_content_chunksz*InstructTokenizerV3._encode_content_chunks  s     #% 	* 	*E595O5OPU5V5V2L+{MM,'''&k***&[)))vu$$r:   Fr   ri   rj   c                 R   t          |t                    r#t                                          |||          S g }g }g }t	          |          dk    o!t          |d         t
          t          f          }|r|r|d         |d         g}d}	|D ]}
d}|	r+|r)|r'd}	|dz   }|| j                            |dd          z  }t          |
t          t          f          rL|rJ d	t          |
           d
            |                     |
          \  }}}|                    |           nft          |
t
          t          f          r/|                     |
          \  }}}|                    |           n|                     |
          d         }|                    |           |||fS )H  Encode a user content.

        Args:
            content: The content to encode.
            is_last: Whether the message is the last one.
            system_prompt: The system prompt.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and the images.
        r  rh   r   Tr   Fr   r   zEIt is not possible that `content` is non-empty when chunk is of type .)rI   r   r4   r   rs   r   r   r0   r   r   r   rx   r&  ry   rv   )r7   r   r   ri   rj   rm   rp   r   has_one_img_one_text_firstfirst_chunkrY   content_strr-  _chunk_audiochunk_imager8   s                   r9   r   z'InstructTokenizerV3.encode_user_content  s   $ gs## 	P77..wOOO#%%(\\Q%6%n:gajS]_lRm;n;n" 	/9 	/qz71:.G 	( 	(EK Sw S= S#+f4$.//E/RRR%*m!<== 
D&  j\`af\g\gjjj  04/I/I%/P/P,a[))))EJ#>?? D/3/I/I%/P/P,k1k****#99%@@CMM,''''vu$$r:   r   r   )'r   r   r   r   r*   r,   r!   r5   r   r  r   r   r  r   r   r   rF   r   rU   r   rX   r   r   r   r   r&  r   r   r   r   r   r   r   r   r   r0  r   r   r   r   s   @r9   r  r  R  s         .2-1	^ ^^ $d*^ $d*	^ ^ ^ ^ ^ ^	 	T#s(^ 	 	 	 	
 
c3h 
 
 
 
; UY ^bcf^g    .R+RJNRbfR	cR R R R R R( m3?Z+GmERVWZR[]acgRgLhmmm Xmq:+Eq%PTUXPY[][egkPkJlqqq Xql:+El%PTUXPY[_afPfJglll Xl=3+= =%S	SUS]`dSdfknrfrHrBs = = = =,%-%	tCy$rz*DK7	8% % % %( %) %3% 3%t,--3% 3% Tz	3%
 3% 
tCy$rz*DK7	83% 3% 3% 3% 3% 3% 3% 3% 3% 3%r:   r  c                       e Zd ZdZ	 	 d%dededz  dedz  ddf fdZdeee	         dz           d	ee
         d
e	de	ddf
dZdedee	         fdZ	 	 d&deee         z  dededz  dedeee	         eej                 ee         f         f
 fdZ	 	 d&dedee         dz  dedededz  dedeee	         eej                 ee         f         f fdZdedefdZdedefdZdeez  dedefdZdedefdZe d	ee!         ddfd            Z"e#d	ee!         defd             Z$de%d!edee	         fd"Z&de
d!ed#edee	         fd$Z' xZ(S )'InstructTokenizerV7a%  Instruct tokenizer V7.

    The difference with V3 tokenizer is that it encodes the system prompts differently:
    - in V7 the system prompts are treated as separate SystemMessages
    - they are no longer prepended to the last user message
    - they are printed between special tokens

    Nr0   r1   r2   r;   c                    t                                          |||           | j                            t          j        j                  | _        | j                            t          j        j                  | _	        | j                            t          j
        j                  | _        d| _        |<|j        j        s2| j                            t          j        j                  | _        dS dS dS r   )r4   r5   r0   r   r'   begin_systemr   BEGIN_SYSTEM
end_system
END_SYSTEMbegin_tool_contentBEGIN_TOOL_CONTENT
TRANSCRIBEaudio_configis_streaming
transcriber6   s       r9   r5   zInstructTokenizerV7.__init__  s     	M=AAA N<<]=W=]^^.::=;S;YZZ"&."B"B=CcCi"j"j$]-G-T$"n>>}?W?]^^DOOO %$$$r:   tokenized_messagesrH   r^   r_   c                 0   t          d D                       |z
  dt          dd ffd}d}dk    r|t                    k     r ||           |dz  }t          |dz
           t                    rl|t                    k     rYt          |         t                    s> ||           |dz  }|t                    k     rt          |         t                    >dk    r|t                    k     dk    rt          d          d S )Nc              3   8   K   | ]}|t          |          V  d S r=   )rs   )r   ts     r9   r   z?InstructTokenizerV7._truncate_for_max_tokens.<locals>.<genexpr>6  s(      JJAMc!ffMMMMJJr:   idxr;   c                     t          |          t                    rd S | k    rd S |          }|J t          |          z  d | <   d S r=   )rI   r   rs   )rK  r   r_   rH   to_droprG  s     r9   dropz:InstructTokenizerV7._truncate_for_max_tokens.<locals>.drop8  sc    (3-77 ---$S)C???s3xxG&*s###r:   r   rh   z+Input couldn't fit in truncate_at_max_token)sumr   rs   rI   r   r   )r7   rG  rH   r^   r_   rN  current_idxrM  s    `` `  @r9   rb   z,InstructTokenizerV7._truncate_for_max_tokens*  s`    JJ&8JJJJJZW	+c 	+d 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ 	+ kkkCMM99D1K(;?3[AA % "CMM11*XkEZ\g:h:h1D%%%1$K "CMM11*XkEZ\g:h:h1 kkkCMM99 Q;;$%RSSS ;r:   rO   c                     | j         g}t          |j        x}t                    rt	          |          g}||                     |          d         z  }|                    | j                   |S )zEncode a system message.

        Args:
            message: The message to encode.

        Returns:
            The encoded tokens.
        r   r   )r>  rI   r   r   r   r0  ry   r@  )r7   rO   rm   r   s       r9   rw   z)InstructTokenizerV7.encode_system_messageS  sq     #$0g#66 	0 g.../G$--g66q99do&&&r:   Fr   r   ri   rj   c                 Z   |
J d            t          |t                    r#t                                          |||          S t	          |          dk    o!t          |d         t
          t          f          }|r|r|d         |d         g}|                     |          \  }}}|||fS )r2  N?in Tokenizer V7 we don't encode system prompts in user messagesr  rh   r   )rI   r   r4   r   rs   r   r   r0  )
r7   r   r   ri   rj   r4  rm   rp   r   r8   s
            r9   r   z'InstructTokenizerV7.encode_user_contentd  s    $ $$&g$$$gs## 	P77..wOOO%(\\Q%6%n:gajS]_lRm;n;n" 	/9 	/qz71:.G $ ; ;G D Dvu$$r:   ru   r   c                 |    |
J d            t                                          ||||d|          \  }}}	|||	fS )a  Encode a user message.

        Args:
            message: The message to encode.
            available_tools: The list of available tools if any.
            is_last: Whether the message is the last one.
            is_first: Whether the message is the first one.
            system_prompt: Not used.
            force_img_first: Whether to force the image to be first.

        Returns:
            The encoded tokens and the list of images.
        NrS  )r   r   ri   rj   )r4   rt   )r7   rO   ru   r   r   ri   rj   rm   rp   r   r8   s             r9   rt   z'InstructTokenizerV7.encode_user_message  s`    , $$&g$$$ % ; ;+ !< !
 !
 vu$$r:   rC   c                 H   | j         J d| j                     | j         j        j        t          j        k    r|                     |          S | j         j        j        t          j        k    r|                     |          S t          d| j         j        j        d          )a  
        Encodes an audio transcription request into a tokenized format.

        This method processes a transcription request containing audio data,
        encodes the user message, and returns the tokenized output.

        Args:
            request: The transcription request object containing
                the audio data to be encoded.

        Returns:
            Tokenized: The tokenized representation of the audio data, including processed audio and tokens
        Nz6Audio encoder must be defined, got self.audio_encoder=zxTranscription format should be one of 'instruct', 'streaming', got self.audio_encoder.audio_config.transcription_format=r3  )	r2   rD  transcription_formatr"   INSTRUCT_encode_instruct_transcription	STREAMING_encode_streaming_transcriptionr   r   s     r9   r   z(InstructTokenizerV7.encode_transcription  s     !--/jUYUg/j/j---*?CVC___66w???,AEXEbbb77@@@%H!.CH H H
 
 	
r:   c                 *   |j         t          j        k    sJ d|j                     | j        J | j        j         d            |                                 }|                     t          t          |j
                  g          g ddd           \  }}}g ||}|j        *d|j         }|| j                            |dd	          z  }|                    | j                   t          || j                            |          |
          S )Nz=Request must not be in streaming mode, got request.streaming=z! needs to have a TRANSCRIBE token)input_audio)r   T)ru   r   r   ri   zlang:Fr   rm   rn   rq   )	streamingr   DISABLEDrC  r8   r   rB   rt   r   r   r   languager0   r   ry   r(   r   )r7   rC   r   rm   r7  r   language_strings          r9   rX  z2InstructTokenizerV7._encode_instruct_transcription  s2    M$::::N':KNN ;:: **t~/F,i,i,i***33!F!F!F GHHH 4 
 
5 $6#F#'8g&688Odn++OE+RRRFdo&&&T^-F-Fv-N-NW\]]]]r:   r   is_online_streamingc                    | j         J d| j                     t          |t                    rt          j        |          nt          j        |          }| j                             ||          }t          |j        |j	        g          S )NzFAudio encoder must be defined to encode audio, got self.audio_encoder=rb  )rm   rq   )
r2   rI   r   r   from_base64
from_bytesencode_audior(   rm   r   )r7   r   rb  _audio	audio_encs        r9   _encode_audioz!InstructTokenizerV7._encode_audio  s    !--W$BTWW .-- .8s-C-C`"5)))IYZ_I`I`&33FPc3dd	#O$
 
 
 	
r:   c                 `   |j         t          j        k    sJ d|j                     |                     |j        j        |j         t          j        k              }|                                 |j        z   }t          || 
                    |t          j                  |j                  }|S )Nz9Request must be in streaming mode, got request.streaming=rd  rk   r]  )r^  r   r_  rj  r   dataONLINErB   rm   r(   r{   r&   r|   rq   )r7   rC   r]   rm   s       r9   rZ  z3InstructTokenizerV7._encode_streaming_transcription  s     M$::::Jg6GJJ ;:: &&MG4EI]4] ' 
 
	
 	 00V:L:QRR#
 
 
	
 r:   c                     |                      |          r(t          d |D                       rt          d          d S d S )Nc              3   @   K   | ]}t          |t                    V  d S r=   )rI   r   r   rO   s     r9   r   z8InstructTokenizerV7.validate_messages.<locals>.<genexpr>  s,      NN':g}55NNNNNNr:   z9System messages are not yet allowed when audio is present)
_has_audioanyr  rd   s     r9   rf   z%InstructTokenizerV7.validate_messages  s^    >>(## 	^NNXNNNNN ^ !\]]]	^ 	^^ ^r:   c                 4    t          d | D                       S )Nc              3      K   | ]Q}t          |t                    o7t          |j        t                    ot	          d  |j        D                       V  RdS )c              3   @   K   | ]}t          |t                    V  d S r=   )rI   r   r   s     r9   r   z;InstructTokenizerV7._has_audio.<locals>.<genexpr>.<genexpr>  s,      OOeJuj11OOOOOOr:   N)rI   r   r   rF   rr  rp  s     r9   r   z1InstructTokenizerV7._has_audio.<locals>.<genexpr>  sy       
 
  w,, P7?D11POOwOOOOO
 
 
 
 
 
r:   )rr  )rH   s    r9   rq  zInstructTokenizerV7._has_audio  s2     
 
 $	
 
 
 
 
 	
r:   rP   c                 $   |j         J t          |j        t                    s
J d            | j                            |j         dd          }| j                            |j        dd          }| j        g|| j        }g ||| j        }|S )a  Encode a tool message.

        Note:
            Same as [V3][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV3.encode_tool_message]
            but tools are not wrapped in a list and history is also tokenized

        Args:
            message: The message to encode.
            is_before_last_user_message: Not used.

        Returns:
            The encoded tokens.
        Nr   Fr   )	r"  rI   r   r   r0   r   r   rB  r   )r7   rO   rP   tool_call_id_tokensrm   r   r   s          r9   rU   z'InstructTokenizerV7.encode_tool_message  s     #///'/3//UU1UUU/"n33G4HeY^3__&&wEu&MM #
 
 #




 !

 r:   rV   c                    |j         s|j        st          d|           |r|j        rt	          d          g }|j         rmt          |j         t                    r|                     |          }n=t          |j         t                    r#|| 	                    |j                   d         z  }|j        r|| 
                    |          z  }|j        s!|s|                    | j        j                   |S )r   r  r   r   )r   r   r   r   r	   rI   r   r
  rF   r0  r  ry   r0   r   r   s        r9   rX   z,InstructTokenizerV7.encode_assistant_message!  s     	Nw'9 	N$%L7%L%LMMM 	 	2g   ? 	O'/3// O"KKGTTGOT22 Ot::7?KKANN 	Q4GGPPPK~ 	6&6 	6t~4555r:   r   r   ))r   r   r   r   r*   r,   r!   r5   rF   r   r   rb   r   rw   r   r   r   r   r   r   r   r   r   r   rt   r    r(   r   rX  bytesrj  rZ  r   r   rf   r   rq  r   rU   rX   r   r   s   @r9   r;  r;  	  s         .2-1	_ __ $d*_ $d*	_
 
_ _ _ _ _ _.'T cT!12'T +,'T 	'T
 "%'T 
'T 'T 'T 'TR] tCy    * %) %% %t,--% % Tz	%
 % 
tCy$rz*DK7	8% % % % % %J %) %!% !%!% dd*!% 	!%
 !% Tz!% !% 
tCy$rz*DK7	8!% !% !% !% !% !%F
,@ 
Y 
 
 
 
2^6J ^y ^ ^ ^ ^,

3; 

T 

i 

 

 

 

7K PY    $ ^d ^ ^ ^ ^ [^
 
T$Z 
D 
 
 
 \
; UY ^bcf^g    > + JN bf 	c               r:   r;  c            	       b     e Zd ZdZ	 	 d
dededz  dedz  ddf fdZdede	e
         fd	Z xZS )InstructTokenizerV11zInstruct tokenizer V11.

    The difference with V7 tokenizer is that it encodes tool calls differently:
    Tool call results are encoded as :
    - [begin tool call] call_name_tokens [call id] call_id_tokens [args] content tokens
    Nr0   r1   r2   r;   c                    t                                          |||           | j                            t          j        j                  | _        | j                            t          j        j                  | _	        d S r=   )
r4   r5   r0   r   r'   argsr   ARGSr!  CALL_IDr6   s       r9   r5   zInstructTokenizerV11.__init__L  s_     	M=AAAN44]5G5MNN	~778M8STTr:   rO   c           	         |j         sJ d|             g }|j         D ]}|                     |          }g }d|v r+| j        g| j                            |d         dd          }|| j        g| j                            |d         dd          || j        | j                            t          j        |d         d          dd          z  }|S )Nr  r  Fr   r   r  r   )	r   r  r  r0   r   r   r~  r   r   )r7   rO   r   r   preparedidss         r9   r  z<InstructTokenizerV11._encode_tool_calls_in_assistant_messageV  s   ![[#[RY#[#[[[! + 	 	I229==HCx|bdn&;&;HTNPU[`&;&a&ab&&x'7U&NN  		
 &&tz(;2GV['\'\'\bgmr&ss KK r:   r   )r   r   r   r   r*   r,   r!   r5   r   rF   r   r  r   r   s   @r9   r{  r{  D  s          .2-1	U UU $d*U $d*	U
 
U U U U U U?S X\]`Xa        r:   r{  c            	            e Zd ZdZej        Z	 	 ddededz  de	dz  ddf fdZ
dedee         fd	Zded
edee         fdZdedee         fdZ xZS )InstructTokenizerV13zInstruct tokenizer V13.

    The difference with V11 tokenizer is that it encodes tool calls differently:
        - available tools are tokenized at the first user message.
        - call id is no longer tokenized for tool calls or results.
    Nr0   r1   r2   r;   c                    t                                          |||           t          |t                    sJ dt	          |                       t
          j        j        |j        v rlt
          j	        j        |j        v rT|
                    t
          j        j                  | _        |
                    t
          j	        j                  | _        d S d | _        d | _        d S )Nz$Tokenizer must be a Tekkenizer. Got )r4   r5   rI   r-   rx   r'   begin_thinkr   _special_tokens_reverse_vocab	end_thinkr   BEGIN_THINK	END_THINKr6   s       r9   r5   zInstructTokenizerV13.__init__t  s     	M=AAA)Z00jj2jY]^gYhYh2j2jjj0%+y/VVV'-1XXX+4+F+F}G`Gf+g+gD)2)D)D]E\Eb)c)cDNNN#D!DNNNr:   rO   c           	      p   |j         sJ d|             g }|j         D ]}|j        r|j        dk    sJ |                     |          }|| j        g| j                            |d         dd          | j        | j                            t          j        |d         d          dd          z  }|S )Nr  r  r   Fr   r  r   )	r   r  r  r   r0   r   r~  r   r   )r7   rO   r   r   r  s        r9   r  z<InstructTokenizerV13._encode_tool_calls_in_assistant_message  s    ![[#[RY#[#[[[! + 		 		I<:ILF$:$:$::229==H&&x'7U&NN 	 &&tz(;2GV['\'\'\bgmr&ss	 KK r:   rP   c                     |j         
J d            |j        }t          |t                    sd                    d |D                       }| j                            |dd          }| j        g|| j        }|S )zEncode a tool message.

        Args:
            message: The message to encode.
            is_before_last_user_message: Not used.
        Returns:
            The encoded tokens.
        Nz2Tool call id must be provided for tokenizer >= v13r   c              3   $   K   | ]}|j         V  d S r=   r   r   s     r9   r   z;InstructTokenizerV13.encode_tool_message.<locals>.<genexpr>  r   r:   Fr   )	r"  r   rI   r   r   r0   r   r   r   )r7   rO   rP   r   rm   r   s         r9   rU   z(InstructTokenizerV13.encode_tool_message  s     #//1e////'3'' 	?gg>>g>>>>>G&&wEu&EE#

 !

 r:   rY   c                     | j         
J d            | j        
J d            | j                            |j        dd          }| j         g|}|j        r|                    | j                   |S )zEncode a thinking chunk.

        Args:
            chunk: The thinking chunk to encode.
        Returns:
            The encoded tokens.
        Nz2think tokens are not available for this tokenizer.Fr   )r  r  r0   r   thinkingclosedry   )r7   rY   rm   think_tokenss       r9   r\   z!InstructTokenizerV13.encode_think  s     ++-a+++~))+_)))&&u~5e&LL(262< 	0///r:   r   )r   r   r   r   r+   r   r   r*   r,   r!   r5   r   rF   r   r  r   r   rU   r   r\   r   r   s   @r9   r  r  j  s         .A-F*
 .2-1	" "" $d*" $d*	"
 
" " " " " "$?S X\]`Xa    ; UY ^bcf^g    .* c        r:   r  )Cr   abcr   typingr   r   r   r   numpyr   mistral_common.audior   mistral_common.exceptionsr	   r
   r   r   #mistral_common.protocol.fim.requestr   &mistral_common.protocol.instruct.chunkr   r   r   r   r   r   r   r   )mistral_common.protocol.instruct.messagesr   r   r   r   r   r   (mistral_common.protocol.instruct.requestr   +mistral_common.protocol.instruct.tool_callsr   r   -mistral_common.protocol.transcription.requestr   r    &mistral_common.tokens.tokenizers.audior!   r"   %mistral_common.tokens.tokenizers.baser#   r$   r%   r&   r'   r(   r)   r*   r+   &mistral_common.tokens.tokenizers.imager,   'mistral_common.tokens.tokenizers.tekkenr-   r/   r   r   r  r;  r{  r  ra   r:   r9   <module>r     s          3 3 3 3 3 3 3 3 3 3 3 3     & & & & & &            ; : : : : :	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	                E D D D D D F F F F F F F F ] ] ] ] ] ] ] ] T T T T T T T T
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 @ ? ? ? ? ? > > > > > >D1 D1 D1 D1 D1w2NMSgghD1 D1 D1N^ ^ ^ ^ ^7#6Wk#kl^ ^ ^DRp Rp Rp Rp Rp!4nmUi!ijRp Rp Rpjt% t% t% t% t%!4nmUi!ijt% t% t%nx x x x x- x x xv	# # # # #. # # #LP P P P P/ P P P P Pr:   