
     `i                     l    d dl mZmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZ  G d d          ZdefdZd	S )
    )Regex	Tokenizerdecoderspre_tokenizers
processors)BPE)LlamaTokenizerFast)bytes_to_unicodec                   @    e Zd ZdZ	 	 	 	 ddZdefdZd Zd	efd
Z	dS )MistralConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                 >    || _         || _        || _        || _        d S )N)vocabpatternadd_prefix_spaceadditional_special_tokens)selfr   r   r   r   kwargss         u/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/integrations/mistral.py__init__zMistralConverter.__init__   s(     
 0)B&&&    r   c                 H  
 |
t                      fdg }i }t          
                                          D ]\  }\  }}|| j        vr|| |          <   t	          |          dk    r3g }t          dt	          |                    D ]=}|d |         ||d          }	}|
v r#|	
v r||	z   
v r|                    ||	|f           >t          |
fdd          }|                    |           |||<   t          |d d          }fd|D             }||fS )Nc                 l    d                     fd|                     d          D                       S )N c                 :    g | ]}t          |                   S  )ord).0charbyte_encoders     r   
<listcomp>zcMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>   s$    TTTLT3TTTr   zlatin-1)joindecode)br    s    r   token_bytes_to_stringzOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string   s6    77TTTT@S@STTTUUUr      c                 <    | d                  | d                  fS )Nr   r&   r   )x	bpe_rankss    r   <lambda>zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>-   s    Yqt_iPQRSPTo4V r   F)keyreversec                     | d         S )N   r   )vals    r   r*   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>1   s
    A r   c                 T    g | ]$} |d                     |d                   f%S )r   r&   r   )r   r/   r%   s     r   r!   zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>2   s?    cccUX((Q002G2GA2O2OPcccr   )	r
   	enumerateitemsr   lenrangeappendsortedextend)r   r   mergesidxtokenranklocalindexpiece_lpiece_rr)   r    r%   s             @@@r   extract_vocab_merges_from_modelz0MistralConverter.extract_vocab_merges_from_model   s   	'))	V 	V 	V 	V 	V "+IOO,=,=">"> 	# 	#C%D:::69++E223u::??"1c%jj11 ? ?E',VeV}eEFFmWG)++90D0D'T[J[`iIiIigw%=>>>u*V*V*V*V`efffe$$$$"e$6$6FFFcccc\bcccf}r   c                     |                      | j                  \  }}t          t          ||d                    }t	          |j        d          rd|j        _        |S )NF)fuse_unkignore_mergesT)r@   r   r   r   hasattrmodelrC   )r   vocab_scoresr8   	tokenizers       r   rG   zMistralConverter.tokenizer5   s\    #CCDJOOfc,GGGHH	9?O44 	1,0IO)r   returnc                 |   |                                  }t          j        t          j        t	          | j                  dd          t          j        | j        d          g          |_        t          j                    |_
        |                    | j                   t          j        d          |_        |S )NisolatedF)behaviorinvert)r   	use_regex)trim_offsets)rG   r   SequenceSplitr   r   	ByteLevelr   pre_tokenizerr   decoderadd_special_tokensr   r   post_processor)r   rG   s     r   	convertedzMistralConverter.converted<   s    NN$$	"0"9$U4<%8%8:V[\\\($:O[`aaa#
 #
	 %.00	$$T%CDDD#-#7U#K#K#K	 r   )Nr   FN)
__name__
__module____qualname____doc__r   strr@   rG   r   rV   r   r   r   r   r      s           K"&C C C CS    6  9      r   r   tokenizer_filec                 p   ddl m} |                    |           }|j        j        j        }d |j        j        j        D             fdD             }|                    |           |}t          t          |          
                                          }|                    di           |S )z1Convert a "tekken" tokenizer to a fast Tokenizer.r   )MistralTokenizerc                 @    g | ]}t          |d           r|j        n|S )value)rD   r`   )r   r:   s     r   r!   z,convert_tekken_tokenizer.<locals>.<listcomp>W   s=        ug..9E  r   c                 <    i | ]}|                     |          S r   )r=   )r   r:   all_specials     r   
<dictcomp>z,convert_tekken_tokenizer.<locals>.<dictcomp>[   s)    PPP5uk//66PPPr   )r   r   )tokenizer_objectr   )(mistral_common.tokens.tokenizers.mistralr^   	from_fileinstruct_tokenizerrG   _tekken_token2id_nospecial_all_special_tokensupdater	   r   rV   rT   )r\   r^   mistral_tokenizerr   specials_tokensrG   rb   s         @r   convert_tekken_tokenizerrm   L   s     JIIIII )22>BB 0:UE &9CW  K QPPPKPPPO5!!!E #)Q\]]]ggii  I
   "={!KLLLr   N)
tokenizersr   r   r   r   r   tokenizers.modelsr   transformersr	   #transformers.convert_slow_tokenizerr
   r   r[   rm   r   r   r   <module>rr      s    M M M M M M M M M M M M M M ! ! ! ! ! ! + + + + + + @ @ @ @ @ @A A A A A A A AHS      r   