
     `iZ                    4   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZ ddlmZ d	d
lmZmZmZmZ d	dlmZ  ej        e          ZddZde de!fdZ"d Z# G d d          Z$ G d de$          Z%de!de fdZ& G d d          Z' G d de'          Z( G d de'          Z) G d de'          Z* G d  d!e'          Z+ G d" d#e'          Z, G d$ d%e'          Z- G d& d'e'          Z. G d( d)e'          Z/ G d* d+e'          Z0 G d, d-e'          Z1 G d. d/e'          Z2 G d0 d1e'          Z3 G d2 d3e3          Z4 G d4 d5e3          Z5 G d6 d7e3          Z6 G d8 d9e3          Z7 G d: d;e3          Z8 G d< d=e3          Z9 G d> d?e3          Z: G d@ dAe3          Z; G dB dCe3          Z< G dD dEe3          Z= G dF dGe3          Z> G dH dIe3          Z? G dJ dKe3          Z@ G dL dMe3          ZA G dN dOe3          ZB G dP dQe3          ZC G dR dSe'          ZD G dT dUe3          ZE G dV dWe'          ZF G dX dYe'          ZG G dZ d[e'          ZH G d\ d]e3          ZI G d^ d_e3          ZJ G d` dae3          ZK G db dce'          ZL G dd dee3          ZM G df dge3          ZN G dh die3          ZOdj ZP G dk dl          ZQ G dm dn          ZRi doe4dpe0dqe5dre(dseEdteHdue6dveFdwe-dxe(dye2dze7d{e(d|e(d}e(d~e(de(i de4de*de-de.de(de(de0de<de0de0de(deLde8de9de+de(de0i de:de,deAde/de(de>de?de(de0de1de;de(deBdeCdeDde<de=e)eIeKeKeJeKdZSddefdZTdS )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)	lru_cache)Optional)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece)tqdm   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR c                    t                      rddlm} |S t                      rGdd l}t          j        |j        j                  t          j        d          k     rddl	m} nddl	m
} |S t          t          j        |                     )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   googles      w/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufr&   %   s    !## '999999&& 	G=455g8N8NNNBBBBBBBaaaaaa&&/6}EEFFF    add_prefix_spacereturnc                 :    | rd}t          |dd          sd}nd}|S )NalwayslegacyTfirstnever)getattr)r(   original_tokenizerprepend_schemes      r%   _get_prepend_schemer2   6   s5     !!)8T:: 	%$N r'   c                     |d u}|rt          |          n }g }|                                D ]\  }}g }t          dt          |                    D ]6}|d |         ||d          }	}| v r|	 v r|                    ||	|f           7t          | fd          }|                    |           t          |d |          }d |D             }|S )Nr   c                 <    | d                  | d                  fS Nr   r    )xvocabs    r%   <lambda>z!generate_merges.<locals>.<lambda>K   s    U1Q4[%!+,F r'   keyc                 d    | d         t          | d                   t          | d                   fS )N   r   r   )lenvals    r%   r9   z!generate_merges.<locals>.<lambda>N   s%    SVSQ[[#c!f++,N r'   r;   reversec                 .    g | ]}|d          |d         fS r   r   r6   ).0r@   s     r%   
<listcomp>z#generate_merges.<locals>.<listcomp>O   s%    1113s1vs1v111r'   )dictitemsranger>   appendsortedextend)
r8   vocab_scoresrB   mergesmergepiece_scorelocalindexpiece_lpiece_rs
   `         r%   generate_mergesrU   @   s   $&G)0;4%%%eLF*0022  {1c%jj)) 	> 	>E$VeV}eEFFmWG%Gu$4$4gw<===u"F"F"F"FGGGeF N NX_```F11&111FMr'   c                   X    e Zd ZdZdefdZddeeeef         e	e         f         fdZ
dS )SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                     t          | d           ddlm}  |            | _        | j                            |           d S )Nr   r   )SentencePieceProcessor)r   r   rZ   spLoad)selfrX   rZ   s      r%   __init__zSentencePieceExtractor.__init__X   sN    $000888888((**Ur'   Nr)   c                     | j         fdt                                                    D             }t          ||          }||fS )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        c                 <    i | ]}                     |          |S r6   id_to_piecerE   rR   r[   s     r%   
<dictcomp>z2SentencePieceExtractor.extract.<locals>.<dictcomp>e   '    TTT%&&TTTr'   )r[   rI   GetPieceSizerU   r]   rM   r8   rN   r[   s       @r%   extractzSentencePieceExtractor.extract_   sP    
 WTTTT5ARAR;S;STTT 55f}r'   N)__name__
__module____qualname____doc__strr^   tuplerG   intlistri   r6   r'   r%   rW   rW   S   so         c    
 
E$sCx.$u+2M,N 
 
 
 
 
 
r'   rW   c                   H    e Zd Zddeeeef         ee         f         fdZdS )GemmaSentencePieceExtractorNr)   c                     | j         fdt                                                    D             }d|vr|                    d          |d<   t	          ||          }||fS )r`   c                 <    i | ]}                     |          |S r6   rb   rd   s     r%   re   z7GemmaSentencePieceExtractor.extract.<locals>.<dictcomp>s   rf   r'   	<0x09>)r[   rI   rg   getrU   rh   s       @r%   ri   z#GemmaSentencePieceExtractor.extractm   sr    
 WTTTT5ARAR;S;STTT u))H--E$K 55f}r'   rj   )	rk   rl   rm   rp   rG   ro   rq   rr   ri   r6   r'   r%   rt   rt   l   sJ         E$sCx.$u+2M,N      r'   rt   piecec                 v    t          |           dk     p&| d         dk    p| d                                          S )Nr=   ,)r>   isdigit)rz   s    r%   check_number_commar   }   s8    u::>HU2Y#-HU2Y5F5F5H5H1HHr'   c                        e Zd Zd ZdefdZdS )	Converterc                     || _         d S rj   )r0   )r]   r0   s     r%   r^   zConverter.__init__   s    "4r'   r)   c                     t                      rj   )NotImplementedErrorr]   s    r%   	convertedzConverter.converted   s    !###r'   N)rk   rl   rm   r^   r   r   r6   r'   r%   r   r      s>        5 5 5$9 $ $ $ $ $ $r'   r   c                       e Zd ZdefdZdS )BertConverterr)   c           	         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	||f||	fg
          |_        t1          j        d          |_        |S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr0   r8   r   r   ro   r   hasattrr   tokenize_chinese_charsr   do_lower_caser
   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr	   decoder
r]   r8   	tokenizerr   r   r   clssepr   r   s
             r%   r   zBertConverter.converted   y   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344.;.;#-#@**3***5555c555l#l#$
 $
 $
	  %.d;;;	r'   Nrk   rl   rm   r   r   r6   r'   r%   r   r      /        #9 # # # # # #r'   r   c                       e Zd ZdefdZdS )SplinterConverterr)   c           
         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }t	          | j         j                  }d}	| j         j        }
| j         j        }| j         j        }| j                             d          }| j         j        dk    r| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}t3          j        | d| d|||
f||f||f|	|fg          |_        t9          j        d          |_        |S )Nr   Fr   Tr   .rightr    r   r   r   r   r   r   )r0   r8   r   r   ro   r   r   r   r   r   r   r
   r   r   r   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider   r   r   r	   r   )r]   r8   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   s                  r%   r   zSplinterConverter.converted   s   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344t.=>>.;.; 3E.DDSII"/7::HH8HHcHHCHHHHHDDHH3HHHH3HHHHHD#-#@**3***l#l#,-l#		$
 	$
 	$
	  %.d;;;	r'   Nr   r6   r'   r%   r   r      s/        .9 . . . . . .r'   r   c                       e Zd ZdefdZdS )FunnelConverterr)   c           	         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	||f||	fg
          |_        t1          j        d          |_        |S )Nr   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   s
             r%   r   zFunnelConverter.converted   r   r'   Nr   r6   r'   r%   r   r      r   r'   r   c                       e Zd ZdefdZdS )MPNetConverterr)   c                    | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	| d
||f||	fg          |_        t1          j        d          |_        |S )Nr   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   s
             r%   r   zMPNetConverter.converted
  s   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344.;.;#-#@**3***======c===l#l#$
 $
 $
	  %.d;;;	r'   Nr   r6   r'   r%   r   r   	  r   r'   r   c                       e Zd ZdefdZdS )OpenAIGPTConverterr)   c           
         | j         j        }t          | j         j                                                  }| j         j        }t          t          ||d t          |          dd                    }|	                    t          |                    #|
                    t          |          g           t          j        d          |_        t          j                    |_        t#          j        d          |_        |S )N</w>F)r8   rN   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)r0   encoderrr   	bpe_rankskeysr   r   r   ro   token_to_idadd_special_tokensr
   r   r   r   r   r   r	   
BPEDecoderr   r]   r8   rN   r   r   s        r%   r   zOpenAIGPTConverter.converted1  s    '/d-7<<>>??+5	i..#)  	
 	
	   Y00<((#i..)9:::*9DIII	"0"A"C"C	$/v>>>	r'   Nr   r6   r'   r%   r   r   0  s/        9      r'   r   c            	       j    e Zd Z	 ddeeeef                  deeeeef                           de	fdZ
dS )GPT2ConverterNr8   rN   r)   c           
         |s| j         j        }|st          | j         j                  }t	          t          ||d ddd                    }t          | j         dd          }t          j        |          |_	        t          j                    |_        t          | j         dd          r>| j         j        }| j         j        }t          j        | d| d||fg	          |_        nt          j        d
          |_        |S )Nr   Fr8   rN   r   continuing_subword_prefixr   r   r(   r(   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r0   r   rr   r   r   r   r/   r   	ByteLevelr   r	   r   	bos_tokenbos_token_idr   r   r   )r]   r8   rN   r   r(   bosr   s          r%   r   zGPT2Converter.convertedL  s1     	4+3E 	=$1;<<F*,#%  	
 	
	 #4#:<NPUVV"0":L\"]"]"]	$.00	4*OUCC 	P)3C2?L'1'D))),' ( ( (I$$ (2';'O'O'OI$r'   NNrk   rl   rm   r   rG   ro   rq   rr   rp   r   r   r6   r'   r%   r   r   K  sj        `d$ $d38n-$>FtERUWZRZOG\>]$	$ $ $ $ $ $r'   r   c                       e Zd ZdefdZdS )HerbertConverterr)   c           	      .   d}d}| j         j        }t          | j         j                                                  }||d         d         v r
|dd          }t          t          ||d | j         j        |                    }t          j	        dd          |_
        t          j                    |_        t          j        |          |_        t#          j        | j         j        | j         j        f| j         j        | j         j        f	          |_        |S )
Nz	#version:r   r   r   )r   r   r   F)r   r   r   )r   r   )r0   r   rr   r   r   r   r   r   r
   r   r   r   r   r   r	   r   r   r   BertProcessingr   r   r   r   r   )r]   tokenizer_info_strtoken_suffixr8   rN   r   s         r%   r   zHerbertConverter.convertedt  s   ('/d-7<<>>??1--ABBZF1;#/  
 
	  +9EY^___	"0"A"C"C	$/|DDD	#-#<(2D4K4XY(2D4K4XY$
 $
 $
	 
 r'   Nr   r6   r'   r%   r   r   s  /        9      r'   r   c            	       j    e Zd Z	 ddeeeef                  deeeeef                           de	fdZ
dS )Qwen2ConverterNr8   rN   r)   c                 "   |s| j         j        }|s+t          | j         j                                                  }t          t          ||d d dddd                    }t          j                    |_	        t          j        t          j        t          d          dd          t          j        t          | j         dd          d          g          |_        t#          j                    |_        t'          j        d	          |_        |S )
Nr   F)r8   rN   r   r   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr(   r(   	use_regexr   )r0   r   rr   r   r   r   r   r
   NFCr   r   SequenceSplitr   r   r/   r   r	   r   r   r   )r]   r8   rN   r   s       r%   r   zQwen2Converter.converted  s1     	4+3E 	D$1;@@BBCCF*,#%#	 	 	
 
	  +00	"0"9$ N  (    (%,T-DFXZ_%`%`#  #
 #
	  %.00	#-#7U#K#K#K	 r'   r   r   r6   r'   r%   r   r     sj        `d* *d38n-*>FtERUWZRZOG\>]*	* * * * * *r'   r   c                       e Zd ZdefdZdS )RobertaConverterr)   c           
         | j         }|j        }t          |j                                                  }t          t          ||d ddd                    }t          j        |j	                  |_
        t          j                    |_        t          j        |j        |j        f|j        |j        f|j	        d          |_        |S )Nr   Fr   r   Tr   r   r(   r   )r0   r   rr   r   r   r   r   r   r   r(   r   r	   r   r   RobertaProcessingr   r   r   r   r   r]   otr8   rN   r   s        r%   r   zRobertaConverter.converted  s    $
bl''))***,#%  	
 	
	 #1":BL_"`"`"`	$.00	#-#?r/r/0	$
 $
 $
	  r'   Nr   r6   r'   r%   r   r     /        9      r'   r   c                       e Zd ZdefdZdS )RoFormerConverterr)   c           	         ddl m} | j        j        }t	          t          |t          | j        j                                      }d}d}t          | j        d          r"| j        j	        j
        }| j        j	        j        }t          j        dd||          |_        t          j                             ||                    |_        t          | j        j                  }t          | j        j                  }| j        j        }| j        j        }	t/          j        | d| d	| d| d
| d||f||	fg          |_        t5          j        d          |_        |S )Nr   )JiebaPreTokenizerr   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr  r0   r8   r   r   ro   r   r   r   r   r   r
   r   r   r   PreTokenizercustomr   r   r   r   r   r   r   r   r	   r   )
r]   r  r8   r   r   r   r   r   r   r   s
             r%   r   zRoFormerConverter.converted  s   IIIIII'-iT=T=^9_9_```aa	4*,=>> 	R 3CQM 3CQM*9!&'#	 
  
  
	 #1"="D"DEVEVW\E]E]"^"^	$)344$)344.;.;#-#@**3***5555c555l#l#$
 $
 $
	  %.d;;;	r'   Nr   r6   r'   r%   r  r    r   r'   r  c                       e Zd ZdefdZdS )DebertaConverterr)   c           
         | j         }|j        }t          |j                                                  }t          t          ||d ddd                    }t          j        |j	                  |_
        t          j                    |_        t          j        ddd| j                             d          fd| j                             d          fg	          |_        |S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r0   r   rr   r   r   r   r   r   r   r(   r   r	   r   r   r   r   r   r   s        r%   r   zDebertaConverter.converted  s    $
bl''))***,#%  	
 	
	 #1":BL_"`"`"`	$.00	#-#@)4$1GGPPQ$1GGPPQ$
 $
 $
	  r'   Nr   r6   r'   r%   r
  r
    r   r'   r
  c                   `     e Zd ZdZeZi Z fdZd Zd Z	d Z
d Zd Zd Zd	 Zd
efdZ xZS )SpmConverterFc                    t          | d            t                      j        |  t                      }|                                }t          | j        j        d          5 }|                    |	                                           d d d            n# 1 swxY w Y   || _
        | j
        j        j        r| j        st          j        d           d S d S d S )Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superr^   r&   
ModelProtoopenr0   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)r]   args	model_pb2mf	__class__s        r%   r^   zSpmConverter.__init__(  s"   $
+++$ $%%	  ""$)4d;; 	(qaffhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(
:"0 	9R 	Me    	 	 	 	s   $(BBBc                 $    d |j         D             S )Nc                 *    g | ]}|j         |j        fS r6   rz   scorerE   rz   s     r%   rF   z&SpmConverter.vocab.<locals>.<listcomp>>  s!    EEEuek*EEEr'   piecesr]   r  s     r%   r8   zSpmConverter.vocab=  s    EEEEEEr'   c                     |j         j        S rj   )r  unk_idr+  s     r%   r-  zSpmConverter.unk_id@  s    !((r'   c           
          |j         j        }                     |          }|dk    r8t          t	          |                     |           j                            }n|dk    r                      j        j	                  
                    |          \  }}d t          |          D             }t          t          |||j         j        d j        d                     }nt          d           fdt          |j                  D             }|                    d	 t#          |d
           D                        |S )Nr   r-  r   r=   c                      i | ]\  }\  }}||S r6   r6   rE   iwordr'  s       r%   re   z*SpmConverter.tokenizer.<locals>.<dictcomp>R  s#    QQQ%5QuqQQQr'   Tr   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                 j    g | ]/\  }}|j         d v ||j        |j         dk    p|j        j        v f0S )      r7  typerz   r   rE   idpr]   s      r%   rF   z*SpmConverter.tokenizer.<locals>.<listcomp>g  R     
 
 
Av !&A+GD4G)GHr'   c                 :    g | ]\  }}}t          |d |          S F
normalizedspecialr   rE   r<  tokenrC  s       r%   rF   z*SpmConverter.tokenizer.<locals>.<listcomp>m  =       &Bw 5UGDDD  r'   c                     | d         S Nr   r6   r7   s    r%   r9   z(SpmConverter.tokenizer.<locals>.<lambda>o      QRSTQU r'   r:   )r  
model_typer8   r   r   r-  r  SpmExtractorr0   r  ri   	enumerater   	unk_piece	Exceptionr*  
add_tokensrK   )	r]   r  rL  rM   r   _rN   	bpe_vocabspm_added_tokenss	   `        r%   r   zSpmConverter.tokenizerC  s   '2
zz%((??! ;;u--"&";   II 1__))$*A*LMMUUVbccIAvQQ<9P9PQQQI!#0:!"&";   	 	II o  
 
 
 
"5<00
 
 

 	 *01A~~*V*V*V  	
 	
 	
 r'   c                 
   |j         j        }t          j        dd          t          j        t          d          d          g}|st          j        |          S t          j        t          j        |          g|z             S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr
   StripReplacer   r   Precompiledr]   r  rZ  _normalizerss       r%   r   zSpmConverter.normalizeru  s    $4I5555g66
 $ 	h'555')@AU)V)V(WZf(fgggr'   c                 X    t          || j                  }t          j        ||          S Nreplacementr1   )r2   r0   r   	Metaspacer]   rc  r(   r1   s       r%   r   zSpmConverter.pre_tokenizer  s,    ,-=t?VWW'KP^____r'   c                     d S rj   r6   r   s    r%   r   zSpmConverter.post_processor  s    tr'   c                 X    t          || j                  }t          j        ||          S ra  )r2   r0   r	   rd  re  s       r%   r   zSpmConverter.decoder  s+    ,-=t?VWW!k.YYYYr'   r)   c                 x   |                      | j                  }|                     | j                  }|||_        d}d}t          | j        d          r| j        j        }|                     ||          }|||_        |                     ||          |_        |                                 }|r||_        |S )NrX  Tr(   )	r   r  r   r   r0   r(   r   r   r   )r]   r   r   rc  r(   r   r   s          r%   r   zSpmConverter.converted  s    NN4:..	 __TZ00
!#-I 4*,>?? 	H#6G**;8HII$&3I# LL6FGG	,,.. 	6'5I$r'   )rk   rl   rm   r  rW   rM  r   r^   r8   r-  r   r   r   r   r   r   r   __classcell__)r#  s   @r%   r  r  #  s         )LN    *F F F) ) )0 0 0d	h 	h 	h` ` `  Z Z Z9        r'   r  c                        e Zd Zd Zd Zd ZdS )AlbertConverterc                 $    d |j         D             S )Nc                 t    g | ]5}t          |j                  r|j        |j        fn|j        |j        d z
  f6S d   r   rz   r'  r(  s     r%   rF   z)AlbertConverter.vocab.<locals>.<listcomp>  X     
 
 
 +=U[*I*IoU[%+&&PUP[]b]hkn]nOo
 
 
r'   r)  r+  s     r%   r8   zAlbertConverter.vocab  %    
 

 
 
 	
r'   c                 f   t          j        dd          t          j        dd          g}| j        j        sL|                    t          j                               |                    t          j                               | j        j        r&|                    t          j                               |j	        j
        }|r'|                    t          j        |                     |                    t          j        t          d          d                     t          j        |          S Nz``"z''rW  r   r
   r\  r0   keep_accentsrJ   NFKDStripAccentsr   	LowercaserY  rZ  r]  r   r   r]   r  list_normalizersrZ  s       r%   r   zAlbertConverter.normalizer     c**c**
 &3 	@##K$4$6$6777##K$<$>$>???"0 	=##K$9$;$;<<<$4I 	S##K$;<P$Q$QRRR 3E'NNC H HIII#$4555r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S Nr  r  r  r  r   r   r   r0   r   r   s    r%   r   zAlbertConverter.post_processor  Y    ,)4$1GGPPQ$1GGPPQ
 
 
 	
r'   Nrk   rl   rm   r8   r   r   r6   r'   r%   rk  rk    A        
 
 
6 6 6&
 
 
 
 
r'   rk  c                       e Zd Zd Zd ZdS )BarthezConverterc                 
    d}|S Nr7  r6   r]   r  r-  s      r%   r-  zBarthezConverter.unk_id      r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   r  r   s    r%   r   zBarthezConverter.post_processor  Y    , +/EEeLLM0FFvNNO
 
 
 	
r'   N)rk   rl   rm   r-  r   r6   r'   r%   r  r    s2          
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )CamembertConverterc                 R    g d}|d |j         dd          D             z  }|dgz  }|S )N))z
<s>NOTUSED        <pad>r  )z</s>NOTUSEDr  z<unk>r  )z<unk>NOTUSEDic                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z,CamembertConverter.vocab.<locals>.<listcomp>  !    KKK5;,KKKr'   r   z<mask>r  r)  r]   r  r8   s      r%   r8   zCamembertConverter.vocab  sK    
 
 
 	KK%,qrr:JKKKK/""r'   c                     dS r  r6   r+  s     r%   r-  zCamembertConverter.unk_id  s    qr'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S r  r  r   s    r%   r   z!CamembertConverter.post_processor  r  r'   Nrk   rl   rm   r8   r-  r   r6   r'   r%   r  r    sA            
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )DebertaV2Converterc                    g }| j         j        r(|                    t          j        d                     t          || j                   }|                    t          j        ||                     t          j        |          S )Nr   )r   rb  )r0   split_by_punctrJ   r   Punctuationr2   rd  r   )r]   rc  r(   list_pretokenizersr1   s        r%   r   z DebertaV2Converter.pre_tokenizer  s    "1 	W%%n&@*&U&U&UVVV,-=t?VWW!!.":{cq"r"r"rsss&'9:::r'   c                    g }| j         j        r&|                    t          j                               |                    t          j                               |j        j        }|r'|                    t          j        |                     |                    t          j	        t          d          d                     t          j        |          S )NrW  r   )r0   r   rJ   r
   rz  r[  rY  rZ  r]  r\  r   r   r{  s       r%   r   zDebertaV2Converter.normalizer  s    "0 	=##K$9$;$;<<< 1 3 3444$4I 	S##K$;<P$Q$QRRR 3E'NNC H HIII#$4555r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S r  r  r   s    r%   r   z!DebertaV2Converter.post_processor  r  r'   N)rk   rl   rm   r   r   r   r6   r'   r%   r  r    sA        ; ; ;6 6 6
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )MBartConverterc                 `    g d}|d |j         dd          D             z  }|g dz  }|dgz  }|S )Nr  r  r  r  r  r  c                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z(MBartConverter.vocab.<locals>.<listcomp>  r  r'   r7  )ar_ARr  cs_CZr  de_DEr  en_XXr  es_XXr  et_EEr  fi_FIr  fr_XXr  gu_INr  hi_INr  it_ITr  ja_XXr  kk_KZr  ko_KRr  lt_LTr  lv_LVr  my_MMr  ne_NPr  nl_XXr  ro_ROr  ru_RUr  si_LKr  tr_TRr  vi_VNr  zh_CNr  r  r)  r  s      r%   r8   zMBartConverter.vocab  sf    
 
 
 	KK%,qrr:JKKKK 
 
 
 	
6 	/""r'   c                     dS r  r6   r+  s     r%   r-  zMBartConverter.unk_id>      qr'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz$A </s> en_XXz$A $B </s> en_XXr  r  r   r  r   s    r%   r   zMBartConverter.post_processorA  Y    ,"#$1GGPPQ0FFvNNO
 
 
 	
r'   Nr  r6   r'   r%   r  r    sB        $ $ $L  
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )MBart50Converterc                 `    g d}|d |j         dd          D             z  }|g dz  }|dgz  }|S )Nr  c                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z*MBart50Converter.vocab.<locals>.<listcomp>T  r  r'   r7  )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )af_ZAr  )az_AZr  )bn_INr  )fa_IRr  )he_ILr  )hr_HRr  )id_IDr  )ka_GEr  )km_KHr  )mk_MKr  )ml_INr  )mn_MNr  )mr_INr  )pl_PLr  )ps_AFr  )pt_XXr  )sv_SEr  )sw_KEr  )ta_INr  )te_INr  )th_THr  )tl_XXr  )uk_UAr  )ur_PKr  )xh_ZAr  )gl_ESr  )sl_SIr  r  r)  r  s      r%   r8   zMBart50Converter.vocabM  sh    
 
 
 	KK%,qrr:JKKKK  R  R  R  	R/""r'   c                     dS r  r6   r+  s     r%   r-  zMBart50Converter.unk_idY  r  r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nzen_XX $A </s>zen_XX $A $B </s>r  r  r   r  r   s    r%   r   zMBart50Converter.post_processor\  r  r'   Nr  r6   r'   r%   r  r  L  sA        
 
 
  
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )NllbConverterc                 F    g d}|d |j         dd          D             z  }|S )Nr  c                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z'NllbConverter.vocab.<locals>.<listcomp>o  r  r'   r7  r)  r  s      r%   r8   zNllbConverter.vocabh  >    
 
 
 	KK%,qrr:JKKKKr'   c                     dS r  r6   r+  s     r%   r-  zNllbConverter.unk_idr  r  r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnr  r   r  r   s    r%   r   zNllbConverter.post_processoru  sY    ,%&T4JJ:VVW0FFvNNO
 
 
 	
r'   Nr  r6   r'   r%   r  r  g  sA            
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )SeamlessM4TConverterc                 F    g d}|d |j         dd          D             z  }|S )N)r  r  r  r  c                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>  r  r'   r7  r)  r  s      r%   r8   zSeamlessM4TConverter.vocab  r  r'   c                     | j         j        S rj   )r0   unk_token_idr+  s     r%   r-  zSeamlessM4TConverter.unk_id  s    &33r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__r  r   r  r   s    r%   r   z#SeamlessM4TConverter.post_processor  sY    ,$%D3II)TTU0FFvNNO
 
 
 	
r'   Nr  r6   r'   r%   r  r    sA          4 4 4
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )XLMRobertaConverterc                 R    g d}|d |j         dd          D             z  }|dgz  }|S )Nr  c                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z-XLMRobertaConverter.vocab.<locals>.<listcomp>  r  r'   r7  r  r)  r  s      r%   r8   zXLMRobertaConverter.vocab  sK    
 
 
 	KK%,qrr:JKKKK/""r'   c                 
    d}|S r  r6   r  s      r%   r-  zXLMRobertaConverter.unk_id  r  r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S r  r  r   s    r%   r   z"XLMRobertaConverter.post_processor  r  r'   Nr  r6   r'   r%   r  r    A        	 	 	  
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )XLNetConverterc                 $    d |j         D             S )Nc                 t    g | ]5}t          |j                  r|j        |j        fn|j        |j        d z
  f6S rn  rp  r(  s     r%   rF   z(XLNetConverter.vocab.<locals>.<listcomp>  rq  r'   r)  r+  s     r%   r8   zXLNetConverter.vocab  rr  r'   c                 f   t          j        dd          t          j        dd          g}| j        j        sL|                    t          j                               |                    t          j                               | j        j        r&|                    t          j                               |j	        j
        }|r'|                    t          j        |                     |                    t          j        t          d          d                     t          j        |          S rt  rv  r{  s       r%   r   zXLNetConverter.normalizer  r}  r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   r  r   s    r%   r   zXLNetConverter.post_processor  r  r'   Nr  r6   r'   r%   r  r    r  r'   r  c                       e Zd ZdS )ReformerConverterNrk   rl   rm   r6   r'   r%   r!  r!            Dr'   r!  c                       e Zd Zd Zd ZdS )RemBertConverterc                 >   t          j        dd          t          j        dd          t          j        t          d          d          g}| j        j        sL|                    t          j                               |                    t          j                               | j        j        r&|                    t          j	                               |j
        j        }|r'|                    t          j        |                     t          j        |          S rt  )r
   r\  r   r0   rw  rJ   rx  ry  r   rz  rY  rZ  r]  r   r{  s       r%   r   zRemBertConverter.normalizer  s    c**c**g44

 &3 	@##K$4$6$6777##K$<$>$>???"0 	=##K$9$;$;<<<$4I 	S##K$;<P$Q$QRRR#$4555r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S r  r  r   s    r%   r   zRemBertConverter.post_processor  r  r'   N)rk   rl   rm   r   r   r6   r'   r%   r%  r%    s2        6 6 6&
 
 
 
 
r'   r%  c                       e Zd ZdS )BertGenerationConverterNr"  r6   r'   r%   r)  r)    r#  r'   r)  c                   &    e Zd Zd Zd Zd Zd ZdS )PegasusConverterc                 p   | j         j        df| j         j        dfg}| j         j        || j         j        dfgz  }| j         j        ,| j         j        | j         j        k     r|| j         j        dfgz  }|d t          d| j         j                  D             z  }|d |j        dd          D             z  }|S )Nr  c                     g | ]
}d | ddfS )z<unk_>g      Yr6   rE   r2  s     r%   rF   z*PegasusConverter.vocab.<locals>.<listcomp>  s%    [[[Q<1<<<([[[r'   r=   c                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z*PegasusConverter.vocab.<locals>.<listcomp>  r  r'   )	r0   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetrI   r*  r  s      r%   r8   zPegasusConverter.vocab  s    $.4$.4

 "2>t.>DEEE #.:'58O8VVVt.93?@@E[[%4;R;Y2Z2Z[[[[KK%,qrr:JKKKKr'   c                 4    |j         j        | j        j        z   S rj   )r  r-  r0   r6  r+  s     r%   r-  zPegasusConverter.unk_id  s    !(4+B+IIIr'   c                     t          || j                  }t          j        t          j                    t          j        ||          g          S ra  )r2   r0   r   r   WhitespaceSplitrd  re  s       r%   r   zPegasusConverter.pre_tokenizer  sO    ,-=t?VWW&.00([Q_```
 
 	
r'   c                 p    | j         j        }|| j         j        fg}t          j        d|gdd|g|          S )N$A$Br   )r0   r2  eos_token_idr   r   )r]   eosr   s      r%   r   zPegasusConverter.post_processor!  sI    %/$)67
 ,T3KtTSVFWhvwwwwr'   N)rk   rl   rm   r8   r-  r   r   r6   r'   r%   r+  r+    sX          &J J J
 
 
x x x x xr'   r+  c                       e Zd Zd Zd ZdS )T5Converterc                     | j         j        }d |j        D             }|d t          |dz
  dd          D             z  }|S )Nc                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z%T5Converter.vocab.<locals>.<listcomp>,  s!    FFF%+u{+FFFr'   c                     g | ]
}d | ddfS )z
<extra_id_r.  r  r6   r/  s     r%   rF   z%T5Converter.vocab.<locals>.<listcomp>-  s)    UUUq$$$$c*UUUr'   r   r|   )r0   
_extra_idsr*  rI   )r]   r  num_extra_idsr8   s       r%   r8   zT5Converter.vocab*  sR    /:FFFFFUUE-!:KRQS4T4TUUUUr'   c                 n    t          j        ddgg dd| j                            d          fg          S Nr;  r  )r;  r  r<  r  r   r  r   s    r%   r   zT5Converter.post_processor0  J    ,&>---0FFvNNO
 
 
 	
r'   N)rk   rl   rm   r8   r   r6   r'   r%   r@  r@  )  s2          
 
 
 
 
r'   r@  c                       e Zd Zd ZdS )UdopConverterc                 n    t          j        ddgg dd| j                            d          fg          S rG  r  r   s    r%   r   zUdopConverter.post_processor;  rH  r'   Nrk   rl   rm   r   r6   r'   r%   rJ  rJ  :  s#        
 
 
 
 
r'   rJ  c                       e Zd ZdefdZdS )WhisperConverterr)   c           
      `   | j         j        }t          | j         j                                                  }t          t          ||d ddd                    }t          j        | j         j	                  |_
        t          j                    |_        | j         j        }| j                             |          }| j         j        }| j         j        }d                    d |D                       }t%          j        | d| d| d	| d
||fgt)          ||                    |_        |S )Nr   Fr   r   r   c                     g | ]}| d S )r   r6   rE   rF  s     r%   rF   z.WhisperConverter.converted.<locals>.<listcomp>\  s    #G#G#GUuLLL#G#G#Gr'   z $A:0 r   z $A:0 $B:1 r   r   )r0   r   rr   r   r   r   r   r   r   r(   r   r	   r   prefix_tokensconvert_ids_to_tokensr2  r=  joinr   r   zipr   )	r]   r8   rN   r   prefix_token_idsprefixesr>  r=  prefix_templates	            r%   r   zWhisperConverter.convertedF  sQ   '/d-7<<>>??*,#%  	
 	
	 #1":DLcLt"u"u"u	$.00	2@*@@AQRR%/.;((#G#Gh#G#G#GHH#-#@%44S444#77777l#X/00$
 $
 $
	  r'   Nr   r6   r'   r%   rN  rN  E  s/         9            r'   rN  c                       e Zd Zd ZdS )BigBirdConverterc           	          t          j        ddd| j                            d          fd| j                            d          fg          S r  r  r   s    r%   r   zBigBirdConverter.post_processorj  r  r'   NrL  r6   r'   r%   rZ  rZ  i  s#        
 
 
 
 
r'   rZ  c                       e Zd ZdefdZdS )CLIPConverterr)   c                 
   | j         j        }t          | j         j                                                  }| j         j        }t          t          ||d dddt          |                              }t          j
        t          j                    t          j        t          d          d          t          j                    g          |_        t!          j
        t!          j        t          d          dd	
          t!          j        d          g          |_        t)          j                    |_        t-          j        | j         j        | j         j        f| j         j        | j         j        fdd          |_        |S )Nr   r   Fr8   rN   r   r   r   r   r   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r   )r0   r   rr   r   r   r   r   r   ro   r
   r   r   r\  r   rz  r   r   r   r   r   r	   r   r   r   r2  r=  r   r   r   r   s        r%   r   zCLIPConverter.convertedv  ss   '/d-7<<>>??+5	*,#)i..  

 

	  +3_ 3E&MM3 G GI^I`I`a 
  
	 #1"9$Z[[&  
 (%@@@	#
 	#
	 %.00	 $.#?(2D4K4XY(2D4K4XY"	$
 $
 $
	  r'   Nr   r6   r'   r%   r]  r]  u  s/        '9 ' ' ' ' ' 'r'   r]  c                       e Zd ZdefdZdS )LayoutLMv2Converterr)   c           	         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	||f||	fg
          |_        t1          j        d          |_        |S )Nr   FTr   r   r   r   r   r   r   r   r   r   r   s
             r%   r   zLayoutLMv2Converter.converted  sy   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344.;.;#-#@**3***5555c555l#l#$
 $
 $
	  %.d;;;	r'   Nr   r6   r'   r%   rb  rb    r   r'   rb  c                       e Zd ZdefdZdS )BlenderbotConverterr)   c           
         | j         }|j        }t          |j                                                  }t          t          ||d ddd                    }t          j        |j	                  |_
        t          j                    |_        t          j        d|j         d|j        |j        fg          |_        |S )Nr   Fr   r   z$A:0 r   )r   r   )r0   r   rr   r   r   r   r   r   r   r(   r   r	   r   r   r   r2  r=  r   r   s        r%   r   zBlenderbotConverter.converted  s    $
bl''))***,#%  	
 	
	 #1":BL_"`"`"`	$.00	#-#@+2<+++r/$
 $
 $
	  r'   Nr   r6   r'   r%   re  re    r  r'   re  c                        e Zd Zd Zd Zd ZdS )XGLMConverterc                 T    g d}|d |j         dd          D             z  }|g dz  }|S )Nr  c                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z'XGLMConverter.vocab.<locals>.<listcomp>  r  r'   r7  ))z<madeupword0>r  )z<madeupword1>r  )z<madeupword2>r  )z<madeupword3>r  )z<madeupword4>r  )z<madeupword5>r  )z<madeupword6>r  r)  r  s      r%   r8   zXGLMConverter.vocab  s[    
 
 
 	KK%,qrr:JKKKK  z  z  z  	zr'   c                 
    d}|S r  r6   r  s      r%   r-  zXGLMConverter.unk_id  r  r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz</s> $Az</s> $A </s> </s> $Br  r  r   r  r   s    r%   r   zXGLMConverter.post_processor  sY    ,'/EEeLLM0FFvNNO
 
 
 	
r'   Nr  r6   r'   r%   rh  rh    r  r'   rh  c                   >    e Zd ZdZeZddhZ	 d Zd Zd Z	d Z
d Zd	S )
GemmaConverterTz<start_of_turn>z<end_of_turn>c                 ,    t          j        dd          S Nr   rX  )r
   r\  r+  s     r%   r   zGemmaConverter.normalizer  s    "3...r'   c                    | j         j        df| j         j        df| j         j        dfg}|d |j        dd          D             z  }t          d |D                       s.t          d t          |          D             d           }|d||<   |S )Nr  c                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z(GemmaConverter.vocab.<locals>.<listcomp>  r  r'   r7  c              3   .   K   | ]}|d          dk    V  dS )r   rw   Nr6   )rE   r7   s     r%   	<genexpr>z'GemmaConverter.vocab.<locals>.<genexpr>  s*      //A1Q44<//////r'   c              3   8   K   | ]\  }}|d          dk    |V  dS )r   rx   Nr6   )rE   r2  r7   s      r%   rt  z'GemmaConverter.vocab.<locals>.<genexpr>  s4      "V"VAQqTXEUEU1EUEUEUEU"V"Vr'   )rw   r  )r0   r1  r2  r   r*  anynextrN  )r]   r  r8   override_indexs       r%   r8   zGemmaConverter.vocab  s    $.4$.4$.4

 	KK%,qrr:JKKKK /////// 	4!"V"V51A1A"V"V"VX\]]N)(3n%r'   c                 ,    t          j        dd          S )Nr   merged_with_previous)r   r   r]   rc  r(   s      r%   r   zGemmaConverter.pre_tokenizer"  s    #C)?@@@r'   c                 
    d}|S r  r6   r  s      r%   r-  zGemmaConverter.unk_id%  r  r'   c                     t          j        t          j        dd          t          j                    t          j                    g          S )NrX  r   )r	   r   r\  ByteFallbackFuser{  s      r%   r   zGemmaConverter.decoder)  sA      ,,%''
 
 	
r'   N)rk   rl   rm   r  rt   rM  r   r   r8   r   r-  r   r6   r'   r%   rn  rn    s|        .L'9N/ / /   A A A  
 
 
 
 
r'   rn  c                   6    e Zd ZdZd Zd Zd Zd Zd Zd Z	dS )	LlamaConverterTc                     | j                             d          df| j                             d          df| j                             d          dfg}|d |j        dd          D             z  }|S )Nr   r  r   r=   c                 *    g | ]}|j         |j        fS r6   r&  r(  s     r%   rF   z(LlamaConverter.vocab.<locals>.<listcomp><  r  r'   r7  )r0   rS  r*  r  s      r%   r8   zLlamaConverter.vocab6  s    $::1==sC$::1==sC$::1==sC

 	KK%,qrr:JKKKKr'   c                 
    d}|S rI  r6   r  s      r%   r-  zLlamaConverter.unk_id?  r  r'   c                     t          j        dd          t          j                    t          j                    g}|r|t          j        dd          gz  }t          j        |          S NrX  r   r   )contentrV  r	   r\  r~  r  r[  r   r]   rc  r(   sequences       r%   r   zLlamaConverter.decoderC  e    UC((!##MOO

  	>!<<<==H ***r'   c                     t          | j        dd          r_g }t          | j        dd          r|t          j        d          gz  }|t          j        dd          gz  }t          j        |          S d S )Nr,   Tr(   rX  )prependr   )patternr  )r/   r0   r
   Prependr\  r   )r]   r  r  s      r%   r   zLlamaConverter.normalizerM  s    4*Hd;; 	2Ht.0BDII A[0???@@,S%HHHIIH'111tr'   c                     t          | j        dd          s,t          || j                  }t          j        ||d          S d S )Nr,   TFrc  r1   split)r/   r0   r2   r   rd  re  s       r%   r   zLlamaConverter.pre_tokenizerV  sL    t.$?? 	q01A4CZ[[N!+Tbjopppptr'   c                     d S rj   r6   r   s    r%   r   zLlamaConverter.post_processor\  s    tr'   N)
rk   rl   rm   r  r8   r-  r   r   r   r   r6   r'   r%   r  r  3  st            + + +        r'   r  c                       e Zd ZdefdZdS )MarkupLMConverterr)   c                 (   | j         }|j        }t          |j                                                  }t          t          ||d ddd| j         j                            }t          j	        |j
                  |_        t          j	                    |_        t          | j         j                  }t          | j         j                  }| j         j        }| j         j        }t'          j        | d| | d| d| ||f||fg          |_        |S )Nr   Fr_  r   z $A z $B r   )r0   r   rr   r   r   r   r   r   r   r   r(   r   r	   r   ro   r   r   r   r   r   r   r   )	r]   r   r8   rN   r   r   r   r   r   s	            r%   r   zMarkupLMConverter.convertedb  s1   $
bl''))***,#%1;  

 

	 #1":BL_"`"`"`	$.00	$)344$)344.;.;#-#@$$s$$++S++c++l#l#$
 $
 $
	  r'   Nr   r6   r'   r%   r  r  a  s/        "9 " " " " " "r'   r  c                   ,    e Zd ZdZddZd Zd Zd ZdS )MoshiConverterTNc                 L   t          | d           t                              | |           t                      }|                                }t          |d          5 }|                    |                                           d d d            n# 1 swxY w Y   || _        d S Nr   r  	r   r   r^   r&   r  r  r  r  r  )r]   r  model_max_lengthkwargsr   r!  r"  s          r%   r^   zMoshiConverter.__init__  s    $
+++4,,, $%%	  ""*d## 	(qaffhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(


   (BBBc                     |j         j        }t          j        dd          g}|st          j        |          S t          j        t          j        |          g|z             S rp  )rY  rZ  r
   r\  r   r]  r^  s       r%   r   zMoshiConverter.normalizer  sg    $4IU++
 $ 	h'555')@AU)V)V(WZf(fgggr'   c                     t          j        dd          t          j                    t          j                    g}|r|t          j        dd          gz  }t          j        |          S r  r  r  s       r%   r   zMoshiConverter.decoder  r  r'   c                 4    d}t          j        ||d          S )Nr-   Fr  )r   rd  re  s       r%   r   zMoshiConverter.pre_tokenizer  s!     'KP^fkllllr'   rj   )rk   rl   rm   r  r^   r   r   r   r6   r'   r%   r  r    sc           h h h+ + +m m m m mr'   r  c                   D    e Zd ZdZddZd Zd Zd Zd Zd Z	d	 Z
d
 ZdS )HeliumConverterTNc                 L   t          | d           t                              | |           t                      }|                                }t          |d          5 }|                    |                                           d d d            n# 1 swxY w Y   || _        d S r  r  )r]   r  r  r   r!  r"  s         r%   r^   zHeliumConverter.__init__  s    $
+++4,,,#%%	  ""*d## 	(qaffhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(


r  c                                           |          }t          t          |                     |           j                            } fdt          |j                  D             }|                    d t          |d           D                        |                    t          ddd          g           |
                    d	d
           |S )Nr/  c                 j    g | ]/\  }}|j         d v ||j        |j         dk    p|j        j        v f0S r6  r9  r;  s      r%   rF   z-HeliumConverter.tokenizer.<locals>.<listcomp>  r>  r'   c                 <    g | ]\  }}}t          |d |d          S )FT)rB  rC  single_wordrD  rE  s       r%   rF   z-HeliumConverter.tokenizer.<locals>.<listcomp>  s@       &Bw 5UGQUVVV  r'   c                     | d         S rI  r6   rJ  s    r%   r9   z+HeliumConverter.tokenizer.<locals>.<lambda>  rK  r'   r:   
FrA  r  r7  )r1  pad_id)r8   r   r   r-  r  rN  r*  rQ  rK   r   enable_padding)r]   r  rM   r   rT  s   `    r%   r   zHeliumConverter.tokenizer  s   zz%(({{5))"7  
 
	
 
 
 
"5<00
 
 

 	 *01A~~*V*V*V  	
 	
 	
 	j%OOOPQQQ  71 ===r'   c                 t    g }|j         D ]-}|j        dk    r|d|j        fgz  }||j        |j        fgz  }.|S )Nz<0x0A>r  )r*  rz   r'  )r]   r  r8   rz   s       r%   r8   zHeliumConverter.vocab  sX    \ 	6 	6E{h&&4-..5;455r'   c                 
    d}|S rI  r6   r  s      r%   r-  zHeliumConverter.unk_id  r  r'   c                     t          j        dd          t          j                    t          j                    g}|t          j        dd          gz  }t          j        |          S r  r  r  s       r%   r   zHeliumConverter.decoder  s]    UC((!##MOO

 	X^Ca88899 ***r'   c                 x    t          j        t          j        d          t          j        dd          g          S rp  )r
   r   r  r\  r+  s     r%   r   zHeliumConverter.normalizer  s2    #[%8%=%={?RSWY^?_?_$`aaar'   c                 R    t          j        t          j        dd          g          S )Nr  
contiguous)r   r   r   r{  s      r%   r   zHeliumConverter.pre_tokenizer  s#    &(<T<(P(P'QRRRr'   c                 :    t          j        ddgg ddg          S )Nr  r;  )r  r;  r  r<  )r  r   r   )r   r   r   s    r%   r   zHeliumConverter.post_processor  s@    ,   
 
 
 	
r'   rj   )rk   rl   rm   r  r^   r   r8   r-  r   r   r   r   r6   r'   r%   r  r    s        
 
 
 
  8    + + +b b bS S S
 
 
 
 
r'   r  c                        e Zd ZdZddZd ZdS )ParakeetConverterTNc                 Z   || _         t          | d           t                              | |           t	                      }|                                }t          |d          5 }|                    |                                           d d d            n# 1 swxY w Y   || _	        d S r  )
r  r   r   r^   r&   r  r  r  r  r  )r]   r  r  r   r!  r"  s         r%   r^   zParakeetConverter.__init__  s    $$
+++4,,,#%%	  ""*d## 	(qaffhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(


s   %(BB Bc           
                                |          }                      j                                      |          \  }}d t	          |          D             }t          t          |||j        j        d j	        d                     } fdt	          |j
                  D             }|                    d t          |d           D                        |S )Nc                      i | ]\  }\  }}||S r6   r6   r1  s       r%   re   z/ParakeetConverter.tokenizer.<locals>.<dictcomp>  s#    MMM!1MT5T1MMMr'   Tr4  c                 j    g | ]/\  }}|j         d v ||j        |j         dk    p|j        j        v f0S r6  r9  r;  s      r%   rF   z/ParakeetConverter.tokenizer.<locals>.<listcomp>*  r>  r'   c                 :    g | ]\  }}}t          |d |          S r@  rD  rE  s       r%   rF   z/ParakeetConverter.tokenizer.<locals>.<listcomp>0  rG  r'   c                     | d         S rI  r6   rJ  s    r%   r9   z-ParakeetConverter.tokenizer.<locals>.<lambda>2  rK  r'   r:   )r8   rM  r  ri   rN  r   r   r  rO  r  r*  rQ  rK   )r]   r  rM   rR  rN   rS  r   rT  s   `       r%   r   zParakeetConverter.tokenizer  s   zz%((%%do66>>|LL	6MMY|5L5LMMM	,6"7  	
 	
	
 
 
 
"5<00
 
 

 	 *01A~~*V*V*V  	
 	
 	
 r'   rj   )rk   rl   rm   r  r^   r   r6   r'   r%   r  r  	  s=               r'   r  c            	      \   t          t          t          d          t          d          dz                       t          t          t          d          t          d          dz                       z   t          t          t          d          t          d          dz                       z   } | dd         }d	}t          d
          D ]8}|| vr2|                     |           |                    d
|z              |dz  }9d |D             }t	          t          | |                    S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 ,    g | ]}t          |          S r6   )chr)rE   ns     r%   rF   z$bytes_to_unicode.<locals>.<listcomp>N  s    			Q#a&&			r'   )rr   rI   ordrJ   rG   rU  )bscsr  bs       r%   bytes_to_unicoder  :  s    	U3s88SXX\**++d5TCIIPQM3R3R.S.SSVZ[`adeiajajloptluluxyly[z[zV{V{{  
AAAB	A4[[  B;;IIaLLLIIdQhFA		"			BBr'   c                   @    e Zd ZdZ	 	 	 	 ddZdefdZd Zd	efd
Z	dS )TikTokenConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                     || _         || _        || _        t          |t                    r|                                n|| _        d S rj   r  r  r(   
isinstancerG   r   additional_special_tokensr]   r  r  r(   r  r  s         r%   r^   zTikTokenConverter.__init__W  R     % 0 3T::+%**,,,* 	&&&r'   tiktoken_urlc                 f   	 ddl m} n# t          $ r t          d          w xY w ||          t	                      fdg }i }                                D ]\  }}|| |          <   t          |          dk    r'g }t          dt          |                    D ]=}|d |         ||d          }
}	|	v r#|
v r|	|
z   v r|                    |	|
|f           >t          |fdd          }|
                    |           t          |d	 d          }fd
|D             }||fS )Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c                 l    d                     fd|                     d          D                       S )Nr   c                 :    g | ]}t          |                   S r6   r  rE   charbyte_encoders     r%   rF   zdTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>t  $    TTTLT3TTTr'   latin-1rT  decoder  r  s    r%   token_bytes_to_stringzPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_strings  s6    77TTTT@S@STTTUUUr'   r   c                 <    | d                  | d                  fS r5   r6   r7   r   s    r%   r9   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s    1Q4)AaD/0R r'   FrA   c                     | d         S Nr=   r6   r?   s    r%   r9   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>  
    A r'   c                 T    g | ]$} |d                     |d                   f%S rD   r6   rE   r@   r  s     r%   rF   zETikTokenConverter.extract_vocab_merges_from_model.<locals>.<listcomp>  ?    cccUX((Q002G2GA2O2OPcccr'   )tiktoken.loadr  rP  
ValueErrorr  rH   r>   rI   rJ   rK   rL   )r]   r  r  rN   r8   rF  rankrQ   rR   rS   rT   r   r  r  s              @@@r%   extract_vocab_merges_from_modelz1TikTokenConverter.extract_vocab_merges_from_modelh  s   	7777777 	 	 	k  	
 &%l33	'))	V 	V 	V 	V 	V $??,, 
	! 
	!KE426E''../5zzQEq#e**-- ; ;#(%=%-i''Gy,@,@gPWFW\eEeEeLL'7D!9:::5&R&R&R&R\abbbEMM%    $6$6FFFcccc\bcccf}s    &c                     |                      | j                  \  }}t          t          ||d                    }t	          |j        d          rd|j        _        |S NF)r   ignore_mergesTr  r  r   r   r   rX   r  r]   rM   rN   r   s       r%   r   zTikTokenConverter.tokenizer  \    #CCDOTTfc,GGGHH	9?O44 	1,0IO)r'   r)   c                    |                                  }t          j        t          j        t	          | j                  dd          t          j        | j        d          g          |_        t          j                    |_
        |                    d | j        D                        t          j        d          |_        |S )Nr   Fr   r   c                 2    g | ]}t          |d d          S )FTrA  rD  rQ  s     r%   rF   z/TikTokenConverter.converted.<locals>.<listcomp>  s'    kkk5Z%>>>kkkr'   r   )r   r   r   r   r   r  r   r(   r   r	   r   r   r  r   r   r]   r   s     r%   r   zTikTokenConverter.converted  s    NN$$	"0"9$U4<%8%8:V[\\\($:O[`aaa#
 #
	 %.00	$$kkDLjkkk	
 	
 	
 $.#7U#K#K#K	 r'   Nr  FN)
rk   rl   rm   rn   r^   ro   r  r   r   r   r6   r'   r%   r  r  R  s           K"&
 
 
 
"C    >  9      r'   r  c                   <    e Zd Z	 	 	 	 d
dZdefdZd Zdefd	ZdS )MistralConverterNr  Fc                     || _         || _        || _        t          |t                    r|                                n|| _        d S rj   r  r  s         r%   r^   zMistralConverter.__init__  r  r'   r  c                    dd l dd l}t          | j        dd          5 }|                    |          }d d d            n# 1 swxY w Y   |d         d         | _        d |d         D             | _        |d	         t                      t          fd
            g }i }t          | j                  D ]\  }}|||j
        <   fdD             t                    }	t          t          d                    D ]\  }
}|
| |          <   t          |          dk    r'g }t          dt          |                    D ]=}|d |         ||d          }}||	v r#||	v r||z   |	v r|                    |||
f           >t!          |fdd          }|                    |           t!          |d d          }fd|D             }||fS )Nr   rzutf-8)encodingconfigr  c                 H    g | ]}t          |d          |d                    S )	token_str
is_control)rC  rD  )rE   ks     r%   rF   zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>  s:     *
 *
 *
DEJq~q???*
 *
 *
r'   r   r8   c                 l    d                     fd|                     d          D                       S )Nr   c                 :    g | ]}t          |                   S r6   r  r  s     r%   rF   zcMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>  r  r'   r  r  r  s    r%   r  zOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string  s6    77TTTT@S@STTTUUUr'   c                 F    g | ]}                     |d                    S )token_bytes)	b64decode)rE   r  base64s     r%   rF   zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>  s,    KKKAV%%a&677KKKr'   z(Converting tekken.json to tokenizer.json)descr   c                 p                         | d                                        | d                   fS r5   )rR   r  s    r%   r9   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s/    11F1F	XYZ[X\H]H]0^ r'   FrA   c                     | d         S r  r6   r?   s    r%   r9   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>  r  r'   c                 T    g | ]$} |d                     |d                   f%S rD   r6   r  s     r%   rF   zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>  r  r'   )r
  jsonr  r  loadr  r  r  r   rN  r  setr   r>   rI   rJ   rK   rL   )r]   r  r  r"  untypedrN   r8   idxrF  rank_setr  rQ   rR   rS   rT   r
  r   r  r  s                  @@@@r%   r  z0MistralConverter.extract_vocab_merges_from_model  s   $/3999 	#QiillG	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	#x(3*
 *
IPQaIb*
 *
 *
& G$	'))		V 	V 	V 	V 
	V #D$BCC 	' 	'JC#&E%-  KKKKKKK	y>>$T):d%e%e%eff 
	! 
	!KD%26E''../5zzQEq#e**-- ; ;#(%=%-h&&7h+>+>GgDUZbCbCbLL'7D!9:::5&^&^&^&^hmnnnEMM%    $6$6FFFcccc\bcccf}s   AA
A
c                     |                      | j                  \  }}t          t          ||d                    }t	          |j        d          rd|j        _        |S r  r  r  s       r%   r   zMistralConverter.tokenizer  r  r'   r)   c                 |   |                                  }t          j        t          j        t	          | j                  dd          t          j        | j        d          g          |_        t          j                    |_
        |                    | j                   t          j        d          |_        |S )Nr   Fr   r   r   )r   r   r   r   r   r  r   r(   r   r	   r   rQ  r  r   r   r  s     r%   r   zMistralConverter.converted  s    NN$$	"0"9$U4<%8%8:V[\\\($:O[`aaa#
 #
	 %.00	T;<<<#-#7U#K#K#K	 r'   r  )	rk   rl   rm   r^   ro   r  r   r   r   r6   r'   r%   r  r    s          K"&
 
 
 
"$C $ $ $ $L  9      r'   r  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3TokenizerFc                 ^   | j         j        }|t          v r,|s*t          |         } ||                                           S | j                            d          rG| | _        t                              d           t          | j                                                  S 	 t                              d           t          | j        | j                                                  S # t          $ r7 t          dt          t                                                               w xY w)a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    ztekken.jsonz#Converting from Mistral tekken.jsonzConverting from Tiktoken)r  r  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r#  rk   SLOW_TO_FAST_CONVERTERSr   r  endswithr0   loggerinfor  r  r  rP  r  rr   r   )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs       r%   convert_slow_tokenizerrY  -  s;     1:C666}612FG455??AAA		)	2	2=	A	A 3H09::: 5 @AAKKMMM	KK2333$0;*?*Y   ikk  	 	 	e>BCZC_C_CaCa>b>be e  	s   $AC+ +AD,)r   )F)Urn   r  	functoolsr   typingr   	packagingr   
tokenizersr   r   r   r	   r
   r   r   tokenizers.modelsr   r   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerrk   rS  r&   boolro   r2   rU   rW   rt   r   r   r   r   r   r   r   r   r   r   r   r  r
  r  rk  r  r  r  r  r  r  r  r  r  r!  r%  r)  r+  r@  rJ  rN  rZ  r]  rb  re  rh  rn  r  r  r  r  r  r  r  r  rQ  rY  r6   r'   r%   <module>rc     s0                       f f f f f f f f f f f f f f f f f f 5 5 5 5 5 5 5 5 5 5       ` ` ` ` ` ` ` ` ` ` ` ` 5 5 5 5 5 5 
	H	%	%G G G G"$ s      &       2    "8   "Ic Id I I I I$ $ $ $ $ $ $ $$ $ $ $ $I $ $ $N/ / / / /	 / / /d$ $ $ $ $i $ $ $N$ $ $ $ $Y $ $ $N       6% % % % %I % % %P    y   >+ + + + +Y + + +\    y   :$ $ $ $ $	 $ $ $N    y   >~ ~ ~ ~ ~9 ~ ~ ~B"
 "
 "
 "
 "
l "
 "
 "
J
 
 
 
 
| 
 
 
 
 
 
 
 
 
 
 
:
 
 
 
 
 
 
 
B2
 2
 2
 2
 2
\ 2
 2
 2
j
 
 
 
 
| 
 
 
6
 
 
 
 
L 
 
 
2
 
 
 
 
< 
 
 
2
 
 
 
 
, 
 
 
6"
 "
 "
 "
 "
\ "
 "
 "
J	 	 	 	 	 	 	 	
 
 
 
 
| 
 
 
@	 	 	 	 	l 	 	 	%x %x %x %x %x| %x %x %xP
 
 
 
 
, 
 
 
"
 
 
 
 
L 
 
 
! ! ! ! !y ! ! !H	
 	
 	
 	
 	
| 	
 	
 	
( ( ( ( (I ( ( (V$ $ $ $ $) $ $ $N    )   :
 
 
 
 
L 
 
 
61
 1
 1
 1
 1
\ 1
 1
 1
h+ + + + +\ + + +\# # # # #	 # # #L&m &m &m &m &m\ &m &m &mRV
 V
 V
 V
 V
l V
 V
 V
r- - - - - - - -b  0L L L L L L L L^L L L L L L L L^::%: (: ]	:
 (: .: ,: ]: : : (: ,: =: -: "=:  !-!:" #: :$ _%:& ':( ]):* (+:, -:. =/:0 +1:2 -3:4 +5:6 $7:8 }9:: *;:< n=:> (?:@ nA:B =C:D $E: : :F ]G:H ,I:J (K:L nM:N mO:P *Q:R (S:T -U:V (W:X *Y:Z 0[:\ M]:^ ;_:` ]a:b (c:d .e:f ng: :h +"$($#s: : : z$ $) $ $ $ $ $ $r'   