
     `iA
                     8    d Z ddlmZmZmZ  G d d          ZdS )z Tokenization utils for RoFormer.    )NormalizedStringPreTokenizedStringnormalizersc                   B    e Zd Zd	dZdededee         fdZdefdZ	dS )
JiebaPreTokenizerreturnNc                     || _         t          j        dddd          | _        	 dd l}n# t          $ r t	          d          w xY w|| _        d S )NFT)
clean_texthandle_chinese_charsstrip_accents	lowercaser   zkYou need to install rjieba to use RoFormerTokenizer. See https://pypi.org/project/rjieba/ for installation.)vocabr   BertNormalizerrjiebaImportErrorjieba)selfr   r   s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/roformer/tokenization_utils.py__init__zJiebaPreTokenizer.__init__   s    
&5!%	
 
 
	MMMM 	 	 	I  	
 


s	   + Ainormalized_stringc                    g }| j                             t          |          d          D ]\  }}}|| j        v r|                    |||                    -| j                            |                                          }|D ]5}|r1|t          |          z   }|                    |||                    |}6|S )NF)hmm)	r   tokenizestrr   appendr   normalize_strsplitlen)r   r   r   splitstokenstartend
token_lists           r   jieba_splitzJiebaPreTokenizer.jieba_split&   s     "&!4!4S9J5K5KQV!4!W!W 		$ 		$E5#
""/c	:;;;;!-;;EBBHHJJ
' $ $E $#c%jj0&7c	&BCCC #	$      pretokc                 :    |                     | j                   d S )N)r   r%   )r   r'   s     r   pre_tokenizezJiebaPreTokenizer.pre_tokenizeA   s    T%&&&&&r&   )r   N)
__name__
__module____qualname__r   intr   listr%   r   r)    r&   r   r   r      st           "S 5E $O_J`    6'#5 ' ' ' ' ' 'r&   r   N)__doc__
tokenizersr   r   r   r   r/   r&   r   <module>r2      s]    ' & H H H H H H H H H H.' .' .' .' .' .' .' .' .' .'r&   