"""Tokenization classes for CPMAnt."""

import collections
import os
from typing import Optional

from transformers.utils import is_jieba_available, requires_backends


if is_jieba_available():
    import jieba

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab


class WordpieceTokenizer:
    def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, token):
        chars = list(token)
        if len(chars) > self.max_input_chars_per_word:
            return [self.unk_token]

        start = 0
        sub_tokens = []
        while start < len(chars):
            end = len(chars)
            cur_substr = None
            # Greedy longest-match-first: take the longest substring starting at `start` that is in the vocab.
            while start < end:
                substr = "".join(chars[start:end])
                if substr in self.vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                # No match of any length: emit the unknown token and advance by one character.
                sub_tokens.append(self.unk_token)
                start += 1
            else:
                sub_tokens.append(cur_substr)
                start = end

        return sub_tokens


class CpmAntTokenizer(PreTrainedTokenizer):
    """
    Construct a CPMAnt tokenizer, based on jieba word segmentation followed by greedy wordpiece matching.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
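
    Example (a minimal usage sketch; the vocabulary path and input text are illustrative, not part of this file):

    ```python
    >>> from transformers.models.cpmant.tokenization_cpmant import CpmAntTokenizer

    >>> tokenizer = CpmAntTokenizer("vocab.txt")  # hypothetical local CPM-Ant vocabulary file
    >>> tokens = tokenizer.tokenize("今天天气真好")  # jieba segmentation, then greedy wordpiece matching
    >>> ids = tokenizer.convert_tokens_to_ids(tokens)
    ```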
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    add_prefix_space = False

    def __init__(
        self,
        vocab_file,
        bod_token="<d>",
        eod_token="</d>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        line_token="</n>",
        space_token="</_>",
        padding_side="left",
        **kwargs,
    ):
        requires_backends(self, ["jieba"])
        self.bod_token = bod_token
        self.eod_token = eod_token
        self.encoder = load_vocab(vocab_file)
        # Space and newline are stored as "</_>" and "</n>" in the vocab file but used literally at runtime.
        self.encoder[" "] = self.encoder[space_token]
        self.encoder["\n"] = self.encoder[line_token]

        del self.encoder[space_token]
        del self.encoder[line_token]

        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

        super().__init__(
            bod_token=bod_token,
            eod_token=eod_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            line_token=line_token,
            space_token=space_token,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def bod_token_id(self):
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        return self.encoder[self.eod_token]

    @property
    def newline_id(self):
        return self.encoder["\n"]

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize a string."""
        output_tokens = []
        for x in jieba.cut(text, cut_all=False):
            output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
        return output_tokens

    def _decode(self, token_ids, **kwargs):
        """Decode ids into a string."""
        token_ids = [i for i in token_ids if i >= 0]
        token_ids = [
            x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
        ]
        return super()._decode(token_ids, **kwargs)

    def check(self, token):
        return token in self.encoder

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        index = 0
        # Restore the on-disk forms of the space and newline tokens before writing.
        if " " in self.encoder:
            self.encoder["</_>"] = self.encoder[" "]
            del self.encoder[" "]
        if "\n" in self.encoder:
            self.encoder["</n>"] = self.encoder["\n"]
            del self.encoder["\n"]
        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in self.encoder.items():
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.

        Args:
            token_ids_0 (`list[int]`): The first tokenized sequence to which special tokens will be added.
            token_ids_1 (`list[int]`, *optional*): The optional second tokenized sequence to which special tokens will be added.

        Returns:
            `list[int]`: The model input with special tokens.
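
        For example (illustrative ids only), `build_inputs_with_special_tokens([5, 6, 7])` returns
        `[self.bos_token_id, 5, 6, 7]`.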
        )r   )r%   r   r   s      r    build_inputs_with_special_tokensz0CpmAntTokenizer.build_inputs_with_special_tokens   s<      %&44!"[0D4E3FFTTr'   already_has_special_tokensc                     |r$t                                          ||d          S |/dgdgt          |          z  z   dgz   dgt          |          z  z   S dgdgt          |          z  z   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`): List of IDs.
            token_ids_1 (`list[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
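
        For example (illustrative only), a single three-token sequence yields `[1, 0, 0, 0]`: one leading
        special token followed by three sequence tokens.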
        T)r   r   r   Nr*   r   )rc   get_special_tokens_maskr,   )r%   r   r   r   re   s       r   r   z'CpmAntTokenizer.get_special_tokens_mask   s    " & 	7722'[]a 3    "31#K 0 001QC7A3[AQAQ;QRRsqcC,,,--r'   )	r>   r?   r@   rA   rB   r   rC   rD   rE   r"   )NF)r6   r7   r8   __doc__r   vocab_files_namesmodel_input_namesadd_prefix_spacer&   propertyrh   rk   rn   intrq   ru   r{   r   r   r+   strr   r   r   r   tupler   r   boolr   __classcell__)re   s   @r   r;   r;   O   si        0 *$&67
 (
 (
 (
 (
 (
 (
T , , X, , , X, " " X" !C ! ! ! X!? ? ?  4 4 4 4 4% % %tCy S    I I I7 7 7 c HSM ]bcf]g    8 JNU U9U3;DI3FU	cU U U U* sx. .9.3;DI3F.ko.	c. . . . . . . . . .r'   r;   )r   r   r   typingr   transformers.utilsr   r   rG   tokenization_utilsr   utilsr   
get_loggerr6   r   r   r   r   r;   __all__r9   r'   r   <module>r      s%   ' &     				       E E E E E E E E  MMM 5 5 5 5 5 5       
	H	%	%!;/          @~. ~. ~. ~. ~.) ~. ~. ~.B 
r'   