
     `i                     t    d Z ddlmZ ddlmZmZ ddlmZ  ej        e	          Z
 G d de          ZdgZdS )	zTokenization class for Dia.    )Optional   )
AddedTokenPreTrainedTokenizer)loggingc            	            e Zd ZdZddgZ	 	 	 	 ddee         dee         d	ee         d
ef fdZe	d             Z
d Zdedee         fdZd Zd Zdee         defdZddedee         dee         fdZ xZS )DiaTokenizera  
    Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        unk_token (`str`, *optional*, defaults to `"<pad>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequences when encoding. Sequences longer than this will be truncated.
        offset (`int`, *optional*, defaults to 0):
            The offset of the tokenizer.
    	input_idsattention_mask<pad>   r   	pad_token	unk_token
max_lengthoffsetc                 @   t          |t                    rt          |          n|}t          |t                    rt          |          n|}d| _        |t          d          t          d          d| _        || _         t                      j        d|||d| d S )N   z[S1]z[S2])r         )r   r   r    )
isinstancestrr   _utf_vocab_size_added_tokens_decoderr   super__init__)selfr   r   r   r   kwargs	__class__s         |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/dia/tokenization_dia.pyr   zDiaTokenizer.__init__/   s     .8	3-G-GVJy)))Y	-7	3-G-GVJy)))Y	#)2z&7I7IjY_N`N`%a%a" 	
!	
 	
 		
 	
 	
 	
 	
    c                     | j         S N)r   )r   s    r    
vocab_sizezDiaTokenizer.vocab_sizeE   s    ##r!   c                       fdt           j         j        z             D             }|                     j                   |S )Nc                 <    i | ]}                     |          |S r   )convert_ids_to_tokens).0ir   s     r    
<dictcomp>z*DiaTokenizer.get_vocab.<locals>.<dictcomp>J   s)    ```a++A..```r!   )ranger$   r   updateadded_tokens_encoder)r   vocabs   ` r    	get_vocabzDiaTokenizer.get_vocabI   sI    ````5SWS^A^;_;_```T.///r!   textreturnc                 D    d |                     d          D             }|S )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsc                 ,    g | ]}t          |          S r   )chr)r(   r)   s     r    
<listcomp>z*DiaTokenizer._tokenize.<locals>.<listcomp>P   s    777Q#a&&777r!   utf-8)encode)r   r0   tokenss      r    	_tokenizezDiaTokenizer._tokenizeN   s&    77$++g"6"6777r!   c                 `    t          |          dk    rd}nt          |          | j        z   }|S )z0Converts a token (str) in an id using the vocab.r   N)lenordr   )r   tokentoken_ids      r    _convert_token_to_idz!DiaTokenizer._convert_token_to_idS   s1     u::??HH5zzDK/Hr!   c                 4    t          || j        z
            }|S )z=Converts an index (integer) in a token (str) using the vocab.)r4   r   )r   indexr=   s      r    _convert_id_to_tokenz!DiaTokenizer._convert_id_to_token]   s    EDK'((r!   r8   c                 &   d}|D ]t}|| j         v r0| j         |         }t          |                              d          }n4|| j        v r|                    d          }n|                    d          }||z  }u|                    dd          }|S )z:Converts a sequence of tokens (string) in a single string.r!   r6   ignore)errors)added_tokens_decoderr   r7   r-   decode)r   r8   bstringr=   added_token_obj
tok_stringstrings          r    convert_tokens_to_stringz%DiaTokenizer.convert_tokens_to_stringb   s     	" 	"E111"&";E"B 1188AA

$333"\\'22

"\\'22
z!GG99r!   Nsave_directoryfilename_prefixc                     dS )Nr   r   )r   rM   rN   s      r    save_vocabularyzDiaTokenizer.save_vocabularyr   s    rr!   )r   r   r   r   r#   )__name__
__module____qualname____doc__model_input_namesr   r   intr   propertyr$   r/   listr9   r?   rB   rL   tuplerP   __classcell__)r   s   @r    r	   r	      sg        $ %&67 $+#*$(
 
C=
 C=
 SM	

 
 
 
 
 
 
, $ $ X$  
c d3i    
    
tCy S      c HSM ]bcf]g        r!   r	   N)rT   typingr   tokenization_utilsr   r   utilsr   
get_loggerrQ   loggerr	   __all__r   r!   r    <module>ra      s    " !       A A A A A A A A       
	H	%	%Y Y Y Y Y& Y Y Yx 
r!   