"""Tokenization classes for ESM."""

import os
from typing import Optional

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab_file(vocab_file):
    # The vocabulary is a plain-text file with one token per line.
    with open(vocab_file, "r") as f:
        lines = f.read().splitlines()
        return [l.strip() for l in lines]


class EsmTokenizer(PreTrainedTokenizer):
    """
    Constructs an ESM tokenizer.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        eos_token="<eos>",
        **kwargs,
    ):
        self.all_tokens = load_vocab_file(vocab_file)
        self._id_to_token = dict(enumerate(self.all_tokens))
        self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)}
        super().__init__(
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            **kwargs,
        )

        # Register every vocabulary entry as a no-split token so the tokenizer's
        # trie handles the splitting.
        self.unique_no_split_tokens = self.all_tokens
        self._update_trie(self.unique_no_split_tokens)

    def _convert_id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, self.unk_token)

    def _convert_token_to_id(self, token: str) -> int:
        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))

    def _tokenize(self, text, **kwargs):
        return text.split()

    def get_vocab(self):
        base_vocab = self._token_to_id.copy()
        base_vocab.update(self.added_tokens_encoder)
        return base_vocab

    def token_to_id(self, token: str) -> int:
        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))

    def id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, self.unk_token)

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        cls = [self.cls_token_id]
        sep = [self.eos_token_id]  # No sep token in ESM vocabulary
        if token_ids_1 is None:
            if self.eos_token_id is None:
                return cls + token_ids_0
            else:
                return cls + token_ids_0 + sep
        elif self.eos_token_id is None:
            raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!")
        return cls + token_ids_0 + sep + token_ids_1 + sep  # Multiple inputs always have an EOS token

    def get_special_tokens_mask(
        self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`list[int]`):
                List of ids of the first sequence.
            token_ids_1 (`list[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return [1 if token in self.all_special_ids else 0 for token in token_ids_0]
        mask = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            mask += [0] * len(token_ids_1) + [1]
        return mask

    def save_vocabulary(self, save_directory, filename_prefix):
        vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.txt")
        with open(vocab_file, "w") as f:
            f.write("\n".join(self.all_tokens))
        return (vocab_file,)

    @property
    def vocab_size(self) -> int:
        return len(self.all_tokens)


__all__ = ["EsmTokenizer"]
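
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream module). The
# "vocab.txt" path and the example protein sequence below are assumptions for
# illustration only; the file is expected to hold one token per line (the ESM
# residue alphabet plus special tokens such as <cls>, <pad>, <eos>, <unk>,
# <mask>).
#
#     tokenizer = EsmTokenizer(vocab_file="vocab.txt")
#     encoded = tokenizer("MKTAYIAKQR")   # dict with input_ids / attention_mask
#     tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
#
# Because __init__ registers every vocabulary entry as a no-split token,
# protein sequences are effectively segmented per residue by the tokenizer's
# trie rather than by the whitespace-based `_tokenize` fallback.
# ---------------------------------------------------------------------------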