
     `ib                     t    d Z ddlmZ ddlmZmZ ddlmZ  ej        e	          Z
 G d de          ZdgZdS )	z!Tokenization class for Perceiver.    )Optional   )
AddedTokenPreTrainedTokenizer)loggingc            
       d    e Zd ZdZddgZ	 	 	 	 	 	 	 d	 d fdZdeeef         fdZ	e
d             Z	 d dee         deee                  dedee         f fdZ	 d!dee         deee                  dee         fdZdedee         fdZd Zd Zd Zd!dedee         dee         fdZ xZS )"PerceiverTokenizeraS  
    Construct a Perceiver tokenizer. The Perceiver simply uses raw bytes utf-8 encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        bos_token (`str`, *optional*, defaults to `"[BOS]"`):
            The BOS token (reserved in the vocab, but not actually used).
        eos_token (`str`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token (reserved in the vocab, but not actually used).

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The MASK token, useful for masked language modeling.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The CLS token (reserved in the vocab, but not actually used).
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from two sequences.

    	input_idsattention_mask[PAD][BOS][EOS][MASK][CLS][SEP]   returnNc                    t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}d| _        ||||||d| _        t          | j                  | _         t                      j        d|||||||d| d S )NF)lstriprstrip   )r         r         )	pad_token	bos_token	eos_token
mask_token	cls_token	sep_tokenmodel_max_length )	
isinstancestrr   _utf_vocab_size_added_tokens_decoderlen_num_special_tokenssuper__init__)
selfr   r   r   r   r    r!   r"   kwargs	__class__s
            /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/perceiver/tokenization_perceiver.pyr+   zPerceiverTokenizer.__init__;   s    JTT]_bIcIcrJyuEEEEir	IST]_bIcIcrJyuEEEEir	IST]_bIcIcrJyuEEEEir	KUV`beKfKfvZ
5GGGGlv
IST]_bIcIcrJyuEEEEir	IST]_bIcIcrJyuEEEEir	# 6
 6
" $'t'A#B#B  		
!-		
 		
 		
 		
 		
 		
 		
    c                     i }t          | j                  D ]}t          |          }|| j        z   ||<   |                    | j                   |S N)ranger&   chrr)   updateadded_tokens_encoder)r,   vocabitokens       r/   	get_vocabzPerceiverTokenizer.get_vocabd   sY    t+,, 	8 	8AFFEt77E%LLT.///r0   c                     | j         S r2   )r&   )r,   s    r/   
vocab_sizezPerceiverTokenizer.vocab_sizel   s    ##r0   Ftoken_ids_0token_ids_1already_has_special_tokensc                     |r$t                                          ||d          S |dgdgt          |          z  z   dgz   S dgdgt          |          z  z   dgz   dgt          |          z  z   dgz   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r=   r>   r?   Nr   r   )r*   get_special_tokens_maskr(   )r,   r=   r>   r?   r.   s       r/   rA   z*PerceiverTokenizer.get_special_tokens_maskp   s    $ & 	7722'[]a 3   
 3!s;////1#55sqcC,,,-3sS=M=M7MNRSQTTTr0   c                 n    || j         g|z   | j        gz   S | j         g|z   | j        gz   |z   | j        gz   S )af  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks. A sequence has the
        following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )cls_token_idsep_token_id)r,   r=   r>   s      r/    build_inputs_with_special_tokensz3PerceiverTokenizer.build_inputs_with_special_tokens   sS    & %&48I7JJJ%&48I7JJ[X\`\m[nnnr0   textc                 D    d |                     d          D             }|S )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsc                 ,    g | ]}t          |          S r#   )r4   ).0r8   s     r/   
<listcomp>z0PerceiverTokenizer._tokenize.<locals>.<listcomp>   s    777Q#a&&777r0   utf-8)encode)r,   rF   tokenss      r/   	_tokenizezPerceiverTokenizer._tokenize   s&    77$++g"6"6777r0   c                 j    t          |          dk    r| j        }nt          |          | j        z   }|S )z0Converts a token (str) in an id using the vocab.r   )r(   unk_token_idordr)   )r,   r9   token_ids      r/   _convert_token_to_idz'PerceiverTokenizer._convert_token_to_id   s3    u::??(HH5zzD$<<Hr0   c                 4    t          || j        z
            }|S )z=Converts an index (integer) in a token (str) using the vocab.)r4   r)   )r,   indexr9   s      r/   _convert_id_to_tokenz'PerceiverTokenizer._convert_id_to_token   s    ED4455r0   c                     d}|D ]P}|| j         v r#t          |                              d          }nt          t	          |          g          }||z  }Q|                    dd          }|S )z:Converts a sequence of tokens (string) in a single string.r0   rK   replace)errors)r6   r%   rL   bytesrQ   decode)r,   rM   bstringr9   
tok_stringstrings         r/   convert_tokens_to_stringz+PerceiverTokenizer.convert_tokens_to_string   sz     	" 	"E111 ZZ..w77

"CJJ<00
z!GG	::r0   save_directoryfilename_prefixc                     dS )Nr#   r#   )r,   r`   ra   s      r/   save_vocabularyz"PerceiverTokenizer.save_vocabulary   s    rr0   )r   r   r   r   r   r   r   )r   N)NFr2   )__name__
__module____qualname____doc__model_input_namesr+   dictr%   intr:   propertyr<   listr   boolrA   rE   rN   rS   rV   r_   tuplerc   __classcell__)r.   s   @r/   r	   r	      s        < %&67 '
 
'
 '
 '
 '
 '
 '
R4S>     $ $ X$ sxU U9U3;DI3FUkoU	cU U U U U U: JNo o9o3;DI3Fo	co o o o0c d3i    
    
 
 
 c HSM ]bcf]g        r0   r	   N)rg   typingr   tokenization_utilsr   r   utilsr   
get_loggerrd   loggerr	   __all__r#   r0   r/   <module>rv      s    ( '       A A A A A A A A       
	H	%	%k k k k k, k k k\  
 r0   