
     `i*                         d dl Z d dlmZ d dlmZ d dlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ  e            rd	d
lmZ ndZ ej        e          ZdddZg dZ G d de          ZdgZdS )    N)copyfile)Optional)
processors   )
AddedTokenBatchEncoding)PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )MBartTokenizerzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)ar_ARcs_CZde_DEen_XXes_XXet_EEfi_FIfr_XXgu_INhi_INit_ITja_XXkk_KZko_KRlt_LTlv_LVmy_MMne_NPnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CNc                   "    e Zd ZU dZeZddgZeZg Z	e
e         ed<   g Ze
e         ed<   	 	 	 	 	 	 	 	 	 	 	 	 d& fd	Zedefd            Zej        deddfd            Z	 d'de
e         dee
e                  de
e         fdZ	 d'de
e         dee
e                  de
e         fdZdedee         dee         fdZ	 	 	 d(de
e         dedee
e                  dedef
 fdZd Zd Zd)d Zd!eddfd"Zd'd#ed$ee         dee         fd%Z xZS )*MBartTokenizerFastuO  
    Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizerFast

    >>> tokenizer = MBartTokenizerFast.from_pretrained(
    ...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
    ```	input_idsattention_maskprefix_tokenssuffix_tokensN<s></s><unk><pad><mask>c                     t          |	t                    rt          |	dd          n|	}	t                                          |!                    fd|D                         t                      j        d|||||||||	|
|d| | _         fdt          D              _	        |
|
nd _
                              j
                   _        | _                              j
                   d S )	NTF)lstriprstripc                     g | ]}|v|	S  r8   ).0t_additional_special_tokenss     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/mbart/tokenization_mbart_fast.py
<listcomp>z/MBartTokenizerFast.__init__.<locals>.<listcomp>a   s$    ]]]qB\9\9\9\9\9\    )r   r   	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_tokensrc_langtgt_langadditional_special_tokensc                 <    i | ]}|                     |          S r8   )convert_tokens_to_ids)r9   	lang_codeselfs     r<   
<dictcomp>z/MBartTokenizerFast.__init__.<locals>.<dictcomp>u   s6      
  
  
AJIt11)<< 
  
  
r>   r   r8   )
isinstancestrr   FAIRSEQ_LANGUAGE_CODEScopyextendsuper__init__r   lang_code_to_id	_src_langrJ   cur_lang_coderG   set_src_lang_special_tokens)rL   r   r   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   kwargsr;   	__class__s   `             @r<   rT   zMBartTokenizerFast.__init__I   sR   " KUU_adJeJeuZ
4FFFFku
%;%@%@%B%B"$0&--]]]]5]]]   	 	
!)!&@	
 	
 	
 	
 	
  % 
  
  
  
Nd 
  
  
 &.%9w!77GG ((88888r>   returnc                     | j         S N)rV   rL   s    r<   rF   zMBartTokenizerFast.src_lang~   s
    ~r>   new_src_langc                 H    || _         |                     | j                    d S r]   )rV   rX   )rL   r_   s     r<   rF   zMBartTokenizerFast.src_lang   s%    %((88888r>   token_ids_0token_ids_1c                 T    || j         |z   | j        z   S | j         |z   |z   | j        z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. The special tokens depend on calling set_lang.

        An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )r-   r.   )rL   ra   rb   s      r<    build_inputs_with_special_tokensz3MBartTokenizerFast.build_inputs_with_special_tokens   s;    0 %3d6HHH!K/+=@RRRr>   c                     | j         g}| j        g}|t          ||z   |z             dgz  S t          ||z   |z   |z   |z   |z             dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.

        Nr   )sep_token_idcls_token_idlen)rL   ra   rb   sepclss        r<   $create_token_type_ids_from_sequencesz7MBartTokenizerFast.create_token_type_ids_from_sequences   sm    $  ! !s[(3.//1#553$s*S0;>DEEKKr>   return_tensorsrF   rG   c                     ||t          d          || _         | |fd|d|}|                     |          }||d<   |S )zIUsed by translation pipeline, to prepare inputs for the generate functionNzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)add_special_tokensrl   forced_bos_token_id)
ValueErrorrF   rJ   )rL   
raw_inputsrl   rF   rG   extra_kwargsinputstgt_lang_ids           r<   _build_translation_inputsz,MBartTokenizerFast._build_translation_inputs   sg     x/`aaa jiT.ii\hii00::(3$%r>   r   r#   	src_texts	tgt_textsc                 V    || _         || _         t                      j        ||fi |S r]   )rF   rG   rS   prepare_seq2seq_batch)rL   rv   rF   rw   rG   rY   rZ   s         r<   ry   z(MBartTokenizerFast.prepare_seq2seq_batch   s4     ! ,uww,Y	LLVLLLr>   c                 6    |                      | j                  S r]   )rX   rF   r^   s    r<   _switch_to_input_modez(MBartTokenizerFast._switch_to_input_mode       //>>>r>   c                 6    |                      | j                  S r]   )set_tgt_lang_special_tokensrG   r^   s    r<   _switch_to_target_modez)MBartTokenizerFast._switch_to_target_mode   r|   r>   c                    |                      |          | _        g | _        | j        | j        g| _        |                     | j                  }|                     | j                  }t          j        |dgz   |z   |ddgz   |z   t          t          ||z   | j        | j        z                                 | j
        _        dS )z_Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].$A$Bsinglepairspecial_tokensNrJ   rW   r-   eos_token_idr.   convert_ids_to_tokensr   TemplateProcessinglistzip
_tokenizerpost_processor)rL   rF   prefix_tokens_strsuffix_tokens_strs       r<   rX   z.MBartTokenizerFast.set_src_lang_special_tokens   s    !77AA"/1CD 66t7IJJ 66t7IJJ)3)F$v-0AA"dD\14EE$58I$I4K]`d`rKr s stt*
 *
 *
&&&r>   langc                    |                      |          | _        g | _        | j        | j        g| _        |                     | j                  }|                     | j                  }t          j        |dgz   |z   |ddgz   |z   t          t          ||z   | j        | j        z                                 | j
        _        dS )zcReset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].r   r   r   Nr   )rL   r   r   r   s       r<   r~   z.MBartTokenizerFast.set_tgt_lang_special_tokens   s    !77=="/1CD 66t7IJJ 66t7IJJ)3)F$v-0AA"dD\14EE$58I$I4K]`d`rKr s stt*
 *
 *
&&&r>   save_directoryfilename_prefixc                    | j         st          d          t          j                            |          s t
                              d| d           d S t          j                            ||r|dz   ndt          d         z             }t          j        	                    | j
                  t          j        	                    |          k    rt          | j
        |           |fS )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory.- r   )can_save_slow_tokenizerrp   ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )rL   r   r   out_vocab_files       r<   save_vocabularyz"MBartTokenizerFast.save_vocabulary   s    + 	  
 w}}^,, 	LLU^UUUVVVFoM_s222QbcoQpp
 
 7??4?++rw~/N/NNNT_n555  r>   )NNr/   r0   r0   r/   r1   r2   r3   NNNr]   )r   Nr#   )r[   N) __name__
__module____qualname____doc__r   vocab_files_namesmodel_input_namesr   slow_tokenizer_classr-   r   int__annotations__r.   rT   propertyrO   rF   setterr   rd   rk   ru   r   ry   r{   r   rX   r~   tupler   __classcell__)rZ   s   @r<   r*   r*   *   s         . *$&67)!M49!!!!M49!!! "&39 39 39 39 39 39j #    X _9S 9T 9 9 9 _9
 JNS S9S3;DI3FS	cS S S S< JNL L9L3;DI3FL	cL L L L2
*-
9A#
RZ[^R_
 
 
 
  )-
M 
M9
M 
M DI&	
M
 
M 

M 
M 
M 
M 
M 
M? ? ?? ? ?
 
 
 

 
 
 
 
 
! !c !HSM !]bcf]g ! ! ! ! ! ! ! !r>   r*   )r   shutilr   typingr   
tokenizersr   tokenization_utilsr   r   tokenization_utils_fastr	   utilsr
   r   tokenization_mbartr   
get_loggerr   r   r   rP   r*   __all__r8   r>   r<   <module>r      s1    
			             ! ! ! ! ! ! ; ; ; ; ; ; ; ; > > > > > > 8 8 8 8 8 8 8 8  2222222N 
	H	%	% $=P`aa  {  {  { `! `! `! `! `!0 `! `! `!F  
 r>   