
     `i/                         d dl Z d dlmZmZ d dlZd dlmZ d dlm	Z	m
Z
mZmZmZ ddlmZ ddlmZ ddlmZ  ed	
           G d dej        j                              ZdgZdS )    N)OptionalUnion)BertTokenizer)FastBertTokenizerShrinkLongestTrimmercase_fold_utf8combine_segmentspad_model_inputs   )keras)requires   )tftensorflow_text)backendsc                   
    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 ddededee         d	ee         d
ee         dedededee         dededef fdZ	e
dd            Ze
deeej        f         fd            Zd Z	 	 	 	 	 	 	 ddZd Z xZS )TFBertTokenizera  
    This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
    `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
    from an existing standard tokenizer object.

    In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
    when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
    than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
    straight from `tf.string` inputs to outputs.

    Args:
        vocab_list (`list`):
            List containing the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        cls_token_id (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        sep_token_id (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token_id (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        padding (`str`, defaults to `"longest"`):
            The type of padding to use. Can be either `"longest"`, to pad only up to the longest sample in the batch,
            or `"max_length", to pad all inputs to the maximum length supported by the tokenizer.
        truncation (`bool`, *optional*, defaults to `True`):
            Whether to truncate the sequence to the maximum length.
        max_length (`int`, *optional*, defaults to `512`):
            The maximum length of the sequence, used for padding (if `padding` is "max_length") and/or truncation (if
            `truncation` is `True`).
        pad_to_multiple_of (`int`, *optional*, defaults to `None`):
            If set, the sequence will be padded to a multiple of this value.
        return_token_type_ids (`bool`, *optional*, defaults to `True`):
            Whether to return token_type_ids.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether to return the attention_mask.
        use_fast_bert_tokenizer (`bool`, *optional*, defaults to `True`):
            If True, will use the FastBertTokenizer class from Tensorflow Text. If False, will use the BertTokenizer
            class instead. BertTokenizer supports some additional options, but is slower and cannot be exported to
            TFLite.
    NlongestT   
vocab_listdo_lower_casecls_token_idsep_token_idpad_token_idpadding
truncation
max_lengthpad_to_multiple_ofreturn_token_type_idsreturn_attention_maskuse_fast_bert_tokenizerc                 D   t                                                       |r t          |ft          j        |d|| _        nt          j                            t          j                            |t          j	        t          j
        t          j        |t          j                  t          j                  t          j                  d          }t          |ft          j        |d|| _        || _        || _        ||                    d          n|| _        ||                    d	          n|| _        ||                    d
          n|| _        t'          |dz
  d          | _        || _        || _        || _        |	| _        |
| _        || _        d S )N)token_out_typelower_case_nfd_strip_accents)out_type)dtype)keys	key_dtypevaluesvalue_dtyper   )num_oov_buckets)r#   
lower_casez[CLS]z[SEP]z[PAD]r   axis)super__init__r   r   int64tf_tokenizerlookupStaticVocabularyTableKeyValueTensorInitializerstringrangesizeBertTokenizerLayerr   r   indexr   r   r   r   paired_trimmerr   r   r   r   r   r    )selfr   r   r   r   r   r   r   r   r   r   r    r!   tokenizer_kwargslookup_table	__class__s                  /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/transformers/models/bert/tokenization_bert_tf.pyr0   zTFBertTokenizer.__init__;   s     	" 	 1!+-8R_! !cs! !D 9::	33# i8BGJ$J$J$JRTRZ[[[ "	 4   !" ;  L !3!-/X-! !Sc! !D %*9E9MJ,,W555S_9E9MJ,,W555S_9E9MJ,,W555S_2:>JJJ$$"4%:"%:"""    	tokenizerPreTrainedTokenizerBasec           	         |                     dd          }||j        n|}|                     dd          }||j        n|}|                     dd          }||j        n|}|                     dd          }||j        n|}|                                }t          |                                d           }d |D             } | d
|||||d	|S )a  
        Initialize a `TFBertTokenizer` from an existing `Tokenizer`.

        Args:
            tokenizer (`PreTrainedTokenizerBase`):
                The tokenizer to use to initialize the `TFBertTokenizer`.

        Examples:

        ```python
        from transformers import AutoTokenizer, TFBertTokenizer

        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
        ```
        r   Nr   r   r   c                     | d         S )Nr    )xs    r@   <lambda>z0TFBertTokenizer.from_tokenizer.<locals>.<lambda>   s
    AaD rA   )keyc                     g | ]
}|d          S )r   rF   ).0entrys     r@   
<listcomp>z2TFBertTokenizer.from_tokenizer.<locals>.<listcomp>   s    2225eAh222rA   r   r   r   r   r   rF   )popr   r   r   r   	get_vocabsorteditems)	clsrB   kwargsr   r   r   r   vocabr   s	            r@   from_tokenizerzTFBertTokenizer.from_tokenizerk   s   $ 

?D993@3H	//mzz.$771=1Ey--<zz.$771=1Ey--<zz.$771=1Ey--<##%%u{{}}..99922E222
s 
!'%%%
 
 
 
 	
rA   pretrained_model_name_or_pathc                     	 t          j        |g|R i |}n#  ddlm}  |j        |g|R i |}Y nxY w | j        |fi |S )a  
        Instantiate a `TFBertTokenizer` from a pre-trained tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The name or path to the pre-trained tokenizer.

        Examples:

        ```python
        from transformers import TFBertTokenizer

        tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        ```
        r   )BertTokenizerFast)r   from_pretrainedtokenization_bert_fastrY   rV   )rS   rW   init_inputsrT   rB   rY   s         r@   rZ   zTFBertTokenizer.from_pretrained   s    "	q%56SlValllekllII	qAAAAAA9)9:WpZepppioppIII!s!)66v666s    3c                     | j         rt          |          }| j                            |          }|                    dd          S )Nr   )r   r   r2   tokenize
merge_dims)r<   textstokenss      r@   unpaired_tokenizez!TFBertTokenizer.unpaired_tokenize   sF     	*"5))E"++E22  B'''rA   c	                    || j         }|dvrt          d          ||t          d          || j        }|| j        }|| j        }|| j        }|| j        }t          |t          j	                  st          j
        |          }|.t          |t          j	                  st          j
        |          }|>|j        j        dk    rt          d          |j        j        dk    rt          d          |j        j        dk    r|d d df         |d d df         }}|                     |          }|4|r|d d d |dz
  f         }t          |f| j        | j        	          \  }	}
nW|                     |          }|r| j                            ||g          \  }}t          ||f| j        | j        	          \  }	}
|d
k    r>|	                    d          }|%|t          j                            | |           z  }n|}t-          |	|| j                  \  }	}d|	i}|r||d<   |rt-          |
|| j                  \  }
}|
|d<   |S )N)r   r   z1Padding must be either 'longest' or 'max_length'!zJmax_length cannot be overridden at call time when truncating paired texts!r   zJtext argument should not be multidimensional when a text pair is supplied!z)text_pair should not be multidimensional!   r   )start_of_sequence_idend_of_segment_idr   r-   )max_seq_length	pad_value	input_idsattention_masktoken_type_ids)r   
ValueErrorr   r   r   r   r    
isinstancer   Tensorconvert_to_tensorshaperankrc   r	   r   r   r;   trimbounding_shapemathfloordivr
   r   )r<   text	text_pairr   r   r   r   r   r    rj   rl   
pad_lengthrk   output_s                  r@   callzTFBertTokenizer.call   s    ?lG333PQQQ!i&;ijjjJJ%!%!8 ($($>! ($($>!$	** 	.'--D Iry)I)I ,Y77I z"" !mnnn#a'' !LMMM:?a"111a4j$qqq!t*)D%%d++ 1AAA/a//0(8d.?SWSd) ) )%I~~ ..y99I N"&"5":":D);L"M"Mi(8y!8I]a]n) ) )%I~ i"11q199J!-/BG4D4Dj[Rd4e4e3ef
#J$4Yzeiev$w$w$w!	>y)  	6'5F#$  	6 0zTEV! ! !NA (6F#$rA   c                 D    | j         | j        | j        | j        | j        dS )NrN   rN   )r<   s    r@   
get_configzTFBertTokenizer.get_config   s.    /!/ - - -
 
 	
rA   )
NNNr   Tr   NTTT)rB   rC   )NNNNNNN)__name__
__module____qualname____doc__listboolr   intstrr0   classmethodrV   r   osPathLikerZ   rc   r|   r~   __classcell__)r?   s   @r@   r   r      s       * *` '+&*&* ,0&*&*(,.; .;.; .; sm	.;
 sm.; sm.; .; .; .; %SM.;  $.;  $.; "&.; .; .; .; .; .;` $
 $
 $
 [$
L 7E#r{BR<S 7 7 7 [70( ( ( ""F F F FP
 
 
 
 
 
 
rA   r   )r   typingr   r   
tensorflowr   r   r   r9   r   r   r   r	   r
   modeling_tf_utilsr   utils.import_utilsr   tokenization_bertlayersLayerr   __all__rF   rA   r@   <module>r      s   				 " " " " " " " "     ? ? ? ? ? ? w w w w w w w w w w w w w w & & & & & & * * * * * * , , , , , , 
,---r
 r
 r
 r
 r
el( r
 r
 .-r
j 
rA   