
    &`i                         d dl mZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ  ed           G d d	e                      ZdS )
    )CallableListOptionalN)Preprocessor)simple_split_tokenizer)	PublicAPIalpha)	stabilityc            	            e Zd ZdZdZ	 	 ddee         deeegee         f                  deee                  f fdZ	de
j        fd	Zd
 Z xZS )	Tokenizera  Replace each string with a list of tokens.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> df = pd.DataFrame({"text": ["Hello, world!", "foo bar\nbaz"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP

        The default ``tokenization_fn`` delimits strings using the space character.

        >>> from ray.data.preprocessors import Tokenizer
        >>> tokenizer = Tokenizer(columns=["text"])
        >>> tokenizer.transform(ds).to_pandas()  # doctest: +SKIP
                       text
        0  [Hello,, world!]
        1   [foo, bar\nbaz]

        If the default logic isn't adequate for your use case, you can specify a
        custom ``tokenization_fn``.

        >>> import string
        >>> def tokenization_fn(s):
        ...     for character in string.punctuation:
        ...         s = s.replace(character, "")
        ...     return s.split()
        >>> tokenizer = Tokenizer(columns=["text"], tokenization_fn=tokenization_fn)
        >>> tokenizer.transform(ds).to_pandas()  # doctest: +SKIP
                      text
        0   [Hello, world]
        1  [foo, bar, baz]

        :class:`Tokenizer` can also be used in append mode by providing the
        name of the output_columns that should hold the tokenized values.

        >>> tokenizer = Tokenizer(columns=["text"], output_columns=["text_tokenized"])
        >>> tokenizer.transform(ds).to_pandas()  # doctest: +SKIP
                    text    text_tokenized
        0  Hello, world!  [Hello,, world!]
        1   foo bar\nbaz   [foo, bar\nbaz]

    Args:
        columns: The columns to tokenize.
        tokenization_fn: The function used to generate tokens. This function
            should accept a string as input and return a list of tokens as
            output. If unspecified, the tokenizer uses a function equivalent to
            ``lambda s: s.split(" ")``.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.
    FNcolumnstokenization_fnoutput_columnsc                     t                                                       || _        |pt          | _        t          j        ||          | _        d S N)super__init__r   r   r   r   #_derive_and_validate_output_columnsr   )selfr   r   r   	__class__s       t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/data/preprocessors/tokenizer.pyr   zTokenizer.__init__B   sP     	.H2H*N^
 
    dfc                      dt           j        f fd}|j        d d  j        f                             |          | j        <   |S )Nsc                 8    |                      j                  S r   )mapr   )r   r   s    r   column_tokenizerz5Tokenizer._transform_pandas.<locals>.column_tokenizerQ   s    55-...r   )pdSerieslocr   	transformr   )r   r   r   s   `  r   _transform_pandaszTokenizer._transform_pandasP   s^    	/	 	/ 	/ 	/ 	/ 	/ 	/ #%&DL"9"C"CDT"U"U4	r   c                 |    t          | j        d| j                  }| j        j         d| j        d| d| j        dS )N__name__z	(columns=z, tokenization_fn=z, output_columns=))getattrr   r   r%   r   r   )r   names     r   __repr__zTokenizer.__repr__W   si    t+Z9MNN~& O O O O#O O6:6IO O O	
r   )NN)r%   
__module____qualname____doc___is_fittabler   strr   r   r   r   	DataFramer#   r)   __classcell__)r   s   @r   r   r   
   s        2 2h L
 AE.2	
 
c
 "(C5$s)+;"<=
 !c+	
 
 
 
 
 
BL    
 
 
 
 
 
 
r   r   )typingr   r   r   pandasr   ray.data.preprocessorr   ray.data.preprocessors.utilsr   ray.util.annotationsr   r    r   r   <module>r7      s    + + + + + + + + + +     . . . . . . ? ? ? ? ? ? * * * * * * WQ
 Q
 Q
 Q
 Q
 Q
 Q
 Q
 Q
 Q
r   