
    &`i                         d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lm Z  erd dl!m"Z"  e j#        e$          Z% e d           edd           G d de                                  Z& e d           edd           G d de                                  Z' e d           edd           G d de                                  Z( e d           edd           G d de                                  Z) e d           edd           G d de                                  Z*ddd d!d"d#ee+         d$ed%e,d&ee	e+e-f                  f
d'Z.d0d)e,d*eege	e+e-f         f         fd+Z/d,ej0        d#e+d*dfd-Z1d.ej2        d*e,fd/Z3dS )1    N)Counter)partial)TYPE_CHECKINGAnyCallableDictHashableListOptionalSet)BatchFormatis_null)PreprocessorPreprocessorNotFittedExceptionSerializablePreprocessorBase)make_post_processor)SerializablePreprocessor)	PublicAPI)Datasetalpha)	stability   z$io.ray.preprocessors.ordinal_encoder)version
identifierc            	            e Zd ZdZddddee         dedeee                  f fdZd	d
de	fdZ
dedefdZdej        fdZdeeef         fdZdeeef         defdZd Z xZS )OrdinalEncodera  Encode values within columns as ordered integer values.

    :class:`OrdinalEncoder` encodes categorical features as integers that range from
    :math:`0` to :math:`n - 1`, where :math:`n` is the number of categories.

    If you transform a value that isn't in the fitted datset, then the value is encoded
    as ``float("nan")``.

    Columns must contain either hashable values or lists of hashable values. Also, you
    can't have both scalars and lists in the same column.

    Examples:
        Use :class:`OrdinalEncoder` to encode categorical features as integers.

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import OrdinalEncoder
        >>> df = pd.DataFrame({
        ...     "sex": ["male", "female", "male", "female"],
        ...     "level": ["L4", "L5", "L3", "L4"],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OrdinalEncoder(columns=["sex", "level"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sex  level
        0    1      1
        1    0      2
        2    1      0
        3    0      1

        :class:`OrdinalEncoder` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = OrdinalEncoder(columns=["sex", "level"], output_columns=["sex_encoded", "level_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
              sex level  sex_encoded  level_encoded
        0    male    L4            1              1
        1  female    L5            0              2
        2    male    L3            1              0
        3  female    L4            0              1


        If you transform a value not present in the original dataset, then the value
        is encoded as ``float("nan")``.

        >>> df = pd.DataFrame({"sex": ["female"], "level": ["L6"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(ds).to_pandas()  # doctest: +SKIP
           sex  level
        0    0    NaN

        :class:`OrdinalEncoder` can also encode categories in a list.

        >>> df = pd.DataFrame({
        ...     "name": ["Shaolin Soccer", "Moana", "The Smartest Guys in the Room"],
        ...     "genre": [
        ...         ["comedy", "action", "sports"],
        ...         ["animation", "comedy",  "action"],
        ...         ["documentary"],
        ...     ],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OrdinalEncoder(columns=["genre"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name      genre
        0                 Shaolin Soccer  [2, 0, 4]
        1                          Moana  [1, 2, 0]
        2  The Smartest Guys in the Room        [3]

    Args:
        columns: The columns to separately encode.
        encode_lists: If ``True``, encode list elements.  If ``False``, encode
            whole lists (i.e., replace each list with an integer). ``True``
            by default.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`OneHotEncoder`
            Another preprocessor that encodes categorical data.
    TN)encode_listsoutput_columnscolumnsr   r   c                    t                                                       || _        || _        t	          j        ||          | _        d S N)super__init__r    r   r   #_derive_and_validate_output_columnsr   )selfr    r   r   	__class__s       r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/data/preprocessors/encoder.pyr$   zOrdinalEncoder.__init__t   sJ     	(*N^
 
    datasetr   returnc                 v      j                              fdt                      d d  j                    S )Nc                 >    t          j        j        |           S )N)r*   r    r   key_gen)compute_unique_value_indicesr    r   r.   r*   r&   s    r(   <lambda>z%OrdinalEncoder._fit.<locals>.<lambda>   s(    $@!.	% % % r)   c                     d|  dS Nzunique() cols    r(   r1   z%OrdinalEncoder._fit.<locals>.<lambda>       $4c$4$4$4 r)   c                     d|  dS Nunique_values(r4   r5   r6   s    r(   r1   z%OrdinalEncoder._fit.<locals>.<lambda>       $;S$;$;$; r)   stat_fnpost_process_fnstat_key_fnpost_key_fnr    stat_computation_planadd_callable_statunique_post_fnr    r&   r*   s   ``r(   _fitzOrdinalEncoder._fit   s`    "44     +,,44;;L 	5 	
 	
 	
 r)   elementcolumn_namec                    | j         d| d         | j        rfd|D             S                     t          |                    S )Nr;   r4   c                 :    g | ]}                     |          S r5   get).0xordinal_maps     r(   
<listcomp>z7OrdinalEncoder._encode_list_element.<locals>.<listcomp>   s%    8881KOOA&&888r)   )stats_r   rM   tuple)r&   rH   rI   rP   s      @r(   _encode_list_elementz#OrdinalEncoder._encode_list_element   sZ    k"A;"A"A"AB  	988888888uW~~...r)   dfc                      t          |g j        R   dt          j        f fd}| j                                     |          | j        <   |S )Nsc                      t                     r                      fd          S j        d j         d         }                     |          S )Nc                 <                         | j                  S )N)rI   )rT   name)elemrW   r&   s    r(   r1   zROrdinalEncoder._transform_pandas.<locals>.column_ordinal_encoder.<locals>.<lambda>   s    !:!:4QV!:!T!T r)   r;   r4   )_is_series_composed_of_listsmaprR   rZ   rW   s_valuesr&   s   ` r(   column_ordinal_encoderz@OrdinalEncoder._transform_pandas.<locals>.column_ordinal_encoder   se    +A.. uuTTTTT   {#=AF#=#=#=>H55??"r)   )_validate_dfr    pdSeriesapplyr   )r&   rU   r`   s   `  r(   _transform_pandasz OrdinalEncoder._transform_pandas   sl    R'$,''''	#bi 	# 	# 	# 	# 	# 	# #%T\"2"8"89O"P"P4	r)   c                 L    | j         | j        | j        t          | dd           dS )N_fitted)r    r   r   rg   )r    r   r   getattrr&   s    r(   _get_serializable_fieldsz'OrdinalEncoder._get_serializable_fields   s1    |"1 -tY55	
 
 	
r)   fieldsr   c                     |d         | _         |d         | _        |d         | _        |                    d          | _        d S )Nr    r   r   rg   )r    r   r   rM   rg   r&   rk   r   s      r(   _set_serializable_fieldsz'OrdinalEncoder._set_serializable_fields   s@    i($%56">2zz),,r)   c                 P    | j         j         d| j        d| j        d| j        dS )N	(columns=z, encode_lists=, output_columns=r4   )r'   __name__r    r   r   ri   s    r(   __repr__zOrdinalEncoder.__repr__   sL    ~& 7 7 7 7 -7 7"17 7 7	
r)   )rr   
__module____qualname____doc__r
   strboolr   r$   r   rG   listrT   rb   	DataFramere   r   r   rj   intrn   rs   __classcell__r'   s   @r(   r   r      sB       S Sr ".2
 
 
c
 	

 !c+
 
 
 
 
 
I ,    /D /# / / / /BL    
$sCx. 
 
 
 
-tCH~ - - - - -
 
 
 
 
 
 
r)   r   z$io.ray.preprocessors.one_hot_encoderc            	            e Zd ZdZddddee         deeeef                  deee                  f fdZ	dd	d
e
fdZdedeeef         fdZdej        fdZd
eeef         fdZdeeef         defdZd Z xZS )OneHotEncodera-  `One-hot encode <https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics>`_
    categorical data.

    This preprocessor transforms each specified column into a one-hot encoded vector.
    Each element in the vector corresponds to a unique category in the column, with a
    value of 1 if the category matches and 0 otherwise.

    If a category is infrequent (based on ``max_categories``) or not present in the
    fitted dataset, it is encoded as all 0s.

    Columns must contain hashable objects or lists of hashable objects.

    .. note::
        Lists are treated as categories. If you want to encode individual list
        elements, use :class:`MultiHotEncoder`.

    Example:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import OneHotEncoder
        >>>
        >>> df = pd.DataFrame({"color": ["red", "green", "red", "red", "blue", "green"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OneHotEncoder(columns=["color"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
               color
        0  [0, 0, 1]
        1  [0, 1, 0]
        2  [0, 0, 1]
        3  [0, 0, 1]
        4  [1, 0, 0]
        5  [0, 1, 0]

        OneHotEncoder can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = OneHotEncoder(columns=["color"], output_columns=["color_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           color color_encoded
        0    red     [0, 0, 1]
        1  green     [0, 1, 0]
        2    red     [0, 0, 1]
        3    red     [0, 0, 1]
        4   blue     [1, 0, 0]
        5  green     [0, 1, 0]

        If you one-hot encode a value that isn't in the fitted dataset, then the
        value is encoded with zeros.

        >>> df = pd.DataFrame({"color": ["yellow"]})
        >>> batch = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(batch).to_pandas()  # doctest: +SKIP
            color color_encoded
        0  yellow     [0, 0, 0]

        Likewise, if you one-hot encode an infrequent value, then the value is encoded
        with zeros.

        >>> encoder = OneHotEncoder(columns=["color"], max_categories={"color": 2})
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
            color
        0  [1, 0]
        1  [0, 1]
        2  [1, 0]
        3  [1, 0]
        4  [0, 0]
        5  [0, 1]

    Args:
        columns: The columns to separately encode.
        max_categories: The maximum number of features to create for each column.
            If a value isn't specified for a column, then a feature is created
            for every category in that column.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`MultiHotEncoder`
            If you want to encode individual list elements, use
            :class:`MultiHotEncoder`.

        :class:`OrdinalEncoder`
            If your categories are ordered, you may want to use
            :class:`OrdinalEncoder`.
    Nmax_categoriesr   r    r   r   c                    t                                                       || _        |pi | _        t	          j        ||          | _        d S r"   r#   r$   r    r   r   r%   r   r&   r    r   r   r'   s       r(   r$   zOneHotEncoder.__init__  O     	,2*N^
 
r)   r*   r   r+   c                 v      j                              fdt                      d d  j                    S )Nc                 @    t          j        d| j                  S )NFr*   r    r   r.   r   r/   r    r   r0   s    r(   r1   z$OneHotEncoder._fit.<locals>.<lambda>/  s+    $@"#2% % % r)   c                     d|  dS r3   r5   r6   s    r(   r1   z$OneHotEncoder._fit.<locals>.<lambda>7  r8   r)   c                     d|  dS r:   r5   r6   s    r(   r1   z$OneHotEncoder._fit.<locals>.<lambda>8  r<   r)   r=   rB   rF   s   ``r(   rG   zOneHotEncoder._fit-  `    "44     +,,44;;L 	5 	
 	
 	
 r)   vstatsc                     t          |t          t          j        f          rt	          |          }t          |t
                    r|                    |d          S dS )N)
isinstancery   npndarrayrS   r	   rM   )r&   r   r   s      r(   safe_getzOneHotEncoder.safe_get=  sQ    a$
+,, 	aAa"" 	99Q###2r)   rU   c                    	 t          |g j        R   t           j         j                  D ]\  }} j        d| d         	t          	          }t          j        t          |          |ft          j                  }||         	                     	fd          
                                }|dk    }t          j        |          d         }d||||         f<   |                                ||<   |S )Nr;   r4   )dtypec                 0                         |           S r"   )r   )r   r&   r   s    r(   r1   z1OneHotEncoder._transform_pandas.<locals>.<lambda>N  s    t}}Q/F/F r)   r   r   r   )ra   r    zipr   rR   lenr   zerosuint8rd   to_numpynonzerotolist)
r&   rU   columnoutput_columnnum_categoriesone_hotcodesvalid_category_masknon_zero_indicesr   s
   `        @r(   re   zOneHotEncoder._transform_pandasE  s   R'$,'''' &)t7J%K%K 	1 	1!FMK : : : :;E ZZNhB8IIIGvJ$$%F%F%F%F%FGGPPRRE"'2+!z*=>>qA
   )*, !( 0 0B}	r)   c                 L    | j         | j        | j        t          | dd           dS Nrg   )r    r   r   rg   r    r   r   rh   ri   s    r(   rj   z&OneHotEncoder._get_serializable_fields\  1    |"1"1tY55	
 
 	
r)   rk   r   c                     |d         | _         |d         | _        |d         | _        |                    d          | _        d S Nr    r   r   rg   r    r   r   rM   rg   rm   s      r(   rn   z&OneHotEncoder._set_serializable_fieldsd  A    i($%56$%56zz),,r)   c                 P    | j         j         d| j        d| j        d| j        dS Nrp   z, max_categories=rq   r4   r'   rr   r    r   r   ri   s    r(   rs   zOneHotEncoder.__repr__l  sL    ~& 7 7 7 7"17 7"17 7 7	
r)   )rr   rt   ru   rv   r
   rw   r   r   r{   r$   r   rG   r   r   rb   rz   re   rj   rn   rs   r|   r}   s   @r(   r   r      sW       W Wz 48.2
 
 
c
 !c3h0	

 !c+
 
 
 
 
 
I ,     # d38n    BL    .
$sCx. 
 
 
 
-tCH~ - - - - -
 
 
 
 
 
 
r)   r   z&io.ray.preprocessors.multi_hot_encoderc            	            e Zd ZdZddddee         deeeef                  deee                  f fdZ	dd	d
e
fdZdej        fdZd
eeef         fdZdeeef         defdZd Z xZS )MultiHotEncodera  Multi-hot encode categorical data.

    This preprocessor replaces each list of categories with an :math:`m`-length binary
    list, where :math:`m` is the number of unique categories in the column or the value
    specified in ``max_categories``. The :math:`i\\text{-th}` element of the binary list
    is :math:`1` if category :math:`i` is in the input list and :math:`0` otherwise.

    Columns must contain hashable objects or lists of hashable objects.
    Also, you can't have both types in the same column.

    .. note::
        The logic is similar to scikit-learn's [MultiLabelBinarizer][1]

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import MultiHotEncoder
        >>>
        >>> df = pd.DataFrame({
        ...     "name": ["Shaolin Soccer", "Moana", "The Smartest Guys in the Room"],
        ...     "genre": [
        ...         ["comedy", "action", "sports"],
        ...         ["animation", "comedy",  "action"],
        ...         ["documentary"],
        ...     ],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> encoder = MultiHotEncoder(columns=["genre"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name            genre
        0                 Shaolin Soccer  [1, 0, 1, 0, 1]
        1                          Moana  [1, 1, 1, 0, 0]
        2  The Smartest Guys in the Room  [0, 0, 0, 1, 0]

        :class:`MultiHotEncoder` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = MultiHotEncoder(columns=["genre"], output_columns=["genre_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name                        genre    genre_encoded
        0                 Shaolin Soccer     [comedy, action, sports]  [1, 0, 1, 0, 1]
        1                          Moana  [animation, comedy, action]  [1, 1, 1, 0, 0]
        2  The Smartest Guys in the Room                [documentary]  [0, 0, 0, 1, 0]

        If you specify ``max_categories``, then :class:`MultiHotEncoder`
        creates features for only the most frequent categories.

        >>> encoder = MultiHotEncoder(columns=["genre"], max_categories={"genre": 3})
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name      genre
        0                 Shaolin Soccer  [1, 1, 1]
        1                          Moana  [1, 1, 0]
        2  The Smartest Guys in the Room  [0, 0, 0]
        >>> encoder.stats_  # doctest: +SKIP
        OrderedDict([('unique_values(genre)', {'comedy': 0, 'action': 1, 'sports': 2})])

    Args:
        columns: The columns to separately encode.
        max_categories: The maximum number of features to create for each column.
            If a value isn't specified for a column, then a feature is created
            for every unique category in that column.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`OneHotEncoder`
            If you're encoding individual categories instead of lists of
            categories, use :class:`OneHotEncoder`.

        :class:`OrdinalEncoder`
            If your categories are ordered, you may want to use
            :class:`OrdinalEncoder`.

    [1]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
    Nr   r    r   r   c                    t                                                       || _        |pi | _        t	          j        ||          | _        d S r"   r   r   s       r(   r$   zMultiHotEncoder.__init__  r   r)   r*   r   r+   c                 v      j                              fdt                      d d  j                    S )Nc                 @    t          j        d| j                  S )NTr   r   r0   s    r(   r1   z&MultiHotEncoder._fit.<locals>.<lambda>  s+    $@!#2% % % r)   c                     d|  dS r3   r5   r6   s    r(   r1   z&MultiHotEncoder._fit.<locals>.<lambda>  r8   r)   c                     d|  dS r:   r5   r6   s    r(   r1   z&MultiHotEncoder._fit.<locals>.<lambda>  r<   r)   r=   rB   rF   s   ``r(   rG   zMultiHotEncoder._fit  r   r)   rU   c                      t          |g j        R   dt          dt          f fd}t	           j         j                  D ]2\  }}||                             t          ||                    ||<   3|S )NrH   rZ   c                    t          | t          j                  r|                                 } nt          | t                    s| g} j        d| d         }t          |           fd|D             S )Nr;   r4   c                 <    g | ]}                     |d           S )r   rL   )rN   rO   counters     r(   rQ   zJMultiHotEncoder._transform_pandas.<locals>.encode_list.<locals>.<listcomp>  s'    555!GKK1%%555r)   )r   r   r   r   ry   rR   r   )rH   rZ   r   r   r&   s      @r(   encode_listz6MultiHotEncoder._transform_pandas.<locals>.encode_list  s    '2:.. $!..**.. $")K 8 8 8 89Eg&&G5555u5555r)   )rZ   )ra   r    ry   rw   r   r   r]   r   )r&   rU   r   r   r   s   `    r(   re   z!MultiHotEncoder._transform_pandas  s    R'$,''''	6 	6 	6 	6 	6 	6 	6 	6 &)t7J%K%K 	R 	R!FM "6
w{/P/P/P Q QB}	r)   c                 L    | j         | j        | j        t          | dd           dS r   r   ri   s    r(   rj   z(MultiHotEncoder._get_serializable_fields  r   r)   rk   r   c                     |d         | _         |d         | _        |d         | _        |                    d          | _        d S r   r   rm   s      r(   rn   z(MultiHotEncoder._set_serializable_fields  r   r)   c                 P    | j         j         d| j        d| j        d| j         dS r   r   ri   s    r(   rs   zMultiHotEncoder.__repr__	  sL    ~& 5 5 5 5"15 5"15 5 5	
r)   )rr   rt   ru   rv   r
   rw   r   r   r{   r$   r   rG   rb   rz   re   r   rj   rn   rs   r|   r}   s   @r(   r   r   t  s-       
N Nh 48.2
 
 
c
 !c3h0	

 !c+
 
 
 
 
 
I ,     BL    "
$sCx. 
 
 
 
-tCH~ - - - - -
 
 
 
 
 
 
r)   r   z"io.ray.preprocessors.label_encoderc                        e Zd ZdZdddedee         f fdZddd	efd
Zde	j
        fdZddZde	j
        fdZd	ee         fdZd	ee         fdZd	eeef         fdZdeeef         defdZd Z xZS )LabelEncodera
  Encode labels as integer targets.

    :class:`LabelEncoder` encodes labels as integer targets that range from
    :math:`0` to :math:`n - 1`, where :math:`n` is the number of unique labels.

    If you transform a label that isn't in the fitted datset, then the label is encoded
    as ``float("nan")``.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> df = pd.DataFrame({
        ...     "sepal_width": [5.1, 7, 4.9, 6.2],
        ...     "sepal_height": [3.5, 3.2, 3, 3.4],
        ...     "species": ["setosa", "versicolor", "setosa", "virginica"]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> from ray.data.preprocessors import LabelEncoder
        >>> encoder = LabelEncoder(label_column="species")
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height  species
        0          5.1           3.5        0
        1          7.0           3.2        1
        2          4.9           3.0        0
        3          6.2           3.4        2

        You can also provide the name of the output column that should hold the encoded
        labels if you want to use :class:`LabelEncoder` in append mode.

        >>> encoder = LabelEncoder(label_column="species", output_column="species_encoded")
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height     species  species_encoded
        0          5.1           3.5      setosa                0
        1          7.0           3.2  versicolor                1
        2          4.9           3.0      setosa                0
        3          6.2           3.4   virginica                2

        If you transform a label not present in the original dataset, then the new
        label is encoded as ``float("nan")``.

        >>> df = pd.DataFrame({
        ...     "sepal_width": [4.2],
        ...     "sepal_height": [2.7],
        ...     "species": ["bracteata"]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height  species
        0          4.2           2.7      NaN

    Args:
        label_column: A column containing labels that you want to encode.
        output_column: The name of the column that will contain the encoded
            labels. If None, the output column will have the same name as the
            input column.

    .. seealso::

        :class:`OrdinalEncoder`
            If you're encoding ordered features, use :class:`OrdinalEncoder` instead of
            :class:`LabelEncoder`.
    Nr   label_columnr   c                h    t                                                       || _        |p|| _        d S r"   )r#   r$   r   r   )r&   r   r   r'   s      r(   r$   zLabelEncoder.__init__T  s4    (*:lr)   r*   r   r+   c                 x      j                              fdt                      d d  j        g            S )Nc                 4    t          j        g|           S N)r*   r    r.   )r/   r   r0   s    r(   r1   z#LabelEncoder._fit.<locals>.<lambda>[  s&    $@*+% % % r)   c                     d|  dS r3   r5   r6   s    r(   r1   z#LabelEncoder._fit.<locals>.<lambda>a  r8   r)   c                     d|  dS r:   r5   r6   s    r(   r1   z#LabelEncoder._fit.<locals>.<lambda>b  r<   r)   r=   )rC   rD   rE   r   rF   s   ``r(   rG   zLabelEncoder._fitY  sd    "44    
 +,,44;;&' 	5 
	
 
	
 
	
 r)   rU   c                      t          | j                   dt          j        f fd}| j                                     |          | j        <   |S )NrW   c                 Z    j         d| j         d         }|                     |          S r:   )rR   rZ   r]   r^   s     r(   column_label_encoderz<LabelEncoder._transform_pandas.<locals>.column_label_encoderj  s-    {#=AF#=#=#=>H55??"r)   )ra   r   rb   rc   	transformr   )r&   rU   r   s   `  r(   re   zLabelEncoder._transform_pandasg  sh    R*+++	#BI 	# 	# 	# 	# 	# 	# "$D$5!6!@!@AU!V!V4	r)   dsc                     |                                  }|t          j        j        t          j        j        fv rt          d          |                                 } |j        | j        fdt          j
        i|S )a/  Inverse transform the given dataset.

        Args:
            ds: Input Dataset that has been fitted and/or transformed.

        Returns:
            ray.data.Dataset: The inverse transformed Dataset.

        Raises:
            PreprocessorNotFittedException: if ``fit`` is not called yet.
        z1`fit` must be called before `inverse_transform`, batch_format)
fit_statusr   	FitStatusPARTIALLY_FITTED
NOT_FITTEDr   _get_transform_configmap_batches_inverse_transform_pandasr   PANDAS)r&   r   r   kwargss       r(   inverse_transformzLabelEncoder.inverse_transformq  s     __&&
"3"-
 
 
 1C   ++--r~*
 
9D9K
OU
 
 	
r)   c                 |     dt           j        f fd}| j                                     |          | j        <   |S )NrW   c                     d j         dj         d                                         D             }|                     |          S )Nc                     i | ]\  }}||	S r5   r5   )rN   keyvalues      r(   
<dictcomp>zXLabelEncoder._inverse_transform_pandas.<locals>.column_label_decoder.<locals>.<dictcomp>  s.       C s  r)   r;   r4   )rR   r   itemsr]   )rW   inverse_valuesr&   s     r(   column_label_decoderzDLabelEncoder._inverse_transform_pandas.<locals>.column_label_decoder  sV     "&+9T%6999#%''	  N 55(((r)   )rb   rc   r   r   r   )r&   rU   r   s   `  r(   r   z&LabelEncoder._inverse_transform_pandas  sU    	)BI 	) 	) 	) 	) 	) 	) !#4#5 6 @ @AU V V4	r)   c                     | j         gS r"   )r   ri   s    r(   get_input_columnszLabelEncoder.get_input_columns  s    !""r)   c                     | j         gS r"   r   ri   s    r(   get_output_columnszLabelEncoder.get_output_columns  s    "##r)   c                 @    | j         | j        t          | dd           dS )Nrg   )r   r   rg   )r   r   rh   ri   s    r(   rj   z%LabelEncoder._get_serializable_fields  s,     -!/tY55
 
 	
r)   rk   r   c                 n    |d         | _         |d         | _        |                    d          | _        d S )Nr   r   rg   )r   r   rM   rg   rm   s      r(   rn   z%LabelEncoder._set_serializable_fields  s3    ">2#O4zz),,r)   c                 @    | j         j         d| j        d| j        dS )Nz(label_column=z, output_column=r4   )r'   rr   r   r   ri   s    r(   rs   zLabelEncoder.__repr__  s-    .)uu9Juu^b^puuuur)   )r   r   r+   r   )rr   rt   ru   rv   rw   r   r$   r   rG   rb   rz   re   r   r   r
   r   r   r   r   rj   r{   rn   rs   r|   r}   s   @r(   r   r     s|       > >@ MQ ; ; ;S ;HSM ; ; ; ; ; ;
I ,    BL    
 
 
 
:BL    #49 # # # #$DI $ $ $ $
$sCx. 
 
 
 
-tCH~ - - - - -v v v v v v vr)   r   z io.ray.preprocessors.categorizerc            	            e Zd ZdZ	 	 ddee         deeeej	        f                  deee                  f fdZ
ddd	efd
Zdej        fdZd	eeef         fdZdeeef         defdZd Z xZS )Categorizera^
  Convert columns to ``pd.CategoricalDtype``.

    Use this preprocessor with frameworks that have built-in support for
    ``pd.CategoricalDtype`` like LightGBM.

    .. warning::

        If you don't specify ``dtypes``, fit this preprocessor before splitting
        your dataset into train and test splits. This ensures categories are
        consistent across splits.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import Categorizer
        >>>
        >>> df = pd.DataFrame(
        ... {
        ...     "sex": ["male", "female", "male", "female"],
        ...     "level": ["L4", "L5", "L3", "L4"],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> categorizer = Categorizer(columns=["sex", "level"])
        >>> categorizer.fit_transform(ds).schema().types  # doctest: +SKIP
        [CategoricalDtype(categories=['female', 'male'], ordered=False), CategoricalDtype(categories=['L3', 'L4', 'L5'], ordered=False)]

        :class:`Categorizer` can also be used in append mode by providing the
        name of the output_columns that should hold the categorized values.

        >>> categorizer = Categorizer(columns=["sex", "level"], output_columns=["sex_cat", "level_cat"])
        >>> categorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
              sex level sex_cat level_cat
        0    male    L4    male        L4
        1  female    L5  female        L5
        2    male    L3    male        L3
        3  female    L4  female        L4

        If you know the categories in advance, you can specify the categories with the
        ``dtypes`` parameter.

        >>> categorizer = Categorizer(
        ...     columns=["sex", "level"],
        ...     dtypes={"level": pd.CategoricalDtype(["L3", "L4", "L5", "L6"], ordered=True)},
        ... )
        >>> categorizer.fit_transform(ds).schema().types  # doctest: +SKIP
        [CategoricalDtype(categories=['female', 'male'], ordered=False), CategoricalDtype(categories=['L3', 'L4', 'L5', 'L6'], ordered=True)]

    Args:
        columns: The columns to convert to ``pd.CategoricalDtype``.
        dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype``
            objects. If you don't include a column in ``dtypes``, the categories
            are inferred.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    Nr    dtypesr   c                     t                                                       |si }|| _        || _        t	          j        ||          | _        d S r"   )r#   r$   r    r   r   r%   r   )r&   r    r   r   r'   s       r(   r$   zCategorizer.__init__  sV     	 	F*N^
 
r)   r*   r   r+   c                 <     fd j         D              xj         j        z  c_        s S dt          t          t          f         dt
          j        fd} j                            fdt          t          d          |g          d	 d
             S )Nc                 &    g | ]}|j         v|S r5   )r   )rN   r   r&   s     r(   rQ   z$Categorizer._fit.<locals>.<listcomp>  s,     
 
 
t{1J1JF1J1J1Jr)   unique_indicesr+   c                 N    t          j        |                                           S r"   )rb   CategoricalDtypekeys)r   s    r(   callbackz"Categorizer._fit.<locals>.callback	  s    &~':':'<'<===r)   c                 (    t          |           S r   )r/   )r.   columns_to_getr*   s    r(   r1   z"Categorizer._fit.<locals>.<lambda>  s     $@&% % % r)   T)drop_na_values)base_fn	callbacksc                     d|  dS r3   r5   r6   s    r(   r1   z"Categorizer._fit.<locals>.<lambda>  r8   r)   c                     | S r"   r5   r6   s    r(   r1   z"Categorizer._fit.<locals>.<lambda>  s    C r)   r=   )r    rR   r   r   rw   rb   r   rC   rD   r   rE   )r&   r*   r   r   s   `` @r(   rG   zCategorizer._fit  s    
 
 
 
!%
 
 
 	t{" 	K	>T#t)_ 	>9L 	> 	> 	> 	> 	"44    
 0&d;;;#*   54'" 	5 	
 	
 	
 r)   rU   c                 `    || j                                      | j                  || j        <   |S r"   )r    astyperR   r   )r&   rU   s     r(   re   zCategorizer._transform_pandas  s+    "$T\"2"9"9$+"F"F4	r)   c                     | j         | j        t          | dd           t          | d          r*| j        r#d | j                                        D             nd dS )Nrg   r   c                 N    i | ]"\  }}|t          |j                  |j        d #S )
categoriesordered)ry   r  r  )rN   r7   r   s      r(   r   z8Categorizer._get_serializable_fields.<locals>.<dictcomp>%  sC       C D)9$:$:u}UU  r)   )r    r   rg   r   )r    r   rh   hasattrr   r   ri   s    r(   rj   z$Categorizer._get_serializable_fields   s    |"1tY55
 tX&&	 ,0;	  "&+"3"3"5"5   
 

 

 
	
r)   rk   r   c                     |                     d          r$d |d                                         D             ni | _        |d         | _        |d         | _        |                     d          | _        d S )Nr   c                 Z    i | ](\  }}|t          j        |d          |d                   )S )r  r  r  )rb   r   )rN   r7   
dtype_datas      r(   r   z8Categorizer._set_serializable_fields.<locals>.<dictcomp>1  sQ        $C R(),7IAV    r)   r    r   rg   )rM   r   r   r    r   rg   rm   s      r(   rn   z$Categorizer._set_serializable_fields-  s     zz(##   (.h'7'='='?'?	     	 i($%56zz),,r)   c                 P    | j         j         d| j        d| j        d| j        dS )Nrp   z	, dtypes=rq   r4   )r'   rr   r    r   r   ri   s    r(   rs   zCategorizer.__repr__@  sT    ~& O O O OkO O6:6IO O O	
r)   )NN)rr   rt   ru   rv   r
   rw   r   r   rb   r   r$   r   rG   rz   re   r   rj   r{   rn   rs   r|   r}   s   @r(   r   r     s,       9 9| <@.2	
 
c
 c2#6678
 !c+	
 
 
 
 
 
 I ,    6BL    
$sCx. 
 
 
 
-tCH~ - - - - -&
 
 
 
 
 
 
r)   r   T)r   r   r*   r   r    r.   r   r   c           	         |i }t                    }|D ]}||vrt          d| d d          dt          j        dt          ffddt          j        dt          t          t          t                   f         ffd}|                     |d	
          }fdD             }	|	                    d           D ]}
|

                                D ]\  }}|D ]}d |
                                D             }||v r5t          t          |                              ||                             }|	 |                                       |                                           |	S )NzYou set `max_categories` for z, which is not present in .r7   r+   c                    t          |           rBr*t                      fd}|                     |           S |                     d           } t          |                     d                                                    S )Nc                 2                         |            | S r"   )update)rH   r   s    r(   update_counterz\compute_unique_value_indices.<locals>.get_pd_value_counts_per_column.<locals>.update_counter`  s    NN7+++"Nr)   c                      t          |           S r"   )rS   )rO   s    r(   r1   zVcompute_unique_value_indices.<locals>.get_pd_value_counts_per_column.<locals>.<lambda>h  s    a r)   F)dropna)r\   r   r]   value_countsto_dict)r7   r  r   r   s     @r(   get_pd_value_counts_per_columnzDcompute_unique_value_indices.<locals>.get_pd_value_counts_per_columnY  s     (,, 	2 2!))# # # # # ''' gg0011s''u'55==??@@@r)   rU   c                     | j                                         }i }D ]0}||v r | |                   g||<   t          d| d|           |S )NzColumn 'z2' does not exist in DataFrame, which has columns: )r    r   
ValueError)rU   
df_columnsresultr7   r    r  s       r(   get_pd_value_countsz9compute_unique_value_indices.<locals>.get_pd_value_countsk  s    Z&&((
 	 	Cj  ==bgFFGs bsbbV`bb   r)   pandas)r   c                 @    i | ]} |          t                      S r5   )set)rN   r7   r.   s     r(   r   z0compute_unique_value_indices.<locals>.<dictcomp>y  s'    +S+S+SCGGCLL#%%+S+S+Sr)   )
batch_sizec                     i | ]
\  }}|||S r"   r5   )rN   kr   s      r(   r   z0compute_unique_value_indices.<locals>.<dictcomp>}  s&     + + +!QAqr)   )r  r  rb   rc   r   rz   rw   r
   r   iter_batchesr   dictr   most_commonr  r   )r*   r    r.   r   r   columns_setr   r  value_counts_dsunique_values_by_colbatchr7   countersr   r  s    ```          @r(   r/   r/   G  s    g,,K   $$       %ABI A$ A A A A A A$ c4:o1F        ))*=H)UUO+S+S+S+S7+S+S+S --->> J J"[[]] 
	J 
	JMC# 	J 	J+ +%,]]__+ + + .((.2((44^C5HII/ /G %WWS\\299',,..IIII	J
	J  r)   Fr   r+   c                 T     dt           dt          t          t          f         f fd}|S )a  
    Returns a post-processing function that generates an encoding map by
    sorting the unique values produced during aggregation or stats computation.

    Args:
        drop_na_values: If True, NA/null values will be silently dropped from the
            encoding map. If False, raises an error if any NA/null values are present.

    Returns:
        A callable that takes a set of unique values and returns a dictionary
        mapping each value to a unique integer index.
    valuesr+   c                     t          d | D                       rst          d          d | D             }d t          t          |                    D             S )Nc              3   4   K   | ]}t          |          V  d S r"   r   rN   r   s     r(   	<genexpr>z:unique_post_fn.<locals>.gen_value_index.<locals>.<genexpr>  s(      **awqzz******r)   z]Unable to fit column because it contains null values. Consider imputing missing values first.c                 0    g | ]}t          |          |S r5   r   r+  s     r(   rQ   z;unique_post_fn.<locals>.gen_value_index.<locals>.<listcomp>  s#    ???GAJJ?1???r)   c                 b    i | ],\  }}t          |t                    s|nt          |          |-S r5   )r   ry   rS   )rN   ir   s      r(   r   z;unique_post_fn.<locals>.gen_value_index.<locals>.<dictcomp>  sI     
 
 
 1 !D))7QQuQxx!
 
 
r)   )anyr  	enumeratesorted)r(  non_null_valuesr   s     r(   gen_value_indexz'unique_post_fn.<locals>.gen_value_index  s     **6***** 	> 	C  
 @?f???
 
 "&"9"9::
 
 
 	
r)   )r
   r   r   r{   )r   r4  s   ` r(   rE   rE     s?    
 
c3h 
 
 
 
 
 
" r)   rU   c                 N      fd|D             }|rt          d| d          d S )Nc                 v    g | ]5}|                                          j                                        3|6S r5   )isnullr(  r0  )rN   r   rU   s     r(   rQ   z _validate_df.<locals>.<listcomp>  s=    UUUvBvJ4E4E4G4G4N4R4R4T4TUFUUUr)   zUnable to transform columns zJ because they contain null values. Consider imputing missing values first.)r  )rU   r    null_columnss   `  r(   ra   ra     sZ    UUUUUUUL 
D< D D D
 
 	

 
r)   seriesc                     t          d | D             d           }t          j        j                            | j                  o t          |t          t          j	        f          S )Nc              3      K   | ]}||V  	d S r"   r5   )rN   rH   s     r(   r,  z/_is_series_composed_of_lists.<locals>.<genexpr>  s'      >>W'*=*=*=*=*=>>r)   )
nextr  apitypesis_object_dtyper   r   ry   r   r   )r9  first_not_none_elements     r(   r\   r\     sa    !>>>>>  :++FL99 jrz 2? ? r)   )F)4loggingcollectionsr   	functoolsr   typingr   r   r   r   r	   r
   r   r   numpyr   r  rb   pandas.api.types"ray.air.util.data_batch_conversionr   ray.data._internal.utilr   ray.data.preprocessorr   r   r   ray.data.preprocessors.utilsr   &ray.data.preprocessors.version_supportr   ray.util.annotationsr   ray.data.datasetr   	getLoggerrr   loggerr   r   r   r   r   rw   rx   r{   r/   rE   rz   ra   rc   r\   r5   r)   r(   <module>rP     s}                T T T T T T T T T T T T T T T T T T T T             : : : : : : + + + + + +         
 = < < < < < K K K K K K * * * * * * )(((((( 
	8	$	$ W!0VWWWa
 a
 a
 a
 a
1 a
 a
 XW a
H W!0VWWWm
 m
 m
 m
 m
0 m
 m
 XW m
` WB  V
 V
 V
 V
 V
2 V
 V
  V
r W!0TUUU]v ]v ]v ]v ]v/ ]v ]v VU ]v@ W!0RSSSO
 O
 O
 O
 O
. O
 O
 TS O
n /3@  @  @ @  #Y@  	@ 
 @  T#s(^,@  @  @  @ F 4 HcUDcN=R4S    D
R\ 
S 
T 
 
 
 
 t      r)   