
    &`i4I                     &   d dl mZmZmZmZmZmZ d dlZd dl	Z
d dlmZmZmZmZmZmZ d dlmZmZ d dlmZ d dlmZ erd dlmZ  ed	           ed
d           G d de                                  Z ed	           ed
d           G d de                                  Z ed	           ed
d           G d de                                  Z ed	           ed
d           G d de                                  ZdS )    )TYPE_CHECKINGAnyDictListOptionalTupleN)AbsMaxApproximateQuantileMaxMeanMinStd)PreprocessorSerializablePreprocessorBase)SerializablePreprocessor)	PublicAPI)Datasetalpha)	stability   z$io.ray.preprocessors.standard_scaler)version
identifierc                        e Zd ZdZddee         deee                  f fdZdddefd	Z	d
e
j        fdZdeeef         fdZdeeef         defdZd Z xZS )StandardScalerav	  Translate and scale each column by its mean and standard deviation,
    respectively.

    The general formula is given by

    .. math::

        x' = \frac{x - \bar{x}}{s}

    where :math:`x` is the column, :math:`x'` is the transformed column,
    :math:`\bar{x}` is the column average, and :math:`s` is the column's sample
    standard deviation. If :math:`s = 0` (i.e., the column is constant-valued),
    then the transformed column will contain zeros.

    .. warning::
        :class:`StandardScaler` works best when your data is normal. If your data isn't
        approximately normal, then the transformed features won't be meaningful.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import StandardScaler
        >>>
        >>> df = pd.DataFrame({"X1": [-2, 0, 2], "X2": [-3, -3, 3], "X3": [1, 1, 1]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> ds.to_pandas()  # doctest: +SKIP
           X1  X2  X3
        0  -2  -3   1
        1   0  -3   1
        2   2   3   1

        Columns are scaled separately.

        >>> preprocessor = StandardScaler(columns=["X1", "X2"])
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
                 X1        X2  X3
        0 -1.224745 -0.707107   1
        1  0.000000 -0.707107   1
        2  1.224745  1.414214   1

        Constant-valued columns get filled with zeros.

        >>> preprocessor = StandardScaler(columns=["X3"])
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
           X1  X2   X3
        0  -2  -3  0.0
        1   0  -3  0.0
        2   2   3  0.0

        >>> preprocessor = StandardScaler(
        ...     columns=["X1", "X2"],
        ...     output_columns=["X1_scaled", "X2_scaled"]
        ... )
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
           X1  X2  X3  X1_scaled  X2_scaled
        0  -2  -3   1  -1.224745  -0.707107
        1   0  -3   1   0.000000  -0.707107
        2   2   3   1   1.224745   1.414214

    Args:
        columns: The columns to separately scale.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.
    Ncolumnsoutput_columnsc                     t                                                       || _        t          j        ||          | _        d S Nsuper__init__r   r   #_derive_and_validate_output_columnsr   selfr   r   	__class__s      q/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/data/preprocessors/scaler.pyr!   zStandardScaler.__init__U   @    *N^
 
    datasetr   returnc                     | j                             t          | j                   | j                             d | j                   | S )N)aggregator_fnr   c                 $    t          | d          S )Nr   )ddof)r   )cols    r&   <lambda>z%StandardScaler._fit.<locals>.<lambda>b   s    c#A&6&6&6 r(   )stat_computation_planadd_aggregatorr   r   )r$   r)   s     r&   _fitzStandardScaler._fit\   s^    "11L 	2 	
 	
 	
 	"1166L 	2 	
 	
 	
 r(   dfc                 |     dt           j        f fd}| j                                     |          | j        <   |S )Nsc                     j         d| j         d         }j         d| j         d         }||t          j        | d d <   | S |dk    rd}| |z
  |z  S )Nzmean()zstd(r   r   )stats_namenpnan)r6   s_means_stdr$   s      r&   column_standard_scalerz@StandardScaler._transform_pandas.<locals>.column_standard_scalerh   ss    [!2!2!2!23FK 0qv 0 0 01E}v!!! zzJ%''r(   pdSeriesr   	transformr   )r$   r4   r?   s   `  r&   _transform_pandasz StandardScaler._transform_pandasg   sT    	(bi 	( 	( 	( 	( 	( 	( #%T\"2"<"<=S"T"T4	r(   c                 @    | j         | j        t          | dd           dS N_fitted)r   r   rG   r   r   getattrr$   s    r&   _get_serializable_fieldsz'StandardScaler._get_serializable_fieldsz   +    |"1tY55
 
 	
r(   fieldsr   c                 n    |d         | _         |d         | _        |                    d          | _        d S Nr   r   rG   r   r   getrG   r$   rM   r   s      r&   _set_serializable_fieldsz'StandardScaler._set_serializable_fields   3    i($%56zz),,r(   c                 @    | j         j         d| j        d| j        dS N	(columns=z, output_columns=r8   r%   __name__r   r   rJ   s    r&   __repr__zStandardScaler.__repr__   ,    .)mmDLmmUYUhmmmmr(   r   rY   
__module____qualname____doc__r   strr   r!   r   r3   rA   	DataFramerD   r   r   rK   intrS   rZ   __classcell__r%   s   @r&   r   r      s       A AF
 
S	 
8DI;N 
 
 
 
 
 
	I 	, 	 	 	 	BL    &
$sCx. 
 
 
 
-tCH~ - - - - -n n n n n n nr(   r   z#io.ray.preprocessors.min_max_scalerc                        e Zd ZdZddee         deee                  f fdZdddefd	Z	d
e
j        fdZdeeef         fdZdeeef         defdZd Z xZS )MinMaxScalera  Scale each column by its range.

    The general formula is given by

    .. math::

        x' = \frac{x - \min(x)}{\max{x} - \min{x}}

    where :math:`x` is the column and :math:`x'` is the transformed column. If
    :math:`\max{x} - \min{x} = 0` (i.e., the column is constant-valued), then the
    transformed column will get filled with zeros.

    Transformed values are always in the range :math:`[0, 1]`.

    .. tip::
        This can be used as an alternative to :py:class:`StandardScaler`.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import MinMaxScaler
        >>>
        >>> df = pd.DataFrame({"X1": [-2, 0, 2], "X2": [-3, -3, 3], "X3": [1, 1, 1]})   # noqa: E501
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> ds.to_pandas()  # doctest: +SKIP
           X1  X2  X3
        0  -2  -3   1
        1   0  -3   1
        2   2   3   1

        Columns are scaled separately.

        >>> preprocessor = MinMaxScaler(columns=["X1", "X2"])
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
            X1   X2  X3
        0  0.0  0.0   1
        1  0.5  0.0   1
        2  1.0  1.0   1

        Constant-valued columns get filled with zeros.

        >>> preprocessor = MinMaxScaler(columns=["X3"])
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
           X1  X2   X3
        0  -2  -3  0.0
        1   0  -3  0.0
        2   2   3  0.0

        >>> preprocessor = MinMaxScaler(columns=["X1", "X2"], output_columns=["X1_scaled", "X2_scaled"])
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
           X1  X2  X3  X1_scaled  X2_scaled
        0  -2  -3   1        0.0        0.0
        1   0  -3   1        0.5        0.0
        2   2   3   1        1.0        1.0

    Args:
        columns: The columns to separately scale.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.
    Nr   r   c                     t                                                       || _        t          j        ||          | _        d S r   r   r#   s      r&   r!   zMinMaxScaler.__init__   r'   r(   r)   r   r*   c                 Z      fdt           t          fD             } |j        |  _         S )Nc                 :    g | ]}j         D ]} ||          S  )r   ).0Aggr/   r$   s      r&   
<listcomp>z%MinMaxScaler._fit.<locals>.<listcomp>   s0    MMM3MMcc#hhMMMMr(   )r   r   	aggregater9   r$   r)   
aggregatess   `  r&   r3   zMinMaxScaler._fit   s6    MMMM3*MMM
'g'4r(   r4   c                 |     dt           j        f fd}| j                                     |          | j        <   |S )Nr6   c                     j         d| j         d         }j         d| j         d         }||z
  }|dk    rd}| |z
  |z  S )Nzmin(r8   zmax(r   r   r9   r:   )r6   s_mins_maxdiffr$   s       r&   column_min_max_scalerz=MinMaxScaler._transform_pandas.<locals>.column_min_max_scaler   s_    K 0qv 0 0 01EK 0qv 0 0 01E5=D qyyI%%r(   r@   )r$   r4   rw   s   `  r&   rD   zMinMaxScaler._transform_pandas   sT    
	&RY 
	& 
	& 
	& 
	& 
	& 
	& #%T\"2"<"<=R"S"S4	r(   c                 @    | j         | j        t          | dd           dS rF   rH   rJ   s    r&   rK   z%MinMaxScaler._get_serializable_fields   rL   r(   rM   r   c                 n    |d         | _         |d         | _        |                    d          | _        d S rO   rP   rR   s      r&   rS   z%MinMaxScaler._set_serializable_fields   rT   r(   c                 @    | j         j         d| j        d| j        dS rV   rX   rJ   s    r&   rZ   zMinMaxScaler.__repr__   r[   r(   r   r\   rd   s   @r&   rf   rf      s
       = =~
 
S	 
8DI;N 
 
 
 
 
 
I ,    
BL     
$sCx. 
 
 
 
-tCH~ - - - - -n n n n n n nr(   rf   z#io.ray.preprocessors.max_abs_scalerc                        e Zd ZdZddee         deee                  f fdZdddefd	Z	d
e
j        fdZdeeef         fdZdeeef         defdZd Z xZS )MaxAbsScalera@  Scale each column by its absolute max value.

    The general formula is given by

    .. math::

        x' = \frac{x}{\max{\vert x \vert}}

    where :math:`x` is the column and :math:`x'` is the transformed column. If
    :math:`\max{\vert x \vert} = 0` (i.e., the column contains all zeros), then the
    column is unmodified.

    .. tip::
        This is the recommended way to scale sparse data. If you data isn't sparse,
        you can use :class:`MinMaxScaler` or :class:`StandardScaler` instead.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import MaxAbsScaler
        >>>
        >>> df = pd.DataFrame({"X1": [-6, 3], "X2": [2, -4], "X3": [0, 0]})   # noqa: E501
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> ds.to_pandas()  # doctest: +SKIP
           X1  X2  X3
        0  -6   2   0
        1   3  -4   0

        Columns are scaled separately.

        >>> preprocessor = MaxAbsScaler(columns=["X1", "X2"])
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
            X1   X2  X3
        0 -1.0  0.5   0
        1  0.5 -1.0   0

        Zero-valued columns aren't scaled.

        >>> preprocessor = MaxAbsScaler(columns=["X3"])
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
           X1  X2   X3
        0  -6   2  0.0
        1   3  -4  0.0

        >>> preprocessor = MaxAbsScaler(columns=["X1", "X2"], output_columns=["X1_scaled", "X2_scaled"])
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
           X1  X2  X3  X1_scaled  X2_scaled
        0  -2  -3   1       -1.0       -1.0
        1   0  -3   1        0.0       -1.0
        2   2   3   1        1.0        1.0

    Args:
        columns: The columns to separately scale.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.
    Nr   r   c                     t                                                       || _        t          j        ||          | _        d S r   r   r#   s      r&   r!   zMaxAbsScaler.__init__:  r'   r(   r)   r   r*   c                 F    d | j         D             } |j        | | _        | S )Nc                 ,    g | ]}t          |          S rj   )r	   )rk   r/   s     r&   rm   z%MaxAbsScaler._fit.<locals>.<listcomp>B  s    :::cfSkk:::r(   )r   rn   r9   ro   s      r&   r3   zMaxAbsScaler._fitA  s-    ::T\:::
'g'4r(   r4   c                 |     dt           j        f fd}| j                                     |          | j        <   |S )Nr6   c                 J    j         d| j         d         }|dk    rd}| |z  S )Nzabs_max(r8   r   r   rs   )r6   	s_abs_maxr$   s     r&   column_abs_max_scalerz=MaxAbsScaler._transform_pandas.<locals>.column_abs_max_scalerG  s7    $8qv$8$8$89I A~~	y= r(   r@   )r$   r4   r   s   `  r&   rD   zMaxAbsScaler._transform_pandasF  sT    	!RY 	! 	! 	! 	! 	! 	! #%T\"2"<"<=R"S"S4	r(   c                 @    | j         | j        t          | dd           dS rF   rH   rJ   s    r&   rK   z%MaxAbsScaler._get_serializable_fieldsT  rL   r(   rM   r   c                 n    |d         | _         |d         | _        |                    d          | _        d S rO   rP   rR   s      r&   rS   z%MaxAbsScaler._set_serializable_fields[  rT   r(   c                 @    | j         j         d| j        d| j        dS rV   rX   rJ   s    r&   rZ   zMaxAbsScaler.__repr__b  r[   r(   r   r\   rd   s   @r&   r|   r|      s
       9 9v
 
S	 
8DI;N 
 
 
 
 
 
I ,    
BL    
$sCx. 
 
 
 
-tCH~ - - - - -n n n n n n nr(   r|   z"io.ray.preprocessors.robust_scalerc            	            e Zd ZdZdZddefdee         deeef         de	ee                  de
f fd	Zd
ddefdZdej        fdZdeeef         fdZdeeef         de
fdZd Z xZS )RobustScaleraI  Scale and translate each column using approximate quantiles.

    The general formula is given by

    .. math::
        x' = \frac{x - \mu_{1/2}}{\mu_h - \mu_l}

    where :math:`x` is the column, :math:`x'` is the transformed column,
    :math:`\mu_{1/2}` is the column median. :math:`\mu_{h}` and :math:`\mu_{l}` are the
    high and low quantiles, respectively. By default, :math:`\mu_{h}` is the third
    quartile and :math:`\mu_{l}` is the first quartile.

    Internally, the `ApproximateQuantile` aggregator is used to calculate the
    approximate quantiles.

    .. tip::
        This scaler works well when your data contains many outliers.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import RobustScaler
        >>>
        >>> df = pd.DataFrame({
        ...     "X1": [1, 2, 3, 4, 5],
        ...     "X2": [13, 5, 14, 2, 8],
        ...     "X3": [1, 2, 2, 2, 3],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> ds.to_pandas()  # doctest: +SKIP
           X1  X2  X3
        0   1  13   1
        1   2   5   2
        2   3  14   2
        3   4   2   2
        4   5   8   3

        :class:`RobustScaler` separately scales each column.

        >>> preprocessor = RobustScaler(columns=["X1", "X2"])
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
            X1     X2  X3
        0 -1.0  0.625   1
        1 -0.5 -0.375   2
        2  0.0  0.750   2
        3  0.5 -0.750   2
        4  1.0  0.000   3

        >>> preprocessor = RobustScaler(
        ...    columns=["X1", "X2"],
        ...    output_columns=["X1_scaled", "X2_scaled"]
        ... )
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
           X1  X2  X3  X1_scaled  X2_scaled
        0   1  13   1       -1.0      0.625
        1   2   5   2       -0.5     -0.375
        2   3  14   2        0.0      0.750
        3   4   2   2        0.5     -0.750
        4   5   8   3        1.0      0.000

    Args:
        columns: The columns to separately scale.
        quantile_range: A tuple that defines the lower and upper quantiles. Values
            must be between 0 and 1. Defaults to the 1st and 3rd quartiles:
            ``(0.25, 0.75)``.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.
        quantile_precision: Controls the accuracy and memory footprint of the sketch (K in KLL);
            higher values yield lower error but use more memory. Defaults to 800. See
            https://datasketches.apache.org/docs/KLL/KLLAccuracyAndSize.html
            for details on accuracy and size.
    i   )g      ?g      ?Nr   quantile_ranger   quantile_precisionc                     t                                                       || _        || _        || _        t          j        ||          | _        d S r   )r    r!   r   r   r   r   r"   r   )r$   r   r   r   r   r%   s        r&   r!   zRobustScaler.__init__  sR     	,"4*N^
 
r(   r)   r   r*   c                      j         d         d j         d         g fd j        D             } |j        | }i  _         j        D ]<}|d| d         \  }}}| j        d| d<   | j        d| d<   | j        d	| d<   = S )
Nr   g      ?r   c                 >    g | ]}t          |j                   S ))on	quantilesr   )r
   r   )rk   r/   r   r$   s     r&   rm   z%RobustScaler._fit.<locals>.<listcomp>  sF     
 
 
   ##'#:  
 
 
r(   zapprox_quantile(r8   low_quantile(median(high_quantile()r   r   rn   r9   )	r$   r)   rp   
aggregatedr/   low_qmed_qhigh_qr   s	   `       @r&   r3   zRobustScaler._fit  s    ""
	

 
 
 
 
 |
 
 

 'W&
3
< 	: 	:C#-.G.G.G.G#H E5&27DK..../,1DK(#((()39DK////00r(   r4   c                 |     dt           j        f fd}| j                                     |          | j        <   |S )Nr6   c                     j         d| j         d         }j         d| j         d         }j         d| j         d         }||z
  }|dk    rt          j        |           S | |z
  |z  S )Nr   r8   r   r   r   )r9   r:   r;   
zeros_like)r6   s_low_qs_medians_high_qrv   r$   s        r&   column_robust_scalerz<RobustScaler._transform_pandas.<locals>.column_robust_scaler  s    k";!&";";";<G{#6QV#6#6#67H{#=AF#=#=#=>Hg%D qyy}Q'''LD((r(   r@   )r$   r4   r   s   `  r&   rD   zRobustScaler._transform_pandas  sT    	)BI 	) 	) 	) 	) 	) 	) #%T\"2"<"<=Q"R"R4	r(   c           	      X    | j         | j        | j        | j        t	          | dd           dS )NrG   )r   r   r   r   rG   )r   r   r   r   rI   rJ   s    r&   rK   z%RobustScaler._get_serializable_fields  s7    |"1"1"&"9tY55
 
 	
r(   rM   r   c                     |d         | _         |d         | _        |d         | _        |d         | _        |                    d          | _        d S )Nr   r   r   r   rG   )r   r   r   r   rQ   rG   rR   s      r&   rS   z%RobustScaler._set_serializable_fields  sO    i($%56$%56"()=">zz),,r(   c                 P    | j         j         d| j        d| j        d| j        dS )NrW   z, quantile_range=z), output_columns=r8   )r%   rY   r   r   r   rJ   s    r&   rZ   zRobustScaler.__repr__  sL    ~& 7 7 7 7"17 7"17 7 7	
r(   )rY   r]   r^   r_   DEFAULT_QUANTILE_PRECISIONr   r`   r   floatr   rb   r!   r   r3   rA   ra   rD   r   r   rK   rS   rZ   rc   rd   s   @r&   r   r   f  s9       I IV "%
 /;.2"<
 
c
 eUl+
 !c+	

  
 
 
 
 
 
 I ,    2BL    "
$sCx. 
 
 
 
-tCH~ - - - - -
 
 
 
 
 
 
r(   r   )typingr   r   r   r   r   r   numpyr;   pandasrA   ray.data.aggregater	   r
   r   r   r   r   ray.data.preprocessorr   r   &ray.data.preprocessors.version_supportr   ray.util.annotationsr   ray.data.datasetr   r   rf   r|   r   rj   r(   r&   <module>r      s   B B B B B B B B B B B B B B B B         O O O O O O O O O O O O O O O O L L L L L L L L K K K K K K * * * * * * )(((((( W!0VWWWxn xn xn xn xn1 xn xn XW xnv W!0UVVVkn kn kn kn kn/ kn kn WV kn\ W!0UVVVen en en en en/ en en WV enP W!0TUUU_
 _
 _
 _
 _
/ _
 _
 VU _
 _
 _
r(   