
    &`i#                        d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ erd d
lmZ  e j        e          Z  ed           edd           G d de                                  Z!ddde
e"         dee"ge"f         de	e"ee"ef         f         fdZ#dS )    NCounter)Number)TYPE_CHECKINGAnyCallableDictListOptionalUnion)is_categorical_dtype)Mean)SerializablePreprocessorBase)SerializablePreprocessor)	PublicAPI)Datasetalpha)	stability   z#io.ray.preprocessors.simple_imputer)version
identifierc                        e Zd ZdZg dZ	 	 ddddee         dedeeee	f                  d	eee                  f fd
Z
dddefdZdej        fdZd Zd Zdeeef         fdZdeeef         defdZ xZS )SimpleImputera  Replace missing values with imputed values. If the column is missing from a
    batch, it will be filled with the imputed value.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import SimpleImputer
        >>> df = pd.DataFrame({"X": [0, None, 3, 3], "Y": [None, "b", "c", "c"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> ds.to_pandas()  # doctest: +SKIP
             X     Y
        0  0.0  None
        1  NaN     b
        2  3.0     c
        3  3.0     c

        The `"mean"` strategy imputes missing values with the mean of non-missing
        values. This strategy doesn't work with categorical data.

        >>> preprocessor = SimpleImputer(columns=["X"], strategy="mean")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X     Y
        0  0.0  None
        1  2.0     b
        2  3.0     c
        3  3.0     c

        The `"most_frequent"` strategy imputes missing values with the most frequent
        value in each column.

        >>> preprocessor = SimpleImputer(columns=["X", "Y"], strategy="most_frequent")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X  Y
        0  0.0  c
        1  3.0  b
        2  3.0  c
        3  3.0  c

        The `"constant"` strategy imputes missing values with the value specified by
        `fill_value`.

        >>> preprocessor = SimpleImputer(
        ...     columns=["Y"],
        ...     strategy="constant",
        ...     fill_value="?",
        ... )
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X  Y
        0  0.0  ?
        1  NaN  b
        2  3.0  c
        3  3.0  c

        :class:`SimpleImputer` can also be used in append mode by providing the
        name of the output_columns that should hold the imputed values.

        >>> preprocessor = SimpleImputer(columns=["X"], output_columns=["X_imputed"], strategy="mean")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X     Y  X_imputed
        0  0.0  None        0.0
        1  NaN     b        2.0
        2  3.0     c        3.0
        3  3.0     c        3.0

    Args:
        columns: The columns to apply imputation to.
        strategy: How imputed values are chosen.

            * ``"mean"``: The mean of non-missing values. This strategy only works with numeric columns.
            * ``"most_frequent"``: The most common value.
            * ``"constant"``: The value passed to ``fill_value``.

        fill_value: The value to use when ``strategy`` is ``"constant"``.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    Raises:
        ValueError: if ``strategy`` is not ``"mean"``, ``"most_frequent"``, or
            ``"constant"``.
    )meanmost_frequentconstantr   N)output_columnscolumnsstrategy
fill_valuer   c                (   t                                                       || _        || _        || _        || j        vrt          d| d| j                   |dk    rd| _        |t          d          t          j	        ||          | _
        d S )N	Strategy z( is not supported.Supported values are: r   Fz8`fill_value` must be set when using "constant" strategy.)super__init__r   r   r    _valid_strategies
ValueError_is_fittabler   #_derive_and_validate_output_columnsr   )selfr   r   r    r   	__class__s        r/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/data/preprocessors/imputer.pyr$   zSimpleImputer.__init__p   s     	 $4111BH B B)-)?B B  
 z!! %D! N  
 )L  	    datasetr   returnc                       j         dk    r' j                            t           j                   n2 j         dk    r' j                             fdd  j                    S )Nr   )aggregator_fnr   r   c                 2    t          j        |           S )N)r-   r   key_gen)_get_most_frequent_valuesr   )r2   r-   r)   s    r+   <lambda>z$SimpleImputer._fit.<locals>.<lambda>   s"    (A# L#) ) ) r,   c                     d|  dS )Nmost_frequent() )cols    r+   r4   z$SimpleImputer._fit.<locals>.<lambda>   s    (?(?(?(? r,   )stat_fnstat_key_fnr   )r   stat_computation_planadd_aggregatorr   r   add_callable_stat)r)   r-   s   ``r+   _fitzSimpleImputer._fit   s    =F""&55"DL 6     ]o--&88    
 @? 9    r,   dfc                 (   t          | j        | j                  D ]\  }}|                     |          }|t	          d| d          ||j        vr|||<   >t          |j        |                   r$||         j                            |g          ||<   ||k    s<t          ||         j
        t          j                  r6||         j
        j        j        s||                             d          ||<   |                    ||id           |S )NzColumn zA has no fill value. Check the data used to fit the SimpleImputer.T)deep)inplace)zipr   r   _get_fill_valuer&   r   dtypescatadd_categories
isinstancevaluesnpndarrayflags	writeablecopyfillna)r)   r@   columnoutput_columnvalues        r+   _transform_pandaszSimpleImputer._transform_pandas   sC   %(t7J%K%K 	@ 	@!FM((00E} Df D D D  
 RZ''$)=!!'	&(9:: O(*6
(E(Eug(N(NB}% "V++ #2m#4#;RZHH	 ,
 !#= 1 8 > H , )+6
T(B(BB}%		=%0$	????	r,   c                     | j         dk    r| j        d| d         S | j         dk    r| j        d| d         S | j         dk    r| j        S t          d| j          d          )	Nr   zmean(r7   r   r6   r   r"   zA is not supported. Supported values are: {self._valid_strategies})r   stats_r    r&   )r)   rQ   s     r+   rE   zSimpleImputer._get_fill_value   s    =F"";0v00011]o--;9999::]j((?"ADM A A A  r,   c           
      `    | j         j         d| j        d| j        d| j        d| j        d
S )Nz	(columns=z, strategy=z, fill_value=z, output_columns=r7   )r*   __name__r   r   r    r   r)   s    r+   __repr__zSimpleImputer.__repr__   sZ    ~& 7 7 7 77 76:o7 7"17 7 7	
r,   c           	      l    | j         | j        t          | dd           | j        t          | dd           dS )N_fittedr    )r   r   r\   r   r    )r   r   getattrr   rY   s    r+   _get_serializable_fieldsz&SimpleImputer._get_serializable_fields   s?    |"1tY55!$d;;
 
 	
r,   fieldsr   c                     |d         | _         |d         | _        |d         | _        |                    d          | _        |                    d          | _        | j        dk    r	d| _        d S d S )Nr   r   r   r\   r    r   F)r   r   r   getr\   r    r'   )r)   r_   r   s      r+   _set_serializable_fieldsz&SimpleImputer._set_serializable_fields   sq    i($%56z*zz),, **\22=J&& %D '&r,   )r   N)rX   
__module____qualname____doc__r%   r
   strr   r   r   r$   r   r?   pd	DataFramerT   rE   rZ   r	   r   r^   intrb   __classcell__)r*   s   @r+   r   r      sb       Q Qf >==
 37	
 /3
 
 
c
 
 U3;/0	
 !c+
 
 
 
 
 
BI *F    $BL    @  
 
 

$sCx. 
 
 
 

&tCH~ 
& 
& 
& 
& 
& 
& 
& 
& 
&r,   r   r-   r   r   r2   r.   c                 h  	 dt           j        dt          t          t          t
                   f         ffd}|                     |d          }d D             	|                    d           D ]1}|                                D ]\  }}|D ]}	|xx         |z  cc<   2	fdD             S )	Nr@   r.   c                 "      fdD             S )Nc                     i | ]=}|t          |                                                                                   g>S r8   )r   value_countsto_dict).0r9   r@   s     r+   
<dictcomp>zJ_get_most_frequent_values.<locals>.get_pd_value_counts.<locals>.<dictcomp>   sB    TTTSgbg2244<<>>??@TTTr,   r8   )r@   r   s   `r+   get_pd_value_countsz6_get_most_frequent_values.<locals>.get_pd_value_counts   s    TTTTGTTTTr,   pandas)batch_formatc                 ,    i | ]}|t                      S r8   r   )rp   r9   s     r+   rq   z-_get_most_frequent_values.<locals>.<dictcomp>   s    888c799888r,   )
batch_sizec                 r    i | ]3} |          |                              d           d         d         4S )r   r   )most_common)rp   rQ   final_countersr2   s     r+   rq   z-_get_most_frequent_values.<locals>.<dictcomp>   sP        	/;;A>>qA!D  r,   )	rg   rh   r	   rf   r
   r   map_batchesiter_batchesitems)
r-   r   r2   rr   rn   batchr9   counterscounterry   s
    ``      @r+   r3   r3      s   
U Uc4=6H1I U U U U U U &&':&RRL88888N**d*;; / /"[[]] 	/ 	/MC# / /s###w.####/	/       r,   )$loggingcollectionsr   numbersr   typingr   r   r   r	   r
   r   r   numpyrK   rs   rg   pandas.api.typesr   ray.data.aggregater   ray.data.preprocessorr   &ray.data.preprocessors.version_supportr   Serializableray.util.annotationsr   ray.data.datasetr   	getLoggerrX   loggerr   rf   r3   r8   r,   r+   <module>r      s                L L L L L L L L L L L L L L L L L L         1 1 1 1 1 1 # # # # # # > > > > > >      + * * * * * )(((((( 
	8	$	$ Wa$IJJJP& P& P& P& P&0 P& P& KJ P&f#Y seSj! 
#uS&[!
!"	     r,   