
    &`iT                        d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
mZ d dlZd dlZd dlmZ d dlmZmZmZmZmZmZmZmZmZmZ d dlmZ erd dlm Z  d dl!m"Z"m#Z#  e j$        e%          Z& ed	
          e G d d                                  Z'e G d d                      Z(de)dee         fdZ*de)dee         fdZ+de)dee         fdZ,deed         ee)gee         f         f         fdZ-de)dddee         fdZ.de)dddeed         ee)gee         f         f         dee         fdZ/	 	 d0de	d         de	ee)                  de	eed         ee)gee         f         f                  de(fdZ0deded ej"        dee)e
eej"        f         f         fd!Z1d"ee)e2f         d#ej         d$ej         d%ee         de3f
d&Z4	 d1d(ed)e	ej"                 d*e)dej5        fd+Z6d,ee)ee)e3f         f         d-e7d#ej         d.e8dej9        f
d/Z:dS )2    N)	dataclass)TYPE_CHECKINGAnyCallableDictListOptionalTupleUnion)convert_to_pyarrow_array)
AggregateFnV2ApproximateQuantileApproximateTopKCountMaxMeanMinMissingValuePercentageStdZeroPercentage)	PublicAPI)SchemaDataTypeTypeCategoryalpha)	stabilityc                       e Zd ZU dZdZej        ed<   ej        ed<   ej        ed<   e	e
         ed<   dej        fdZd	ej        d
ej        fdZd Zdej        de
d
ee         fdZde
fdZdS )DatasetSummaryzWrapper for dataset summary statistics.

    Provides methods to access computed statistics.

    Attributes:
        dataset_schema: PyArrow schema of the original dataset
    	statistic_stats_matching_column_dtype_stats_mismatching_column_dtypedataset_schemacolumnstablec                 j   ddl m} 	 |                    |                                          S # t          t
          t          j        f$ r}t          	                    d| d           i }|j
        j        D ]}|                    |          }	 |                                ||<   0# t          t
          t          j        f$ rN t          j        t          |          t          j                              }|                                ||<   Y w xY wt!          j        |          cY d}~S d}~ww xY w)zSafely convert a PyArrow table to pandas, handling problematic extension types.

        Args:
            table: PyArrow table to convert

        Returns:
            pandas DataFrame with converted data
        r   )BlockAccessorz$Direct conversion to pandas failed (z)), attempting column-by-column conversiontypeN)ray.data.blockr'   	for_block	to_pandas	TypeError
ValueErrorpaArrowInvalidloggerwarningschemanamescolumnnullslennullpd	DataFrame)selfr%   r'   eresult_datacol_namecolnull_cols           b/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/data/stats.py_safe_convert_tablez"DatasetSummary._safe_convert_table:   s]    	100000	- **511;;===:r7 	- 	- 	-NN9q 9 9 9  
 K!L. A All8,,A,/MMOOK))!:r? A A A!xCrwyyAAAH,4,>,>,@,@K)))A
 <,,,,,,,,!	-sA   &/ D2AD-B'&D-'A'DD-DD-'D2-D2dfreturnc                 x    | j         |j        v r|                    | j                   S t          j                    S )zSet the statistic column as index if it exists, else return empty DataFrame.

        Args:
            df: DataFrame to set index on

        Returns:
            DataFrame with statistic column as index, or empty DataFrame if column missing
        )STATISTIC_COLUMNr$   	set_indexr9   r:   )r;   rC   s     rA   _set_statistic_indexz#DatasetSummary._set_statistic_indexY   s5      BJ..<< 5666|~~    c                    |                      |                     | j                            }|                      |                     | j                            }|j        r"|j        rt          j        | j        g          S |                    |          }|	                                
                    | j                  	                    d          S )a  Convert summary to a single pandas DataFrame.

        Combines statistics from both schema-matching and schema-changing tables.

        Note: Some PyArrow extension types (like TensorExtensionType) may fail to convert
        to pandas when all values in a column are None. In such cases, this method
        attempts to convert column-by-column, casting problematic columns to null type.

        Returns:
            DataFrame with all statistics, where rows are unique statistics from both tables
        r$   Tdrop)rH   rB   r!   r"   emptyr9   r:   rF   combine_firstreset_indexsort_values)r;   df_matchingdf_changingresults       rA   r,   zDatasetSummary.to_pandasf   s     //$$T%FGG
 
 //$$T%IJJ
 

  	A!2 	A<)>(?@@@@ **;77   [.//[d[##	
rI   r5   c                     ||j         j        vrdS |                     |          | j        |g         }|                    |di          S )a  Extract a column from a PyArrow table if it exists.

        Args:
            table: PyArrow table to extract from
            column: Column name to extract

        Returns:
            DataFrame with 'statistic' and 'value' columns, or None if column doesn't exist
        NvaluerK   )r3   r4   rB   rF   rename)r;   r%   r5   rC   s       rA   _extract_column_from_tablez)DatasetSummary._extract_column_from_table   sR     +++4%%e,,d.CV-LMyy&'!2y333rI   c                      fd j          j        fD             }|st          d d          t          j        |d          }d }|                     j        d          d	                             |                                          	                     j                                      d
          }|S )zGet all statistics for a specific column, merging from both tables.

        Args:
            column: Column name to get statistics for

        Returns:
            DataFrame with all statistics for the column
        c                 D    g | ]}                     |          xS )N)rX   ).0r%   r5   rC   r;   s     rA   
<listcomp>z3DatasetSummary.get_column_stats.<locals>.<listcomp>   sC     
 
 
 55eVDDDQ 
 RQQrI   zColumn 'z' not found in summary tablesT)ignore_indexc                 n    |                                  }t          |          dk    r|j        d         nd S )Nr   )dropnar7   iloc)seriesnon_nulls     rA   first_non_nullz7DatasetSummary.get_column_stats.<locals>.first_non_null   s1    }}H'*8}}q'8'88=##dBrI   F)sortrV   rL   )
r!   r"   r.   r9   concatgroupbyrF   applyrP   rQ   )r;   r5   dfscombinedrc   rT   rC   s   ``    @rA   get_column_statszDatasetSummary.get_column_stats   s    
 
 
 
 
 
 14
 
 
  	OMMMMNNN 9St444	C 	C 	C
 T2??HU>""[]][.//[d[## 	 rI   N)__name__
__module____qualname____doc__rF   r/   Table__annotations__r   liststrrB   r9   r:   rH   r,   r	   dictrX   rj    rI   rA   r   r      s          # #%(***%'X---I#Y- - - - ->r|     
 
 
@4X4'*4	$4 4 4 4$%s % % % % % %rI   r   c                   D    e Zd ZU dZeeef         ed<   ee         ed<   dS )_DtypeAggregatorszContainer for columns and their aggregators.

    Attributes:
        column_to_dtype: Mapping from column name to dtype string representation
        aggregators: List of all aggregators to apply
    column_to_dtypeaggregatorsN)	rk   rl   rm   rn   r   rr   rp   r   r   rt   rI   rA   rv   rv      sC           #s(^###m$$$$$$rI   rv   r5   rD   c                    t          | d          t          | d          t          | d          t          | d          t	          | dd          t          | dg          t          |           t          | d          gS )	a  Generate default metrics for numerical columns.

    This function returns a list of aggregators that compute the following metrics:
    - count
    - mean
    - min
    - max
    - std
    - approximate_quantile (median)
    - missing_value_percentage
    - zero_percentage

    Args:
        column: The name of the numerical column to compute metrics for.

    Returns:
        A list of AggregateFnV2 instances that can be used with Dataset.aggregate()
    Fonignore_nullsTr   )r{   r|   ddofg      ?)r{   	quantilesr{   )r   r   r   r   r   r   r   r   r5   s    rA   _numerical_aggregatorsr      s    ( 	e,,,T***vD)))vD)))vDq111v#777&)))&t444	 	rI   c                     t          | d          t          | d          t          | d          t          |           gS )a|  Generate default metrics for temporal columns.

    This function returns a list of aggregators that compute the following metrics:
    - count
    - min
    - max
    - missing_value_percentage

    Args:
        column: The name of the temporal column to compute metrics for.

    Returns:
        A list of AggregateFnV2 instances that can be used with Dataset.aggregate()
    Frz   Tr   )r   r   r   r   r   s    rA   _temporal_aggregatorsr      sL      	e,,,vD)))vD)))&)))	 rI   c                 d    t          | d          t          |           t          | d          gS )a  Generate default metrics for all columns.

    This function returns a list of aggregators that compute the following metrics:
    - count
    - missing_value_percentage
    - approximate_top_k (top 10 most frequent values)

    Args:
        column: The name of the column to compute metrics for.

    Returns:
        A list of AggregateFnV2 instances that can be used with Dataset.aggregate()
    Frz   r   
   )r{   k)r   r   r   r   s    rA   _basic_aggregatorsr     s<     	e,,,&)))6R((( rI   c                  6   ddl m} m}  | j                    t           | j                    t           | j                    t           | j                    t           | j                    t           | j	                    t           | j
                    t           | j                    t           | j                    t           | j                    t           | j                    t           | j                    t            | j                    t           |j        t&          iS )a  Get default mapping from Ray Data DataType to aggregator factory functions.

    This function returns factory functions that create aggregators for specific columns.

    Returns:
        Dict mapping DataType or TypeCategory to factory functions that take a column name
        and return a list of aggregators for that column.

    Examples:
        >>> from ray.data.datatype import DataType
        >>> from ray.data.stats import _default_dtype_aggregators
        >>> mapping = _default_dtype_aggregators()
        >>> factory = mapping.get(DataType.int32())
        >>> aggs = factory("my_column")  # Creates aggregators for "my_column"
    r   r   )ray.data.datatyper   r   int8r   int16int32int64uint8uint16uint32uint64float32float64boolstringr   binaryTEMPORALr   r   s     rA   _default_dtype_aggregatorsr     s    $ 98888888
 	/000011122/--4# rI   dtyper   c           
         	 |                                 r6t          j                            |j                  rt          | d          gS |                                rt          |           S |                                rt          |           S t          |           S # t          $ r=}t                              d|  d| d| d           t          |           cY d}~S d}~ww xY w)a!  Get aggregators using heuristic-based type detection.

    This is a fallback when no explicit mapping is found for the dtype.

    Args:
        column: Column name
        dtype: Ray Data DataType for the column

    Returns:
        List of aggregators suitable for the column type
    Frz   z,Could not determine aggregators for column 'z' with dtype z: z. Using basic aggregators.N)is_arrow_typer/   typesis_null_physical_dtyper   is_numerical_typer   is_temporal_typer   r   	Exceptionr1   r2   )r5   r   r<   s      rA   _get_fallback_aggregatorsr   D  s#   *   	.RX%5%5e6K%L%L 	.V%88899$$&& 	.)&111##%% 	.(000 &f--- * * *(6 ( (PU ( (YZ ( ( (	
 	
 	
 "&))))))))*s0   A	B! "B! /"B! B! !
C(+2C#C(#C(dtype_agg_mappingc                    ddl m}m} |                                D ]a\  }}t	          ||          r||k    r ||           c S t	          ||t
          f          r"|                    |          r ||           c S bt          | |          S )a  Get aggregators for a specific column based on its DataType.

    Attempts to match the dtype against the provided mapping first, then
    falls back to heuristic-based selection if no match is found.

    Args:
        column: Column name
        dtype: Ray Data DataType for the column
        dtype_agg_mapping: Mapping from DataType to factory functions

    Returns:
        List of aggregators with the column name properly set
    r   r   )r   r   r   items
isinstancerr   is_ofr   )r5   r   r   r   r   mapping_keyfactorys          rA   _get_aggregators_for_dtyper   d  s    ( 98888888 !2 7 7 9 9 # #Wk8,, 	#+1E1E76??"""lC%899 	#ekk+>V>V 	#76??""" %VU333rI   r3   r   r$   c                    ddl m} | st          d          || j        }t	          |          t	          | j                  z
  }|rt          d| d          t                      }|r8|                                }|                                D ]\  }}||vr|||<   n|}i }	g }
t          t          | j        | j
                            }|D ]z}||         }|	|t          u rt                              d| d           4 |j        |          }t          |          |	|<   |
                    t#          |||                     {t%          |	|
	          S )
a  Generate aggregators for columns in a dataset based on their DataTypes.

    Args:
        schema: A Ray Schema instance
        columns: List of columns to include. If None, all columns will be included.
        dtype_agg_mapping: Optional user-provided mapping from DataType to aggregator factories.
            Each value should be a callable that takes a column name and returns aggregators.
            This will be merged with the default mapping (user mapping takes precedence).

    Returns:
        _DtypeAggregators containing column-to-dtype mapping and aggregators

    Raises:
        ValueError: If schema is None or if specified columns don't exist in schema
    r   r   z4Dataset must have a schema to determine column typesNzColumns z not found in dataset schemazSkipping field 'z': type is None or unsupported)rw   rx   )r   r   r.   r4   setr   copyr   rs   zipr   objectr1   r2   
from_arrowrr   extendr   rv   )r3   r$   r   r   missing_colsdefaultsfinal_mappingr   vrw   all_aggsname_to_typenamepa_type	ray_dtypes                  rA   _dtype_aggregators_for_datasetr     s   , +***** QOPPP, w<<#fl"3"33L PNLNNNOOO *++H !)..00NN$$ 	% 	%DAq%%#$a 	% ! OHFL&,7788L T Tt$?g//NNRdRRRSSS'H'00	 #I24MRRSSSS'   rI   aggrV   agg_typec                    ddl m} |                                 t          j                            |          p! |j        |                                          }t          |t                    s|s|rq |j        |                                          r|j
        n||nCd t          t          |                    D             }fdt          ||          D             S ||fiS )a  Format aggregation result into stat entries.

    Takes the raw aggregation result and formats it into one or more stat
    entries. For scalar results, returns a single entry. For list results,
    expands into multiple indexed entries.

    Args:
        agg: The aggregator instance
        value: The aggregation result value
        agg_type: PyArrow type of the aggregation result

    Returns:
        Dictionary mapping stat names to (value, type) tuples
    r   r   Nc                 ,    g | ]}t          |          S rt   )rr   )r[   idxs     rA   r\   z!_format_stats.<locals>.<listcomp>  s    <<<3c#hh<<<rI   c                 ,    i | ]\  }} d | d|fS )[]rt   )r[   labellist_valagg_namescalar_types      rA   
<dictcomp>z!_format_stats.<locals>.<dictcomp>  sE       #E8 &&e&&&;(?  rI   )r   r   get_agg_namer/   r   is_listr   is_list_typer   rq   
value_typeranger7   r   )r   rV   r   r   r   labelsr   r   s         @@rA   _format_statsr     s(   " +*****!!H 	""R&9h&9(&C&C&P&P&R&R  % 5=\= #x"8,,99;;H 	
 =<<%E

*;*;<<<F    '*65'9'9    uh'((rI   
agg_resultoriginal_schema
agg_schemarx   c                 F   i }i }t                      }d |D             }|                                 D ]\  }}	d|vs|                    d          s|                    |          }
|
s7|
                                }|sN|                    |          j        }|                    |          j        }t          |
|	|          }|                                D ]-\  }\  }}||k    r|n|}||f|                    |i           |<   .|	                    |           |||fS )a  Parse aggregation results into schema-matching and schema-changing stats.

    Args:
        agg_result: Dictionary of aggregation results with keys like "count(col)"
        original_schema: Original dataset schema
        agg_schema: Schema of aggregation results
        aggregators: List of aggregators used to generate the results

    Returns:
        Tuple of (schema_matching_stats, schema_changing_stats, column_names)
    c                     i | ]
}|j         |S rt   )r   )r[   r   s     rA   r   z(_parse_summary_stats.<locals>.<dictcomp>  s    777C#(C777rI   ())
r   r   endswithgetget_target_columnfieldr)   r   
setdefaultadd)r   r   r   rx   schema_matchingschema_changingr$   
agg_lookupkeyrV   r   r>   r   original_typeformatted_stats	stat_name
stat_value	stat_type
stats_dicts                      rA   _parse_summary_statsr     sf   " OOeeG 87;777J &&((  
Uc>>c!2!2> nnS!! 	((** 	 ##C((-'--h77<'UH==2A2G2G2I2I 	U 	U.I.
I $-#=#=?  ?I)=TJ!!)R00::HOW44rI    col_datacol_typer>   c                     |8	 t          j        | |          S # t           j        t           j        f$ r Y nw xY wt	          | |pd          S )a{  Create a PyArrow array with fallback strategies.

    Uses convert_to_pyarrow_array from arrow_block.py for type inference and
    error handling when no specific type is provided.

    Args:
        col_data: List of column values
        col_type: Optional PyArrow type to use
        col_name: Column name for error messages (optional)

    Returns:
        PyArrow array
    Nr(   r5   )r/   arrayArrowTypeErrorr0   r   )r   r   r>   s      rA   _create_pyarrow_arrayr   /  se      	8H84444!2?3 	 	 	D	 $Hh.B(CCCs    88r   all_columnspreserve_typesc                    | st          j        i           S t          |                                           }t          j        |i}t          |          D ]}g }d}|D ]L}	|| |	         v r+| |	         |         \  }
}|                    |
           ||}7|                    d           M|r$||j        v r|                    |          j	        }n|}t          |||          ||<   t          j        |          S )as  Build a PyArrow table from parsed statistics.

    Args:
        stats_dict: Nested dict of {stat_name: {col_name: (value, type)}}
        all_columns: Set of all column names across both tables
        original_schema: Original dataset schema
        preserve_types: If True, use original schema types for columns

    Returns:
        PyArrow table with statistics
    N)r/   r%   sortedkeysr   rF   appendr4   r   r)   r   )r   r   r   r   
stat_names
table_datar>   r   
first_typer   rV   r   r   s                rA   _build_summary_tabler   K  s#   "  x||
))**J 1:>J;'' S S
# 	& 	&I:i000",Y"7"Ax&&&%!)J%%%%  	"h/*???&,,X66;HH!H4XxRR
88JrI   )NN)Nr   );loggingdataclassesr   typingr   r   r   r   r   r	   r
   r   pandasr9   pyarrowr/   $ray.air.util.tensor_extensions.arrowr   ray.data.aggregater   r   r   r   r   r   r   r   r   r   ray.util.annotationsr   ray.data.datasetr   r   r   r   	getLoggerrk   r1   r   rv   rr   r   r   r   r   r   r   r   r   anytupler   Arrayr   r   r   ro   r   rt   rI   rA   <module>r     s    ! ! ! ! ! ! S S S S S S S S S S S S S S S S S S S S         I I I I I I                        + * * * * * 9''''''88888888 
	8	$	$ W
\ \ \ \ \ \ \  \~ 	% 	% 	% 	% 	% 	% 	% 	%3 4+>    ># $}*=    .s tM':    *)D	
$%xtM7J0J'KK% ) ) ) )X*c ** *mAT * * * *@444 ()8SE4;N4N+OO4 
-4 4 4 4F $( 	@ @X@d3i @  U-.#]@S9S0TTU@ @ @ @ @F-)	-)"-).0k-)	#uS"+%&
&'-) -) -) -)`45S#X45Y45 	45 m$	45
 45 45 45 45p MOD DD&r{3DFIDXD D D D8- S$sEz**+- -  Y-  	- 
 X-  -  -  -  -  - rI   