
    Pi["                         d dl mZ d dlmZmZmZ d dlZd dlm	Z
 d dlmZ d dlZd dlmZ d dlmZ ej        j                            e          Ze G d dej                              Z G d d	ej                  ZdS )
    )	dataclass)LiteralOptionalUnionN)Key)
table_castc                        e Zd ZU dZdZee         ed<   dZee	e
                  ed<   dZeej                 ed<   dZeeej        e	e         e	e	e                  f                  ed<   dZeej                 ed<   dZed	         ed
<    fdZ xZS )ParquetConfiga	  
    BuilderConfig for Parquet.

    Args:
        batch_size (`int`, *optional*):
            Size of the RecordBatches to iterate on.
            The default is the row group size (defined by the first row group).
        columns (`list[str]`, *optional*)
            List of columns to load, the other ones are ignored.
            All columns are loaded by default.
        features: (`Features`, *optional*):
            Cast the data to `features`.
        filters (`Union[pyarrow.dataset.Expression, list[tuple], list[list[tuple]]]`, *optional*):
            Return only the rows matching the filter.
            If possible the predicate will be pushed down to exploit the partition information
            or internal metadata found in the data source, e.g. Parquet statistics.
            Otherwise filters the loaded RecordBatches before yielding them.
        fragment_scan_options (`pyarrow.dataset.ParquetFragmentScanOptions`, *optional*)
            Scan-specific options for Parquet fragments.
            This is especially useful to configure buffering and caching.

            <Added version="4.2.0"/>
        on_bad_files (`Literal["error", "warn", "skip"]`, *optional*, defaults to "error")
            Specify what to do upon encountering a bad file (a file that can't be read). Allowed values are :
            * 'error', raise an Exception when a bad file is encountered.
            * 'warn', raise a warning when a bad file is encountered and skip that file.
            * 'skip', skip bad files without raising or warning when they are encountered.

            <Added version="4.2.0"/>

    Example:

    Load a subset of columns:

    ```python
    >>> ds = load_dataset(parquet_dataset_id, columns=["col_0", "col_1"])
    ```

    Stream data and efficiently filter data, possibly skipping entire files or row groups:

    ```python
    >>> filters = [("col_0", "==", 0)]
    >>> ds = load_dataset(parquet_dataset_id, streaming=True, filters=filters)
    ```

    Increase the minimum request size when streaming from 32MiB (default) to 128MiB and enable prefetching:

    ```python
    >>> import pyarrow
    >>> import pyarrow.dataset
    >>> fragment_scan_options = pyarrow.dataset.ParquetFragmentScanOptions(
    ...     cache_options=pyarrow.CacheOptions(
    ...         prefetch_limit=1,
    ...         range_size_limit=128 << 20
    ...     ),
    ... )
    >>> ds = load_dataset(parquet_dataset_id, streaming=True, fragment_scan_options=fragment_scan_options)
    ```

    N
batch_sizecolumnsfeaturesfiltersfragment_scan_optionserror)r   warnskipon_bad_filesc                 H    t                                                       d S N)super__post_init__)self	__class__s    }/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/datasets/packaged_modules/parquet/parquet.pyr   zParquetConfig.__post_init__V   s        )__name__
__module____qualname____doc__r   r   int__annotations__r   liststrr   datasetsFeaturesr   r   ds
Expressiontupler   ParquetFragmentScanOptionsr   r   r   __classcell__)r   s   @r   r
   r
      s         ; ;z !%J$$$#'GXd3i ''',0Hhx()000NRGXeBM4;T%[8IIJKRRREI8B$ABIII5<L'12<<<                 r   r
   c                   N    e Zd ZeZd Zd Zdej        dej        fdZ	d Z
d ZdS )	Parquetc                 *   | j         j        i| j         j        ]t          | j         j                  t          | j         j                  k    r)t	          d| j         j         d| j         j                   t          j        | j         j                  S )NzIThe columns and features argument must contain the same columns, but got z and )r   )configr   r   set
ValueErrorr$   DatasetInfo)r   s    r   _infozParquet._info]   s    K+$0DK'((C0D,E,EEE[;&CCT[-ACC   #T[-ABBBBr   c                 N     j         j        st          d j         j                   d|j        _        |                     j         j                  }g }|                                D ]\  }} j        j        L|D ]H}	 t          |d          5 }t          j                            t          j        |                     j        _        	 ddd            n# 1 swxY w Y   j# t          j        $ r} j         j        dk    r6t$                              d| dt)          |          j         d|              j         j        d	k    r7t$                              d
| dt)          |          j         d| d           n6t$                              d
| dt)          |          j         d| d           Y d}~Bd}~ww xY w j        j        t          d j         j                   |                    t          j        |d|i                      j         j        zt7           j         j                  t7           j        j                  k    rFt          j         fd j        j                                        D                        j        _        |S )z-We handle string, list and dicts in datafilesz=At least one data file must be specified, but got data_files=TNrbr   zFailed to read schema from '' with error : r   zSkipping bad schema from ''. `zPAt least one valid data file must be specified, all the data_files are invalid: files)name
gen_kwargsc                 8    i | ]\  }}|j         j        v ||S  )r.   r   ).0colfeatr   s      r   
<dictcomp>z-Parquet._split_generators.<locals>.<dictcomp>   s0    eeeysD#QUQ\QdJdJddJdJdJdr   )r.   
data_filesr0   download_configextract_on_the_flydownloaditemsinfor   openr$   r%   from_arrow_schemapqread_schemapaArrowInvalidr   loggerr   typer   warningdebugappendSplitGeneratorr   r/   )	r   
dl_managerrB   splits
split_namer9   filefes	   `        r   _split_generatorszParquet._split_generatorsi   s   {% 	wu]a]h]suuvvv8<
"5(()?@@
!+!1!1!3!3 	a 	aJy!)! i iDi!$-- "191B1T1TUWUcdeUfUf1g1gDI.!" " " " " " " " " " " " " " " " " ? i i i;3w>>"LL)r)r)r[_`a[b[b[k)r)rop)r)rsss!![5??"NN+i+i+iQUVWQXQXQa+i+ief+i+i+ijjjj"LL)gd)g)gtTUwwO_)g)gcd)g)g)ghhhi y!) gkgrg}   MM(1zwX]N^___````;*s4;3F/G/G3tyOaKbKb/b/b!)!2eeeeDI,>,D,D,F,Feee" "DI s=   C*=CC*C""C*%C"&C**G9CGGpa_tablereturnc                 \    | j         j        t          || j         j        j                  }|S r   )rG   r   r   arrow_schema)r   r[   s     r   _cast_tablezParquet._cast_table   s+    9) "(DI,>,KLLHr   c              #      K   |E d {V  d S r   r=   )r   r9   s     r   _generate_shardszParquet._generate_shards   s$      r   c              #     K   | j         j        y| j         j        mt          d | j        j        j        D                       t          | j         j                  k    r*t          d| j         j         d| j        j         d          t          | j         j        t                    rt          j        | j         j                  n| j         j        }t          j        | j         j                  }t          |          D ]\  }}	 t!          |d          5 }|                    |          }|j        r| j         j        p|j        d         j        }t          |                    || j         j        |dd                    D ]L\  }	}
t,          j                            |
g          }t3          ||	          |                     |          fV  Md d d            n# 1 swxY w Y   # t,          j        t          f$ r}| j         j        d	k    r6t:                              d
| dt?          |          j          d|             | j         j        dk    r7t:          !                    d| dt?          |          j          d| d           n6t:          "                    d| dt?          |          j          d| d           Y d }~d }~ww xY wd S )Nc              3   $   K   | ]}|j         V  d S r   )r:   )r>   fields     r   	<genexpr>z+Parquet._generate_tables.<locals>.<genexpr>   s$      NNUejNNNNNNr   z)Tried to load parquet data with columns 'z' with mismatching features '')default_fragment_scan_optionsr4   r   )r   r   filterbatch_readaheadfragment_readaheadr   zFailed to read file 'r5   r6   r   zSkipping bad file 'r7   r8   )#r.   r   r   sortedrG   r^   r0   
isinstancer   r"   rJ   filters_to_expressionr&   ParquetFileFormatr   	enumeraterH   make_fragment
row_groupsr   num_rows
to_batchesrL   Tablefrom_batchesr   r_   rM   r   rN   r   rO   r   rP   rQ   )r   r9   filter_exprparquet_file_formatfile_idxrW   rX   parquet_fragmentr   	batch_idxrecord_batchr[   rY   s                r   _generate_tableszParquet._generate_tables   se     ;+0C0ONNdi.@.MNNNNNRXY]YdYlRmRmmm  H@S  H  Hrvr{  sE  H  H  H  
 $+-t44%B$T[%8999$ 	
 !2QUQ\Qrsss'.. 	Z 	ZNHdZ$%% W':'H'H'K'K$'2 W%)[%;%f?O?Z[\?]?f
7@,77+5(,(;'20134 8  8 8 W W3I| (*x'<'<l^'L'LH #&h	":":D<L<LX<V<V"VVVVV#W W W W W W W W W W W W W W W$ OZ0 Z Z Z;+w66LL!c!c!cDQRGGL\!c!c`a!c!cddd[-77NN#Z#Z#Z$q''BR#Z#ZVW#Z#Z#Z[[[[LL!Xt!X!XQ@P!X!XTU!X!X!XYYYZ)	Z 	Zs>   G'B9GG'G	G'"G	#G''K=CKKN)r   r   r   r
   BUILDER_CONFIG_CLASSr2   rZ   rL   rt   r_   ra   r|   r=   r   r   r,   r,   Z   s        (
C 
C 
C     DBH       'Z 'Z 'Z 'Z 'Zr   r,   )dataclassesr   typingr   r   r   pyarrowrL   pyarrow.datasetdatasetr&   pyarrow.parquetparquetrJ   r$   datasets.builderr   datasets.tabler   utilslogging
get_loggerr   rN   BuilderConfigr
   ArrowBasedBuilderr,   r=   r   r   <module>r      s9   ! ! ! ! ! ! + + + + + + + + + +                              % % % % % % 
		*	*8	4	4 F  F  F  F  F H* F  F  F RbZ bZ bZ bZ bZh( bZ bZ bZ bZ bZr   