
    PiL                        d Z ddlZddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlZddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZ  ee           Z!e G d d                      Z"e G d d                      Z# G d de$          Z% G d de$          Z&e G d d                      Z'e G d d                      Z( G d de)e*e(f                   Z+dS )aw  DatasetInfo record information we know about a dataset.

This includes things that we know about the dataset statically, i.e.:
 - description
 - canonical location
 - does it have validation and tests splits
 - size
 - etc.

This also includes the things that can and should be computed once we've
processed the dataset as well:
 - number of examples (in each split)
 - etc.
    N)	dataclass)Path)ClassVarOptionalUnion)	url_to_fs)DatasetCardDatasetCardData   )config)Features)	SplitDict)Version)
get_logger)asdictunique_valuesc                   ,    e Zd ZU dZeed<   dZeed<   dS )SupervisedKeysData inputoutputN)__name__
__module____qualname__r   str__annotations__r        a/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/datasets/info.pyr   r   7   s/         E3OOOFCr   r   c                   ,    e Zd ZU dZeed<   dZeed<   dS )DownloadChecksumsEntryDatar   keyvalueN)r   r   r   r"   r   r   r#   r   r   r   r!   r!   =   s*         CMMME3OOOOOr   r!   c                       e Zd ZdZdS )MissingCachedSizesConfigErrorz;The expected cached sizes of the download file are missing.Nr   r   r   __doc__r   r   r   r%   r%   C   s        EEEEr   r%   c                       e Zd ZdZdS )NonMatchingCachedSizesErrorz/The prepared split doesn't have expected sizes.Nr&   r   r   r   r)   r)   G   s        9999r   r)   c                   j    e Zd ZU dZee         ed<   dZee         ed<   d Z	e
dedd fd            ZdS )PostProcessedInfoNfeaturesresources_checksumsc                     | j         :t          | j         t                    s"t          j        | j                   | _         d S d S d S N)r,   
isinstancer   	from_dictselfs    r   __post_init__zPostProcessedInfo.__post_init__P   sA    =$Zx-P-P$$.t}==DMMM %$$$r   post_processed_info_dictreturnc                     d t          j        |           D              | di fd|                                D             S )Nc                     h | ]	}|j         
S r   name.0fs     r   	<setcomp>z.PostProcessedInfo.from_dict.<locals>.<setcomp>W       ???!qv???r   c                 $    i | ]\  }}|v 	||S r   r   r<   kvfield_namess      r   
<dictcomp>z/PostProcessedInfo.from_dict.<locals>.<dictcomp>X   s*    \\\tq!1P[K[K[aK[K[K[r   r   dataclassesfieldsitems)clsr5   rD   s     @r   r1   zPostProcessedInfo.from_dictU   sY    ??{'9#'>'>???s]]\\\\'?'E'E'G'G\\\]]]r   )r   r   r   r,   r   r   r   r-   dictr4   classmethodr1   r   r   r   r+   r+   K   s         #'Hhx '''*.$...> > >
 ^ ^:M ^ ^ ^ [^ ^ ^r   r+   c                   6   e Zd ZU dZ ej        e          Zeed<    ej        e          Z	eed<    ej        e          Z
eed<    ej        e          Zeed<   dZee         ed<   dZee         ed	<   dZee         ed
<   dZee         ed<   dZee         ed<   dZee         ed<   dZeeeef                  ed<   dZee         ed<   dZee         ed<   dZee         ed<   dZee         ed<   dZ ee         ed<   dZ!ee         ed<   g dZ"e#e$e                  ed<   d Z%d+dee         fdZ&d,dZ'd Z(e)de$d          fd            Z*e)d-dedee         d d fd!            Z+e)d"ed d fd#            Z,d.d/d&Z-d0d'Z.d efd(Z/e)d)ed d fd*            Z0dS )1DatasetInfoa	  Information about a dataset.

    `DatasetInfo` documents datasets, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Not all fields are known on construction and may be updated later.

    Attributes:
        description (`str`):
            A description of the dataset.
        citation (`str`):
            A BibTeX citation of the dataset.
        homepage (`str`):
            A URL to the official homepage for the dataset.
        license (`str`):
            The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
        features ([`Features`], *optional*):
            The features used to specify the dataset's column types.
        post_processed (`PostProcessedInfo`, *optional*):
            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
        supervised_keys (`SupervisedKeysData`, *optional*):
            Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
        builder_name (`str`, *optional*):
            The name of the `GeneratorBasedBuilder` subclass used to create the dataset. It is also the snake_case version of the dataset builder class name.
        config_name (`str`, *optional*):
            The name of the configuration derived from [`BuilderConfig`].
        version (`str` or [`Version`], *optional*):
            The version of the dataset.
        splits (`dict`, *optional*):
            The mapping between split name and metadata.
        download_checksums (`dict`, *optional*):
            The mapping between the URL to download the dataset's checksums and corresponding metadata.
        download_size (`int`, *optional*):
            The size of the files to download to generate the dataset, in bytes.
        post_processing_size (`int`, *optional*):
            Size of the dataset in bytes after post-processing, if any.
        dataset_size (`int`, *optional*):
            The combined size in bytes of the Arrow tables for all splits.
        size_in_bytes (`int`, *optional*):
            The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
    )default_factorydescriptioncitationhomepagelicenseNr,   post_processedsupervised_keysbuilder_namedataset_nameconfig_nameversionsplitsdownload_checksumsdownload_sizepost_processing_sizedataset_sizesize_in_bytes)rX   r\   r^   r,   rZ   _INCLUDED_INFO_IN_YAMLc                 T   | j         8t          | j         t                    st          j        | j                   | _         | j        >t          | j        t
                    s$t
                              | j                  | _        | j        lt          | j        t                    sRt          | j        t                    rt          | j                  | _        nt          j        | j                  | _        | j	        8t          | j	        t                    st          j        | j	                  | _	        | j        ht          | j        t                    sPt          | j        t          t          f          rt          | j         | _        d S t          di | j        | _        d S d S d S )Nr   )r,   r0   r   r1   rT   r+   rY   r   r   rZ   r   from_split_dictrU   r   tuplelistr2   s    r   r4   zDatasetInfo.__post_init__   s^   =$Zx-P-P$$.t}==DM*:d>QSd3e3e*"3"="=d>Q"R"RD<#Jt|W,M,M#$,,, ?&t|44&0>>;":dk9+M+M"#3DK@@DK+Jt?SUg4h4h+$.>> R'94;O'P$$$'9'Q'QD<P'Q'Q$$$	 ,+++r   Fstorage_optionsc                    t          |fi |pi ^}}|                    t          j        |t          j                  d          5 }|                     ||           ddd           n# 1 swxY w Y   | j        rc|                    t          j        |t          j                  d          5 }| 	                    |           ddd           dS # 1 swxY w Y   dS dS )a  Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.

        Args:
            dataset_info_dir (`str`):
                Destination directory.
            pretty_print (`bool`, defaults to `False`):
                If `True`, the JSON will be pretty-printed with the indent level of 4.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="validation")
        >>> ds.info.write_to_directory("/path/to/directory/")
        ```
        wb)pretty_printN)
r   open	posixpathjoinr   DATASET_INFO_FILENAME
_dump_inforS   LICENSE_FILENAME_dump_license)r3   dataset_info_dirrh   re   fs_r=   s          r   write_to_directoryzDatasetInfo.write_to_directory   s`   , +GG0E2GGQWWY^$4f6RSSUYZZ 	:^_OOALO999	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	:< 	&(8&:QRRTXYY &]^""1%%%& & & & & & & & & & & & & & & & & &	& 	&s$   A))A-0A-.CCCc                     |                     t          j        t          |           |rdnd                              d                     dS )zQDump info in `file` file-like object open in bytes mode (to support remote files)   Nindentutf-8)writejsondumpsr   encode)r3   filerh   s      r   rm   zDatasetInfo._dump_info   sG    

4:fTll3N11$OOOVVW^__`````r   c                 `    |                     | j                            d                     dS )zTDump license in `file` file-like object open in bytes mode (to support remote files)rx   N)ry   rS   r|   )r3   r}   s     r   ro   zDatasetInfo._dump_license   s*    

4<&&w//00000r   dataset_infosc                    d D             t                    dk    r#t          fdD                       rd         S d                    t          d D                                                                 }d                    t          d D                                                                 }d                    t          d D                                                                 }d                    t          d D                                                                 }d }d } | ||||||	          S )
Nc                 :    g | ]}||                                 S r/   )copy)r<   	dset_infos     r   
<listcomp>z*DatasetInfo.from_merge.<locals>.<listcomp>   s'    bbbiILa))LaLaLar   r   c              3   0   K   | ]}d          |k    V  dS )r   Nr   )r<   r   r   s     r   	<genexpr>z)DatasetInfo.from_merge.<locals>.<genexpr>   s-      )g)gI-*:i*G)g)g)g)g)g)gr   z

c              3   $   K   | ]}|j         V  d S r/   )rP   r<   infos     r   r   z)DatasetInfo.from_merge.<locals>.<genexpr>   s%      /[/[T0@/[/[/[/[/[/[r   c              3   $   K   | ]}|j         V  d S r/   )rQ   r   s     r   r   z)DatasetInfo.from_merge.<locals>.<genexpr>   $      ,U,UtT],U,U,U,U,U,Ur   c              3   $   K   | ]}|j         V  d S r/   )rR   r   s     r   r   z)DatasetInfo.from_merge.<locals>.<genexpr>   r   r   c              3   $   K   | ]}|j         V  d S r/   )rS   r   s     r   r   z)DatasetInfo.from_merge.<locals>.<genexpr>   s$      +S+STDL+S+S+S+S+S+Sr   )rP   rQ   rR   rS   r,   rU   )lenallrk   r   strip)rJ   r   rP   rQ   rR   rS   r,   rU   s    `      r   
from_mergezDatasetInfo.from_merge   s\   bb=bbb}!!c)g)g)g)gYf)g)g)g&g&g! ##kk-/[/[]/[/[/["["[\\bbdd;;},U,U},U,U,UUUVV\\^^;;},U,U},U,U,UUUVV\\^^++m+S+S]+S+S+SSSTTZZ\\s#+
 
 
 	
r   rp   r6   c                 l   t          |fi |pi ^}}t                              d|            |st          d          |                    t          j        |t          j                  dd          5 }t          j
        |          }ddd           n# 1 swxY w Y   |                     |          S )a   Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.

        This function updates all the dynamically generated fields (num_examples,
        hash, time of creation,...) of the [`DatasetInfo`].

        This will overwrite all previous metadata.

        Args:
            dataset_info_dir (`str`):
                The directory containing the metadata file. This
                should be the root directory of a specific dataset version.
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import DatasetInfo
        >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
        ```
        zLoading Dataset info from zECalling DatasetInfo.from_directory() with undefined dataset_info_dir.rrx   encodingN)r   loggerdebug
ValueErrorri   rj   rk   r   rl   rz   loadr1   )rJ   rp   re   rq   rr   r=   dataset_info_dicts          r   from_directoryzDatasetInfo.from_directory   s    4 +GG0E2GGQD2BDDEEE 	fdeeeWWY^$4f6RSSUXcjWkk 	-op $	!	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	-}}.///s   5BBBr   c                     d t          j        |           D              | di fd|                                D             S )Nc                     h | ]	}|j         
S r   r9   r;   s     r   r>   z(DatasetInfo.from_dict.<locals>.<setcomp>  r?   r   c                 $    i | ]\  }}|v 	||S r   r   rA   s      r   rE   z)DatasetInfo.from_dict.<locals>.<dictcomp>  s)    UUUtq!ADTDTaDTDTDTr   r   rF   )rJ   r   rD   s     @r   r1   zDatasetInfo.from_dict  sY    ??{'9#'>'>???sVVUUUU'8'>'>'@'@UUUVVVr   Tother_dataset_infoc                 v    | j         } |j        di fd|j                                         D              d S )Nc                 H    i | ]\  }}|	|t          j        |          S r/   r   deepcopy)r<   rB   rC   ignore_nones      r   rE   z&DatasetInfo.update.<locals>.<dictcomp>!  s:       AqMM 4=##!MMr   r   )__dict__updaterI   )r3   r   r   	self_dicts     ` r   r   zDatasetInfo.update  sj    M		 	
 	
   .7==??  	
 	
 	
 	
 	
r   c                 ^     | j         di d | j                                        D             S )Nc                 >    i | ]\  }}|t          j        |          S r   r   )r<   rB   rC   s      r   rE   z$DatasetInfo.copy.<locals>.<dictcomp>)  s(     W W WADM!$4$4 W W Wr   r   )	__class__r   rI   r2   s    r   r   zDatasetInfo.copy(  s7    t~XX W WATATAVAV W W WXXXr   c                    i }t          |           }|D ]p}|| j        v ret          | |          }t          |d          r|                                ||<   Ct          |d          r|                                ||<   k|||<   q|S )N_to_yaml_list_to_yaml_string)r   r`   getattrhasattrr   r   )r3   	yaml_dictr   r"   r#   s        r   _to_yaml_dictzDatasetInfo._to_yaml_dict+  s    	"4LL$ 	+ 	+Cd111c**5/22 +%*%8%8%:%:IcNNU$566 +%*%:%:%<%<IcNN%*IcNr   	yaml_datac                 |   t          j        |          }|                    d          t          j        |d                   |d<   |                    d          t          j        |d                   |d<   d t          j        |           D              | di fd|                                D             S )Nr,   rZ   c                     h | ]	}|j         
S r   r9   r;   s     r   r>   z.DatasetInfo._from_yaml_dict.<locals>.<setcomp>@  r?   r   c                 $    i | ]\  }}|v 	||S r   r   rA   s      r   rE   z/DatasetInfo._from_yaml_dict.<locals>.<dictcomp>A  s)    MMMtq!A<L<La<L<L<Lr   r   )	r   r   getr   _from_yaml_listr   rG   rH   rI   )rJ   r   rD   s     @r   _from_yaml_dictzDatasetInfo._from_yaml_dict9  s    M),,	==$$0$,$<Yz=R$S$SIj!==""."+";Ih<O"P"PIh??{'9#'>'>???sNNMMMMy'8'8MMMNNNr   )FN)Fr/   )T)r   rN   )r6   rN   )1r   r   r   r'   rG   fieldr   rP   r   rQ   rR   rS   r,   r   r   rT   r+   rU   r   rV   rW   rX   rY   r   r   rZ   r   r[   rK   r\   intr]   r^   r_   r`   r   rd   r4   rs   rm   ro   rL   r   r   r1   r   r   r   r   r   r   r   rN   rN   [   si        * *Z ){(===K===%K%c:::Hc:::%K%c:::Hc:::$;$S999GS999#'Hhx '''26NH./66648OX01888 #'L(3-&&&"&L(3-&&&!%K#%%%-1GXeCL)*111"&FHY&&&)----#'M8C='''*.(3-..."&L(3-&&&#'M8C='''3 3 3HT#Y/   R R R&& &X`aeXf & & & &:a a a a1 1 1 
tM': 
 
 
 [
. 0 0c 0HTN 0^k 0 0 0 [0B W$ W= W W W [W
 
 
 
 
Y Y Y Yt     O O O O O [O O Or   rN   c                   `    e Zd Zd	d
dZedd            Zededd fd            ZdeddfdZdS )DatasetInfosDictFr6   Nc                 f   i }t           j                            |t          j                  }t           j                            |t          j                  }|s|                     |          }|                    |            t           j                            |          rct          |dd          5 }d |
                                D             }t          j        |||rdnd            d d d            n# 1 swxY w Y   t           j                            |          rt          j        |          }	|	j        }
nd }	t!                      }
|r_|                    |
           |	"t          dt%          |
          z   dz             n|	}	|	                    t)          |                     d S d S )	Nwrx   r   c                 4    i | ]\  }}|t          |          S r   )r   r<   rX   r   s      r   rE   z7DatasetInfosDict.write_to_directory.<locals>.<dictcomp>O  s3     & & &7M{IK	!2!2& & &r   ru   rv   z---
z
---
)ospathrk   r   DATASETDICT_INFOS_FILENAMEREPOCARD_FILENAMEr   r   existsri   rI   rz   dumpr	   r   datar
   to_dataset_card_datar   saver   )r3   dataset_infos_dir	overwriterh   total_dataset_infosdataset_infos_pathdataset_readme_pathr=   dataset_infos_dictdataset_carddataset_card_datas              r   rs   z#DatasetInfosDict.write_to_directoryE  s    W\\*;V=^__ gll+<f>VWW 	I"&"5"56G"H"H""4(((7>>,-- 	U(#@@@ UA& &QdQjQjQlQl& & &" 	,a\8StTTTT	U U U U U U U U U U U U U U U 7>>-.. 	2&+,?@@L , 1L / 1 1 	9445FGGGMYMaGc*;&<&<<yHIIIgs  d#67788888	9 	9s   4:C::C>C>c                    t                               d|            t          j                            t          j                            |t          j                            rLt          j	        t          |          t          j        z            j        }d|v r|                     |          S t          j                            t          j                            |t          j                            rt          t          j                            |t          j                  d          5 } | d t          j	        |                                          D                       cd d d            S # 1 swxY w Y   d S  |             S )NzLoading Dataset Infos from dataset_inforx   r   c                 J    i | ] \  }}|t                               |          !S r   )rN   r1   )r<   rX   r   s      r   rE   z3DatasetInfosDict.from_directory.<locals>.<dictcomp>m  s>       :K): $[%:%:;L%M%M  r   )r   r   r   r   r   rk   r   r   r	   r   r   r   from_dataset_card_datar   ri   rz   rI   )rJ   r   r   r=   s       r   r   zDatasetInfosDict.from_directorya  s   F3DFFGGG7>>"',,'8&:RSSTT 	E + 06G1H1H6Kc1c d d i!222112CDDD7>>"',,'8&:[\\]] 
	bgll#4f6WXXcjkkk ops >Bill>P>P>R>R                     355Ls   19E77E;>E;r   c                    t          |                    d          t          t          f          rt          |d         t                    r | d |d         D                       S t                              |d                   }|d                             dd          |_         | |j        |i          S  |             S )Nr   c                 l    i | ]1}|                     d d          t                              |          2S )rX   default)r   rN   r   )r<   dataset_info_yaml_dicts     r   rE   z;DatasetInfosDict.from_dataset_card_data.<locals>.<dictcomp>z  sQ        3 /22=)LLkNiNi2O O  r   rX   r   )r0   r   rd   rK   rN   r   rX   )rJ   r   r   s      r   r   z'DatasetInfosDict.from_dataset_card_datau  s    '++N;;dD\JJ 	+N;TBB Es  7H6W	      +::;L^;\]]+<^+L+P+PQ^`i+j+j(sL4lCDDD355Lr   c                 :   | rd|v r@t          |d         t                    r%|d                             dd          |d         i}n4d|v r.t          |d         t                    rd |d         D             }ni }i |d |                                 D             }|                                D ]
\  }}||d<   t          |          dk    rft          t          |                                                    |d<   |d         	                    dd           }|dk    rd|i|d         |d<   d S d S g |d<   t          |                                          D ]>\  }}|	                    dd            d|i|}|d                             |           =d S d S )Nr   rX   r   c                      i | ]}|d          |S )rX   r   )r<   config_metadatas     r   rE   z9DatasetInfosDict.to_dataset_card_data.<locals>.<dictcomp>  s/     * * *' $M2O* * *r   c                 >    i | ]\  }}||                                 S r   )r   r   s      r   rE   z9DatasetInfosDict.to_dataset_card_data.<locals>.<dictcomp>  s+    eee>Tk9;	 7 7 9 9eeer   r   )r0   rK   r   rd   rI   r   nextitervaluespopsortedappend)r3   r   dataset_metadata_infosr   rX   dset_info_yaml_dictr   s          r   r   z%DatasetInfosDict.to_dataset_card_data  s=    %	U!222zBSTbBcei7j7j2%n599-SSUfguUv*&&  #444DUVdDegk9l9l4* *+<^+L* * *&&
 *,&#(#eeX\XbXbXdXdeee#
 5H4M4M4O4O A A005@#M22&''1,,48>Q>X>X>Z>Z9[9[4\4\!.1/?CCMSWXX)++ &{9+N;9%n555 ,+ 57!.1;ABUB[B[B]B];^;^ U U7K!7*..}dCCC.;[-cLb-c*%n5<<=STTTTK%	U %	UBU Ur   )FF)r6   N)r6   r   )	r   r   r   rs   rL   r   r
   r   r   r   r   r   r   r   D  s        9 9 9 9 98    [&  K]    [$&Uo &U$ &U &U &U &U &U &Ur   r   ),r'   r   rG   rz   r   rj   r   pathlibr   typingr   r   r   fsspecfsspec.corer   huggingface_hubr	   r
   r   r   r,   r   rZ   r   utilsr   utils.loggingr   utils.py_utilsr   r   r   r   r   r!   	Exceptionr%   r)   r+   rN   rK   r   r   r   r   r   <module>r      s           				     ! ! ! ! ! !       , , , , , , , , , ,  ! ! ! ! ! ! 8 8 8 8 8 8 8 8                         % % % % % % 1 1 1 1 1 1 1 1 
H		        
        
F F F F FI F F F: : : : :) : : : ^ ^ ^ ^ ^ ^ ^ ^ eO eO eO eO eO eO eO eOPjU jU jU jU jUtC,- jU jU jU jU jUr   