
    Pi}                        d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ  d
dl!m"Z"m#Z#m$Z$m%Z%m&Z& d
dl'm(Z( ee)e*e*f         e)e*         e)d         f         Z+ e*ej,                  Z- ej.        e/          Z0 G d de*          Z1 G d de2          Z3dZ4ej,        ddgej5        g dej6        g diZ7dZ8ej9         ej:        d          k     r	dd gZ;g d!Z<n*ej9         ej:        d"          k     r	d#d gZ;g d$Z<nd%d&gZ;g d'Z<ej,        ej5        ej6        gZ=d( e=D             Z>d) e=D             Z?ej,        d*giZ@d+d,giZAe4gZBeAe?e>e@gZCd-ZDg d.ZEd/e*d0eFfd1ZGd2eeHeIe*f         d0eHe*eeIe*         d3f         f         fd4ZJd5e*d/e*d0eFfd6ZKd5e*d/e*d0eFfd7ZLd8e
e*geIe*         f         d0eHe*eIe*         f         fd9ZM	 	 dKd/e*d:e*d;eeIe*                  d<ee         d0eIe*         f
d=ZNdLd:e*d<ee         d0eHe*eIe*         f         fd>ZO	 dLd?e*d<ee         d0e+fd@ZP	 	 dKdAeIe*         d<ee         dBeeQ         d0eIe+         fdCZR G dD d3eIe*                   ZS G dE dFeHe*eSf                   ZT G dG dHeIe*                   ZU G dI dJeHe*eUf                   ZVdS )M    N)partial)	has_magic)PathPurePath)CallableOptionalUnion)	url_to_fs)HfFileSystem)version)
thread_map   )config)DownloadConfig)	_split_re)Split)logging)tqdm)!_prepare_path_and_storage_optionsis_local_pathis_relative_path	xbasenamexjoin)string_to_dict c                       e Zd ZdS )UrlN__name__
__module____qualname__r       g/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/datasets/data_files.pyr   r   !           Dr"   r   c                       e Zd ZdS )EmptyDatasetErrorNr   r   r"   r#   r&   r&   %   r$   r"   r&   zFdata/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*traintraining)
validationvaliddevval)testtestingeval
evaluationz-._ 0-9z2023.9.0z**[{sep}/]{keyword}[{sep}]*z{keyword}[{sep}]*)z{keyword}/**z{keyword}[{sep}]*/**z**[{sep}/]{keyword}/**z**[{sep}/]{keyword}[{sep}]*/**z	2023.12.0z**/*[{sep}/]{keyword}[{sep}]*)z{keyword}/**/*z{keyword}[{sep}]*/**/*z**/*[{sep}/]{keyword}/**/*z"**/*[{sep}/]{keyword}[{sep}]*/**/*z**/{keyword}[{sep}]*z**/*[{sep}]{keyword}[{sep}]*)z**/{keyword}/**z**/{keyword}[{sep}]*/**z**/*[{sep}]{keyword}/**z**/*[{sep}]{keyword}[{sep}]*/**c                 >    i | ]}|d  t           |         D             S )c                 Z    g | ](}t           D ]}|                    |t                     )S )keywordsep)"KEYWORDS_IN_FILENAME_BASE_PATTERNSformatNON_WORDS_CHARS.0r4   patterns      r#   
<listcomp>z<dictcomp>.<listcomp>L   O       9   	wO<<   r"   SPLIT_KEYWORDSr:   splits     r#   
<dictcomp>rB   K   K     & & & 	 
  %e,  & & &r"   c                 >    i | ]}|d  t           |         D             S )c                 Z    g | ](}t           D ]}|                    |t                     )S r3   )"KEYWORDS_IN_DIR_NAME_BASE_PATTERNSr7   r8   r9   s      r#   r<   z<dictcomp>.<listcomp>T   r=   r"   r>   r@   s     r#   rB   rB   S   rC   r"   z**logsz	**/*.evalz*[])z	README.mdzconfig.jsonzdataset_info.jsonzdataset_infos.jsonzdummy_data.zipzdataset_dict.jsonr;   returnc                 D     t           fdt          D                       S )Nc              3       K   | ]}|v V  	d S Nr   )r:   wildcard_characterr;   s     r#   	<genexpr>z%contains_wildcards.<locals>.<genexpr>v   s*      [[1C!W,[[[[[[r"   )anyWILDCARD_CHARACTERS)r;   s   `r#   contains_wildcardsrP   u   s'    [[[[GZ[[[[[[r"   patternsDataFilesListc                    t          | t                    rd |                                 D             S t          | t                    r
t          | giS t          | t
                    rt          d | D                       r| D ]o}t          |t                    rFt          |          dk    r3d|v r/t          |                    d          t          t
          f          st          d|           pd | D             }t          t          |                    t          |          k    rt          d|           d	 | D             S t          | iS t          t          |                     S )
a/  
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Returns:
        patterns: dictionary of split_name -> list of patterns
    c                 d    i | ]-\  }}t          |          t          |t                    r|n|g.S r   str
isinstancelist)r:   keyvalues      r#   rB   z%sanitize_patterns.<locals>.<dictcomp>   s:    kkkJCQVC:eT#:#:G%%kkkr"   c              3   @   K   | ]}t          |t                    V  d S rK   )rW   dictr:   r;   s     r#   rM   z$sanitize_patterns.<locals>.<genexpr>   s,      AAWz'4((AAAAAAr"      rA   pathz]Expected each split to have a 'path' key which can be a string or a list of strings, but got c                     g | ]
}|d          S rA   r   r]   s     r#   r<   z%sanitize_patterns.<locals>.<listcomp>   s    ???7gg&???r"   z*Some splits are duplicated in data_files: c                     i | ]B}t          |d                    t          |d         t                    r|d         n|d         gCS )rA   r_   rU   r]   s     r#   rB   z%sanitize_patterns.<locals>.<dictcomp>   s]        GG$%%*WV_VZ:[:['rwvbijpbqar  r"   )rW   r\   itemsrV   SANITIZED_DEFAULT_SPLITrX   rN   lenget
ValueErrorsetsanitize_patterns)rQ   r;   splitss      r#   ri   ri   y   s    (D!! 1kkZbZhZhZjZjkkkk	Hc	"	" 1'(44	Hd	#	# 1AAAAAAA 	7# 	 	w--G))7**"7;;v#6#6dDD + % Bx  B  B   + @?h???F3v;;3v;;.. !Vf!V!VWWW '   
 ,X66 h000r"   matched_rel_pathc                     d t          |           j        j        D             }d t          |          j        j        D             }t          |          t          |          k    S )u  
    When a path matches a pattern, we additionally check if it's inside a special directory
    we ignore by default (if it starts with a double underscore).

    Users can still explicitly request a filepath inside such a directory if "__pycache__" is
    mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── __pycache__
            └── b.txt

    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
    False
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
    False
    c                 <    g | ]}|                     d           |S __
startswithr:   parts     r#   r<   z6_is_inside_unrequested_special_dir.<locals>.<listcomp>   s-    "u"u"uD_c_n_nos_t_t"u4"u"u"ur"   c                 <    g | ]}|                     d           |S rn   rp   rr   s     r#   r<   z6_is_inside_unrequested_special_dir.<locals>.<listcomp>   s-    %o%o%otY]YhYhimYnYn%od%o%o%or"   )r   parentpartsre   )rk   r;   data_dirs_to_ignore_in_pathdata_dirs_to_ignore_in_patterns       r#   "_is_inside_unrequested_special_dirry      sg    8 #v"uH=M4N4N4U4["u"u"u%o%ox7H7H7O7U%o%o%o"*++s3Q/R/RRRr"   c                     d t          |           j        D             }d t          |          j        D             }t          |          t          |          k    S )u9  
    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
    if the hidden part is mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── a.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
    False
    c                 d    g | ]-}|                     d           rt          |          d hk    +|.S .rq   rh   rr   s     r#   r<   zS_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir.<locals>.<listcomp>   sJ     " " "T__S=Q=Q"Z]^bZcZchkglZlZlZlZlZlr"   c                 d    g | ]-}|                     d           rt          |          d hk    +|.S r|   r~   rr   s     r#   r<   zS_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir.<locals>.<listcomp>   sJ     % % %DOOC4H4H%QTUYQZQZ_b^cQcQcQcQcQcr"   )r   rv   re   )rk   r;   hidden_directories_in_pathhidden_directories_in_patterns       r#   ?_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dirr      sw    j" "!"2339" " "% %!'**0% % %! )**c2O.P.PPPr"   pattern_resolverc                   	
 t           D ]

                    dd          }	  | |          }n# t          $ r Y 2w xY wt          |          dk    rt	                      |D ]K}t          t          |          t          
                    }|J                     |d                    Lt          d D                       rt          dt           d d	          fd
t          D             t          d t          D             z
            z   }
fd|D             c S  t          D ]|	g }	                                D ]Q\  }}|D ]I}	  | |          }n# t          $ r Y w xY wt          |          dk    r|                    |            nJR|r	fd|D             c S }t          d| d|            )a+  
    Get the default pattern from a directory or repository by testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    z{split}*r   NrA   c              3   L   K   | ]}t          j        t          |           V   d S rK   )rematchr   r@   s     r#   rM   z+_get_data_files_patterns.<locals>.<genexpr>  s1      FFerx	5111FFFFFFr"   zSplit name should match 'z'' but got 'z'.c                 6    g | ]}|v t          |          S r   rV   )r:   rA   rj   s     r#   r<   z,_get_data_files_patterns.<locals>.<listcomp>  s%    WWWEuPVSZZr"   c                 ,    h | ]}t          |          S r   r   r@   s     r#   	<setcomp>z+_get_data_files_patterns.<locals>.<setcomp>  s    AAA#e**AAAr"   c                 @    i | ]}|                     |           gS )ra   )r7   )r:   rA   split_patterns     r#   rB   z,_get_data_files_patterns.<locals>.<dictcomp>  s/    ZZZ5EM00u0==>ZZZr"   c                 "    i | ]}||         S r   r   )r:   rA   patterns_dicts     r#   rB   z,_get_data_files_patterns.<locals>.<dictcomp>&  s     NNNEE=/NNNr"   zCouldn't resolve pattern z with resolver )ALL_SPLIT_PATTERNSreplaceFileNotFoundErrorre   rh   r   r   addrN   rg   r   DEFAULT_SPLITSsortedALL_DEFAULT_PATTERNSrc   append)r   r;   
data_filespp_partssorted_splitsnon_empty_splitsrA   rQ   r   r   rj   s            @@@r#   _get_data_files_patternsr      s    , [ [''	377	))'22JJ  	 	 	H	z??Q"uuF - -(1y7O7OPP***

77+,,,,FFvFFFFF ` !^Y!^!^TZ!^!^!^___WWWW^WWWZ`AA.AAAA[ [ M [ZZZMZZZZZZ  . O O,2244 	 	OE8#  !1!1'!:!:JJ(   Hz??Q&&$++E222E '  	ONNNN=MNNNNNN	O
bbbP`bb
c
ccs   1
>>E  
E-	,E-		base_pathallowed_extensionsdownload_configc                    t          |           rt          ||           } nDt          |           r3t          j                            |           d         t          j        z   }nd}t          | |          \  } }t          | fi |\  }t          t                    t          |           hz
  d| v r|                     d          d         n-t          |j        t                    r|j        n|j        d         }|dk    r|dz   ndi }|dk    rd|d<   fd	 |j        | fd
di|                                D             }{fd|D             }	t%          |	          t%          |          k     rLt'          t          |          t          |	          z
            }
t(                              d|  d|
            n|}	|	s,d|  d}|dt'                     z  }t-          |          |	S )a  
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicitly mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
        download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    r    r   ://filehfFexpand_infoc                 H   g | ]\  }}|d          dk    sQ|                     d          rxt          j                            t          j                            |                    r<t          |          vst          |          t          |          d|v r|n|z   S )typer   islinkr   )rf   osr_   isfilerealpathr   ry   r   )r:   filepathinfofiles_to_ignore
fs_patternprotocol_prefixs      r#   r<   z#resolve_pattern.<locals>.<listcomp>m  s       HdLF""txx'9'9"bgnnRWM]M]^fMgMg>h>h"x  7728ZHH 8OPXZdee 8 X%%?X+E 877r"   detailTNc                     g | ]G}t          fd t          |                              d          dd         D                       E|HS )c              3   &   K   | ]}d |z   v V  dS )r}   Nr   )r:   suffixr   s     r#   rM   z-resolve_pattern.<locals>.<listcomp>.<genexpr>y  s-      gg&3<#55ggggggr"   r}   r   N)rN   r   rA   )r:   r   r   s     r#   r<   z#resolve_pattern.<locals>.<listcomp>v  so     
 
 
ggggIhDWDWD]D]^aDbDbcdceceDfggggg

 
 
r"   z Some files matched the pattern 'z-' but don't have valid data file extensions: zUnable to find ''z with any supported extension )r   r   r   r   r_   
splitdriver5   r   r
   rh   FILES_TO_IGNOREr   rA   rW   protocolrV   globrc   re   rX   loggerr   r   )r;   r   r   r   storage_optionsfsr   glob_kwargsmatched_pathsoutinvalid_matched_files	error_msgr   r   r   s     `         @@@r#   resolve_patternr   *  s   `    	7++	w		 G&&w//2RV;			@ZijjjG_w::/::NB
/**i.@.@-AAO G 	eQ'S99Mbkkr{1~ 
 +3f*<*<h&&"OK4%*M"     %bggJJdJkJJPPRR  M %
 
 
 
)
 
 

 s88c-(((($(]););c#hh)F$G$G!KK A7  A  Ai~  A  A    +1w111	)T$?Q:R:RTTTI	***Jr"   c                     t          t          | |          }	 t          |          S # t          $ r t	          d|  d          dw xY w)uA
  
    Get the default pattern from a directory testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {'train': ['**']}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    )r   r   zThe directory at z doesn't contain any data filesN)r   r   r   r   r&   )r   r   resolvers      r#   get_data_patternsr     si    h )_]]]Hj'111 j j j ^I ^ ^ ^__eiijs	   ( A	data_filec                 d   t          | |          \  } }t          | fi |^}}t          |t                    r#|                    |           }|j        |j        fS |                     t          j	                  rt          t          j	        |j
                  }d| t          t          j	                  dz   d                              ddd          z   } |                    |           }|j        |j        fS |                    |           }dD ]}||v rt          ||                   fc S dS )	Nr   )endpointtokenhf://r   z	/resolve/@)ETagetagmtimer   )r   r
   rW   r   resolve_pathrepo_idrevisionrq   r   HF_ENDPOINTr   re   r   r   rV   )	r   r   r   r   _resolved_pathhffsr   rY   s	            r#   _get_single_origin_metadatar     sD    "C9^m!n!n!nIy44O44FB"l## =	22$m&<<<			f0	1	1 =V%7?TUUUiF,>(?(?!(C(E(EFNN{\_abccc	)))44$m&<<<779D( % %$;;S	NN$$$$ 2r"   r   max_workersc           
      :   ||nt           j        }t          d | D                       r1fdt          | dt	          |           dk    pd           D             S t          t          t                    | |t          dt	          |           dk    pd           S )Nc              3      K   | ]}d |v V  	dS )r   Nr   )r:   r   s     r#   rM   z'_get_origin_metadata.<locals>.<genexpr>  s'      
<
<I7i
<
<
<
<
<
<r"   c                 2    g | ]}t          |           S )r   )r   )r:   r   r   s     r#   r<   z(_get_origin_metadata.<locals>.<listcomp>  s6     
 
 
 (	?SSS
 
 
r"   zResolving data files   )descdisabler   )r   
tqdm_classr   r   )r   &HF_DATASETS_MULTITHREADING_MAX_WORKERSallhf_tqdmre   r   r   r   )r   r   r   s    ` r#   _get_origin_metadatar     s    
 "-!8++f>kK

<
<
<
<
<<< 

 
 
 
$+J2-5	  
 
 
 	
 +_MMM#J2%-   r"   c                       e Zd ZdZdee         dee         ddf fdZddZe		 	 	 dd	ee         d
e
j        j        dee         deee                  dee         dd fd            Ze		 	 	 dd	ee         dee         deee                  dee         dd f
d            Ze		 	 	 dd	ee         dee         deee                  dee         dd f
d            Zddddeee                  deee                  dd fdZ xZS )rR   a  
    List of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    r   origin_metadatarH   Nc                 X    t                                          |           || _        d S rK   )super__init__r   )selfr   r   	__class__s      r#   r   zDataFilesList.__init__*  s)    $$$.r"   otherc                 D    t          g | || j        |j        z             S rK   )rR   r   r   r   s     r#   __add__zDataFilesList.__add__.  s%    _t_e_d.BUEZ.Z[[[r"   rQ   dataset_infor   r   r   c                     d|j          d|j         d|pd                     d          }|                     ||||          S )Nzhf://datasets/r   /r   r   r   r   )idsharstripfrom_patterns)clsrQ   r   r   r   r   s         r#   from_hf_repozDataFilesList.from_hf_repo1  sc     \\_[[|7G[[)/WY[[bbcfgg	  	>Pbq ! 
 
 	
r"   c                     ||n1t                                                                                      }|                     ||||          S Nr   )r   resolveas_posixr   )r   rQ   r   r   r   s        r#   from_local_or_remotez"DataFilesList.from_local_or_remote?  sT     "+!6IIDFFNN<L<L<U<U<W<W	  	>Pbq ! 
 
 	
r"   c           	      D   ||n1t                                                                                      }g }|D ]I}	 |                    t	          ||||                     *# t
          $ r t          |          s Y Fw xY wt          ||          } | ||          S Nr   r   )r   r   r   extendr   r   r   r   )r   rQ   r   r   r   r   r;   r   s           r#   r   zDataFilesList.from_patternsL  s     "+!6IIDFFNN<L<L<U<U<W<W	
 	 	G!!#"++=(7	      %    ))   /z?[[[s:///s   &A%%BB
extensions
file_namesr  r  c                   g |rJd                     d |D                       }                    t          j        d| d                     |rJd                     d |D                       }                    t          j        d| d                     r"t	          fd| D             | j        	          S t	          t          |           | j        	          S )
N|c              3   >   K   | ]}t          j        |          V  d S rK   r   escape)r:   exts     r#   rM   z'DataFilesList.filter.<locals>.<genexpr>k  s*      "H"Hc29S>>"H"H"H"H"H"Hr"   z.*(z	)(\..+)?$c              3   >   K   | ]}t          j        |          V  d S rK   r	  )r:   fns     r#   rM   z'DataFilesList.filter.<locals>.<genexpr>n  s*      !E!EB")B--!E!E!E!E!E!Er"   z.*[\/]?(z)$c                 L    g | ]t          fd D                        S )c              3   B   K   | ]}|                               V  d S rK   )r   )r:   r;   r   s     r#   rM   z2DataFilesList.filter.<locals>.<listcomp>.<genexpr>r  s0      7i7iU\i8P8P7i7i7i7i7i7ir"   )rN   )r:   r   rQ   s    @r#   r<   z(DataFilesList.filter.<locals>.<listcomp>r  s=    jjjyC7i7i7i7i`h7i7i7i4i4ijjjjr"   )r   )joinr   r   compilerR   r   rX   )r   r  r  ext_pattern
fn_patternrQ   s        @r#   filterzDataFilesList.filterf  s     	G(("H"HZ"H"H"HHHKOOBJ'D['D'D'DEEFFF 	D!E!E*!E!E!EEEJOOBJ'A:'A'A'ABBCCC 	S jjjjDjjj $ 4   
 !dT=QRRRRr"   )r   rR   rH   rR   NNN)r   r    r!   __doc__rX   rV   SingleOriginMetadatar   r   classmethodhuggingface_hubhf_apiDatasetInfor   r   r   r   r   r  __classcell__r   s   @r#   rR   rR     sO        "/49 /tDX?Y /^b / / / / / /\ \ \ \ 
 $(2648
 
s)
 &,8
 C=	

 %T#Y/
 ".1
 

 
 
 [
  $(2648

 

s)

 C=

 %T#Y/	


 ".1

 


 

 

 [

  $(26480 0s)0 C=0 %T#Y/	0
 ".10 
0 0 0 [04 48[_S S S%d3i0SEMdSViEXS	S S S S S S S Sr"   c                   &   e Zd ZdZe	 	 	 ddeeeee         e	f         f         de
e         de
ee                  de
e         dd f
d            Ze	 	 	 ddeeeee         e	f         f         d	ej        j        de
e         de
ee                  de
e         dd fd
            Ze	 	 	 ddeeeee         e	f         f         de
e         de
ee                  de
e         dd f
d            Zdddde
ee                  de
ee                  dd fdZdS )DataFilesDicta  
    Dict of split_name -> list of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns :
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover, each list is a DataFilesList. It is possible to hash the dictionary
    and get a different hash if and only if at least one file changed.
    For more info, see [`DataFilesList`].

    This is useful for caching Dataset objects that are obtained from a list of data files.

    Changing the order of the keys of this dictionary also doesn't change its hash.
    NrQ   r   r   r   rH   c                      |             }|                                 D ]=\  }}t          |t                    r|nt                              ||||          ||<   >|S r   )rc   rW   rR   r   r   rQ   r   r   r   r   rY   patterns_for_keys           r#   r   z"DataFilesDict.from_local_or_remote  s     cee%-^^%5%5 
	 
	!C! .>>  "77$''9$3	 8   HH 
r"   r   c                      |             }|                                 D ]>\  }}t          |t                    r|nt                              |||||          ||<   ?|S )N)r   r   r   r   )rc   rW   rR   r   )	r   rQ   r   r   r   r   r   rY   r"  s	            r#   r   zDataFilesDict.from_hf_repo  s     cee%-^^%5%5 	 	!C! .>>  "//$!-''9$3 0   HH 
r"   c                      |             }|                                 D ]=\  }}t          |t                    r|nt                              ||||          ||<   >|S r   )rc   rW   rR   r   r!  s           r#   r   zDataFilesDict.from_patterns  s     cee%-^^%5%5 
	 
	!C! .>>  "00$''9$3	 1   HH 
r"   r  r  r  c                     t          |                       }|                                 D ]\  }}|                    ||          ||<    |S )Nr  )r   rc   r  )r   r  r  r   rY   data_files_lists         r#   r  zDataFilesDict.filter  sW     d4jjll$(JJLL 	\ 	\ C&--PZ-[[CHH
r"   r  )r   r    r!   r  r  r\   rV   r	   rX   rR   r   r   r   r  r  r  r   r   r  r   r"   r#   r  r  y  s          $(2648 sE$s)]":;;< C= %T#Y/	
 ".1 
   [* 
 $(2648 sE$s)]":;;< &,8 C=	
 %T#Y/ ".1 
   [.  $(2648 sE$s)]":;;< C= %T#Y/	
 ".1 
   [, 48[_  %d3i0EMdSViEX	     r"   r  c                        e Zd ZdZdee         deeee                           f fdZd Ze		 ddee         deee                  dd fd            Z
	 dd	ed
ee         ddfdZdee         dd fdZ xZS )DataFilesPatternsListz
    List of data files patterns (absolute local paths or URLs).
    For each pattern there should also be a list of allowed extensions
    to keep, or a None ot keep all the files for the pattern.
    rQ   r   c                 X    t                                          |           || _        d S rK   )r   r   r   )r   rQ   r   r   s      r#   r   zDataFilesPatternsList.__init__  s+    
 	""""4r"   c                 D    t          g | || j        |j        z             S rK   )rR   r   r   s     r#   r   zDataFilesPatternsList.__add__  s%    _t_e_d.EH`.`aaar"   NrH   c                 <     | ||gt          |          z            S rK   )re   )r   rQ   r   s      r#   r   z#DataFilesPatternsList.from_patterns  s%     s801CMMABBBr"   r   r   rR   c           	      x   ||n1t                                                                                      }g }t          | | j                  D ]L\  }}	 |                    t          ||||                     -# t          $ r t          |          s Y Iw xY wt          ||          }t          ||          S r  )r   r   r   zipr   r  r   r   r   r   rR   )r   r   r   r   r;   r   r   s          r#   r   zDataFilesPatternsList.resolve  s    
 "+!6IIDFFNN<L<L<U<U<W<W	
+.tT5L+M+M 	 	'G'!!#"++=(7	      %    ))   /z?[[[Z999s   &A;;BBr  c                 F    t          | fd| j        D                       S )Nc                     g | ]}|z   S r   r   )r:   r   r  s     r#   r<   z;DataFilesPatternsList.filter_extensions.<locals>.<listcomp>  s     eee7I%
2eeer"   )r(  r   )r   r  s    `r#   filter_extensionsz'DataFilesPatternsList.filter_extensions  s2    $eeeeTMdeee
 
 	
r"   rK   )r   r    r!   r  rX   rV   r   r   r   r  r   r   r   r0  r  r  s   @r#   r(  r(    s8        5s)5 !$s)!455 5 5 5 5 5b b b LPC CCyC6>tCy6IC	 C C C [C 59: :: ".1: 
	: : : :.
DI 
:Q 
 
 
 
 
 
 
 
r"   r(  c                       e Zd ZdZe	 ddeeee         f         deee                  dd fd            Z		 ddedee
         dd	fd
Zdee         dd fdZdS )DataFilesPatternsDictz[
    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
    NrQ   r   rH   c                      |             }|                                 D ];\  }}t          |t                    r|nt                              ||          ||<   <|S )N)r   )rc   rW   r(  r   )r   rQ   r   r   rY   r"  s         r#   r   z#DataFilesPatternsDict.from_patterns  sz     cee%-^^%5%5 	 	!C! .0EFF  *88$'9 9   HH 
r"   r   r   r  c                     t                      }|                                 D ]\  }}|                    ||          ||<   |S rK   )r  rc   r   )r   r   r   r   rY   data_files_patterns_lists         r#   r   zDataFilesPatternsDict.resolve   sL    
 oo-1ZZ\\ 	T 	T)C)/77	?SSCHH
r"   r  c                      t          |                       }|                                 D ]\  }}|                    |          ||<   |S rK   )r   rc   r0  )r   r  r   rY   r5  s        r#   r0  z'DataFilesPatternsDict.filter_extensions*  sP    d4jjll-1ZZ\\ 	N 	N)C)/AA*MMCHH
r"   rK   )r   r    r!   r  r  r\   rV   rX   r   r   r   r   r0  r   r"   r#   r2  r2    s          W[ CcN+AI$s)AT	    [$ 59  ".1 
	   DI :Q      r"   r2  )NNrK   )Wr   r   	functoolsr   r   r   pathlibr   r   typingr   r   r	   r  fsspec.corer
   r   	packagingr   tqdm.contrib.concurrentr   r   r   downloadr   namingr   rj   r   utilsr   r   r   utils.file_utilsr   r   r   r   r   utils.py_utilsr   tuplerV   r  TRAINrd   
get_loggerr   r   r   r   r&   SPLIT_PATTERN_SHARDED
VALIDATIONTESTr?   r8   FSSPEC_VERSIONparser6   rF   r   "DEFAULT_PATTERNS_SPLIT_IN_FILENAME"DEFAULT_PATTERNS_SPLIT_IN_DIR_NAMEDEFAULT_PATTERNS_ALLDEFAULT_PATTERNS_LOGSr   r   rO   r   boolrP   r\   rX   ri   ry   r   r   r   r   r   intr   rR   r  r(  r2  r   r"   r#   <module>rP     s   				 				             " " " " " " " " , , , , , , , , , ,     ! ! ! ! ! ! ( ( ( ( ( (       . . . . . .       $ $ $ $ $ $                   " " " " " " r r r r r r r r r r r r r r * * * * * * U38_eCj%)CD  #ek**  
	H	%	%	 	 	 	 	# 	 	 		 	 	 	 	) 	 	 	 a  
K':&	;;;	J999
 	=7=4444*GI\)]&* * *&& ]W];7777*IK^)_&* * *&& +AB`)a&* * *& +u/<& &  & & & "& &  & & & " 
K$   +/ +, &&	     \ \ \ \ \ \#1dD#o 6 #14U4PS9VeKeEf@f;g #1 #1 #1 #1LS Ss St S S S SB;QVY ;Qdg ;Qlp ;Q ;Q ;Q ;Q|)dxtCy8H/I )ddSVX\]`XaSaNb )d )d )d )d^ /304	] ]]] !c+] n-	]
 
#Y] ] ] ]@Xj Xj Xjx7O Xj[_`ceijmen`n[o Xj Xj Xj Xjz 15 n-    0 15!% S	n- # 

	   :^S ^S ^S ^S ^SDI ^S ^S ^SBZ Z Z Z ZDm+, Z Z Zz2
 2
 2
 2
 2
DI 2
 2
 2
j# # # # #D&;!;< # # # # #r"   