
    Pi1                     D   d Z ddlZddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&  ee'          Z( G d dej)                  Z* G d d          Z+dS )zDownload manager interface.    N)datetime)partial)OptionalUnion)	url_to_fs)
thread_map   )config)tqdm)ArchiveIterableFilesIterablecached_pathis_relative_path,stack_multiprocessing_download_progress_barsurl_or_path_join)get_size_checksum_dict)
get_loggerr   )NestedDataStructure
map_nested)tracked_str   )DownloadConfigc                       e Zd ZdZdZdZdZdS )DownloadModea)  `Enum` for how to treat pre-existing downloads and data.

    The default mode is `REUSE_DATASET_IF_EXISTS`, which will reuse both
    raw downloads and the prepared dataset if they exist.

    The generations modes:

    |                                     | Downloads | Dataset |
    |-------------------------------------|-----------|---------|
    | `REUSE_DATASET_IF_EXISTS` (default) | Reuse     | Reuse   |
    | `REUSE_CACHE_IF_EXISTS`             | Reuse     | Fresh   |
    | `FORCE_REDOWNLOAD`                  | Fresh     | Fresh   |

    reuse_dataset_if_existsreuse_cache_if_existsforce_redownloadN)__name__
__module____qualname____doc__REUSE_DATASET_IF_EXISTSREUSE_CACHE_IF_EXISTSFORCE_REDOWNLOAD     v/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/datasets/download/download_manager.pyr   r   2   s.          83)r&   r   c            
       T   e Zd ZdZ	 	 	 	 	 ddee         dee         dee         dee         fdZed	             Z	ed
             Z
dedefdZd Zdee         dedee         fdZdededefdZdeeej        f         fdZdeeee         f         fdZd Zd Zd Zd Zd ZdS )DownloadManagerFNTdataset_namedata_dirdownload_config	base_pathc                     || _         || _        |pt          j                            d          | _        i | _        || _        |pt                      | _	        i | _
        i | _        dS )a4  Download manager constructor.

        Args:
            data_dir:
                can be used to specify a manual directory to get the files from.
            dataset_name (`str`):
                name of dataset this instance will be used for. If
                provided, downloads will contain which datasets they were used for.
            download_config (`DownloadConfig`):
                to specify the cache directory and other
                download options
            base_path (`str`):
                base path that is used when relative paths are used to
                download files. This can be a remote url.
            record_checksums (`bool`, defaults to `True`):
                Whether to record the checksums of the downloaded files. If None, the value is inferred from the builder.
        .N)_dataset_name	_data_dirospathabspath
_base_path_recorded_sizes_checksumsrecord_checksumsr   r,   downloaded_pathsextracted_paths)selfr*   r+   r,   r-   r7   s         r'   __init__zDownloadManager.__init__J   sf    2 *!#;rws';';Z\& 0.B.2B2B "!r&   c                     | j         S N)r1   r:   s    r'   
manual_dirzDownloadManager.manual_dirm   s
    ~r&   c                 b    t          d | j                                        D                       S )z+Returns the total size of downloaded files.c              3   &   K   | ]}|d          V  dS )	num_bytesNr%   ).0checksums_dicts     r'   	<genexpr>z2DownloadManager.downloaded_size.<locals>.<genexpr>t   s'      mm>>+.mmmmmmr&   )sumr6   valuesr>   s    r'   downloaded_sizezDownloadManager.downloaded_sizeq   s0     mmTEcEjEjElElmmmmmmr&   url_or_urlsdownloaded_path_or_pathsc           	         d}t          t          t          |                                |                                                    |d          D ]0\  }}t	          || j                  | j        t          |          <   1dS )z)Record size/checksum of downloaded files.   zComputing checksums)delaydesc)record_checksumN)hf_tqdmlistzipflattenr   r7   r6   str)r:   rI   rJ   rM   urlr3   s         r'   _record_sizes_checksumsz'DownloadManager._record_sizes_checksumsv   s     [((**,D,L,L,N,NOOPP&
 
 
 	 	IC 8Nd&;8 8 8D*3s8844	 	r&   c           
         | j                                         }d|_        |j        d|_        t	          | j        |          }t          j                    }t                      5  t          ||d|j
        ddd          }ddd           n# 1 swxY w Y   t          j                    |z
  }t                              d	|                                d
z   d           t          |          }t          |          }| j                            t#          t%          |                                |                                                               t          j                    }|                     ||           t          j                    |z
  }t                              d|                                d
z   d           |j        S )ay  Download given URL(s).

        By default, only one process is used for download. Pass customized `download_config.num_proc` to change this behavior.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download. Each URL is a `str`.

        Returns:
            `str` or `list` or `dict`:
                The downloaded paths matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        FNzDownloading datar,   TzDownloading data files)	map_tuplenum_procrN   batched
batch_sizezDownloading took <   z minzChecksum Computation took )r,   copyextract_compressed_filedownload_descr   _download_batchedr   nowr   r   r[   loggerinfototal_secondsr   r8   updatedictrR   rS   rV   data)r:   rI   r,   download_func
start_timerJ   durations          r'   downloadzDownloadManager.download   s   & .335527/(0,>O) 6XXX\^^
9;; 		 		'1(1-( ( ($		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 <>>J.L(>(>(@(@B(FLLLMMM)+66#67O#P#P $$T#k.A.A.C.CE]EeEeEgEg*h*h%i%ijjj\^^
$$[2JKKK<>>J.U1G1G1I1IR1OUUUVVV',,s   &BBBurl_or_filenamesreturnc           	          t          |          dk    rK                                d_        t           j                  }t          |d                   }t          |          rt           j        |          }t          |fi j
        \  }}d}	 |                    |                              dd          }n# t          $ r Y nw xY w|dk     rt          j        nd}t!          ||j        pdd	t$          j                            d
          dk    r6t)          j                    j        rt)          j                    j        d         nd |t.                    S  fd|D             S )N   TrX   r   sizei  @r   Downloadingfiles8HF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS1rY   )rN   unitpositionmax_workers
tqdm_classc                 >    g | ]}                     |           S )rX   )_download_single)rC   url_or_filenamer,   r:   s     r'   
<listcomp>z5DownloadManager._download_batched.<locals>.<listcomp>   s<       # %%o%WW  r&   )lenr_   disable_tqdmr   r|   rT   r   r   r5   r   storage_optionsre   get	Exceptionr
   &HF_DATASETS_MULTITHREADING_MAX_WORKERSr   ra   r2   environmultiprocessingcurrent_process	_identityr   )r:   rn   r,   rj   r3   fsrr   ry   s   ` `     r'   rb   z!DownloadManager._download_batched   s   
   B&&-2244O+/O(#D$9?[[[M '*++D%% ?'>> II)HIIHBDwwt}}((33    BFARAR==XY   $2Cm:>>"\]]addd#355? e )8::DRHH '       '7   s   )C 
CCr}   c                     t          |          }t          |          rt          | j        |          }t	          ||          }t          |          }|                    |           |S )NrX   )rT   r   r   r5   r   r   
set_origin)r:   r}   r,   outs       r'   r|   z DownloadManager._download_single   sg    o..O,, 	Q.tPPO/?KKK#'''
r&   path_or_bufc                 r    t          |d          rt          j        |          S t          j        |          S )aK  Iterate over files within an archive.

        Args:
            path_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        read)hasattrr   from_buffrom_urlpath)r:   r   s     r'   iter_archivezDownloadManager.iter_archive   s8    ( ;'' 	="+K888"/<<<r&   pathsc                 *    t          j        |          S )a  Iterate over file paths.

        Args:
            paths (`str` or `list` of `str`):
                Root paths.

        Yields:
            `str`: File path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        )r   from_urlpaths)r:   r   s     r'   
iter_fileszDownloadManager.iter_files  s    " *5111r&   c           	         | j                                         }d|_        t          | j        |          }t          |||j        d          }t          |          }t          |          }| j        	                    t          t          |                                |                                                               |j        S )a$  Extract given path(s).

        Args:
            path_or_paths (path or `list` or `dict`):
                Path of file to extract. Each path is a `str`.

        Returns:
            extracted_path(s): `str`, The extracted paths matching the given input
            path_or_paths.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        TrX   zExtracting data files)r[   rN   )r,   r_   r`   r   r|   r   r[   r   r9   rg   rh   rR   rS   ri   )r:   path_or_pathsr,   extract_funcr9   s        r'   extractzDownloadManager.extract  s    $ .335526/t4oVVV$$-(	
 
 
 ,M::-o>>##D]-B-B-D-DoF]F]F_F_)`)`$a$abbb##r&   c                 R    |                      |                     |                    S )a  Download and extract given `url_or_urls`.

        Is roughly equivalent to:

        ```
        extracted_paths = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.

        Returns:
            extracted_path(s): `str`, extracted paths of given URL(s).
        )r   rm   )r:   rI   s     r'   download_and_extractz$DownloadManager.download_and_extract6  s"      ||DMM+66777r&   c                 4    | j                                         S r=   )r6   r_   r>   s    r'   get_recorded_sizes_checksumsz,DownloadManager.get_recorded_sizes_checksumsH  s    -22444r&   c                 v   t          | j                                                  t          | j                                                  z
  }t	          | j                                                  D ]D\  }}||v r;t          j                            |          rt          j	        |           | j        |= Ed S r=   )
setr9   rG   r8   rQ   itemsr2   r3   isfileremove)r:   paths_to_deletekeyr3   s       r'   delete_extracted_filesz&DownloadManager.delete_extracted_filesK  s    d299;;<<s4CXC_C_CaCa?b?bbd288::;; 	. 	.IC&&27>>$+?+?&	$(-	. 	.r&   c                 J    | j         j        r|                                  d S d S r=   )r,   delete_extractedr   r>   s    r'   manage_extracted_filesz&DownloadManager.manage_extracted_filesR  s2    0 	*'')))))	* 	*r&   )NNNNT)r   r   r    is_streamingr   rT   r   r;   propertyr?   rH   r   rV   rm   rQ   rb   r|   r   ioBufferedReaderr   r   r   r   r   r   r   r%   r&   r'   r)   r)   G   s       L '+"&48#'!" !"sm!" 3-!" ".1	!"
 C=!" !" !" !"F   X n n Xn3F bu    0- 0- 0-d)s)) () 
c	) ) ) )V n Y\    =c23D.D(E = = = =22c49n 5 2 2 2 2&$ $ $@8 8 8$5 5 5. . .* * * * *r&   r)   ),r!   enumr   r   r2   r   	functoolsr   typingr   r   fsspecfsspec.corer   tqdm.contrib.concurrentr    r
   utilsr   rP   utils.file_utilsr   r   r   r   r   r   utils.info_utilsr   utils.loggingr   utils.py_utilsr   r   utils.trackr   r,   r   r   rd   Enumr   r)   r%   r&   r'   <module>r      s    " !  				     				             " " " " " " " "  ! ! ! ! ! ! . . . . . .       # # # # # #                6 5 5 5 5 5 , , , , , , , , < < < < < < < < % % % % % % + + + + + + 
H		* * * * *49 * * **M* M* M* M* M* M* M* M* M* M*r&   