
    Pi)                        d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZd dl	Z
d dlZd dlZd dlmZ d dlmZ d dlmZmZ ej        j                            e          Z G d dej                  Zd	 Zg d
Zee_        g dZee_        g dZee_        de fdZ!de fdZ"de fdZ#de fdZ$de fdZ%de fdZ&de fdZ'i de!de!de!de(de(de(de(de(dej)        dej)        de"d e"d!e#d"e#d#e$d$e%d%e&d&e'iZ*e*e_*        dS )'    N)islice)AnyCallable)Key)cast_to_python_objects)-SINGLE_FILE_COMPRESSION_EXTENSION_TO_PROTOCOL	xbasenamec                       e Zd ZU dZee         ed<   ee         ed<   ee         ed<   eeee	ge	f         f         ed<   dZ
ed             Zdej        fd	Zd
 Zd Zd ZdS )
WebDatasetd   IMAGE_EXTENSIONSAUDIO_EXTENSIONSVIDEO_EXTENSIONSDECODERS   c              #     K   i }t          j        d          }t          j                    }|D ]\  }}t	          |          \  }}	||rB|d         |k    r6|                    d          |d<   |                    d          |d<   |V  i }||d<   ||d<   |                                ||	<   |	                    d          d                                         t          v r|
                    |||	                    |                    d|           }
t          j        |
          5 }|                                ||	<   d d d            n# 1 swxY w Y   |                    |           t          |
                              d          d                                         }n-|	                    d          d                                         }|| j        v r | j        |         ||	                   ||	<   |r|V  d S d S )Nmemory__key____url__.z	memory://)fsspec
filesystemdatasetsStreamingDownloadManagerbase_plus_extpopreadsplitlowerr   write_bytesextractopendeleter	   r   )clstar_pathtar_iteratorcurrent_examplefsstreaming_download_managerfilenamefexample_key
field_nameextracted_file_pathdata_extensions               /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/datasets/packaged_modules/webdataset/webdataset.py_get_pipeline_from_tarz!WebDataset._get_pipeline_from_tar   sg     (.(9((C(C%-%F%H%H"' 	h 	hKHa&3H&=&=#K" %?9#=#L#L-<-@-@-K-K	*-<-@-@-K-K	*%%%%"$)4OI&)1OI&*+&&((OJ'$$R(..004aaax)DEEE&@&H&HI_U]I_I_&`&`#[!455 ;23&&((OJ/; ; ; ; ; ; ; ; ; ; ; ; ; ; ;		(###!*+>!?!?!E!Ec!J!J2!N!T!T!V!V!+!1!1#!6!6r!:!@!@!B!B--.Jcl>.J?[eKf.g.g
+ 	"!!!!!!	" 	"s   *EE	E	returnc                 (    t          j                    S N)r   DatasetInfo)selfs    r1   _infozWebDataset._info<   s    #%%%    c           	      
   | j         j        st          d| j         j                                       | j         j                  }g }|                                D ]?\  }}fd|D             }|                    t          j        |||d                     @| j        j	        sV| 
                    |d         |d                   }t          t          || j                            t          fdD                       rt          d          d D             }t          j        |d	
          j        }	t          j                            |	          }
d         D ]}|                    dd          d                                         }|| j        v rt          j                    |
|<   || j        v rt          j                    |
|<   || j        v rt          j                    |
|<   |
| j        _	        |S )z-We handle string, list and dicts in datafilesz=At least one data file must be specified, but got data_files=c                 :    g | ]}                     |          S  )iter_archive).0r&   
dl_managers     r1   
<listcomp>z0WebDataset._split_generators.<locals>.<listcomp>G   s'    YYY8Z44X>>YYYr9   )	tar_pathstar_iterators)name
gen_kwargsr   c              3   x   K   | ]4}|                                 d                                           k    V  5dS )r   N)keys)r>   examplefirst_exampless     r1   	<genexpr>z/WebDataset._split_generators.<locals>.<genexpr>Q   sA      \\'7<<>>^A%6%;%;%=%==\\\\\\r9   zThe TAR archives of the dataset should be in WebDataset format, but the files in the archive don't share the same prefix or the same types.c                 l    g | ]1}t           j                            t          |gd                     2S )T)only_1d_for_numpy)paTablefrom_pylistr   )r>   rG   s     r1   r@   z0WebDataset._split_generators.<locals>.<listcomp>V   sI        $$%;WIY]%^%^%^__  r9   default)promote_optionsr      r   )config
data_files
ValueErrordownloaditemsappendr   SplitGeneratorinfofeaturesr2   listr   #NUM_EXAMPLES_FOR_FEATURES_INFERENCEanyrL   concat_tablesschemaFeaturesfrom_arrow_schemarsplitr    r   Imager   Audior   Video)r7   r?   rS   splits
split_namerA   rB   pipeline	pa_tablesinferred_arrow_schemarZ   r.   	extensionrH   s    `           @r1   _split_generatorszWebDataset._split_generators?   sE    {% 	wu]a]h]suuvvv(()?@@
%/%5%5%7%7 	 	!J	YYYYyYYYMMM'#iZg0h0h     
 y! 	*229Q<qAQRRH!&43["\"\]]N\\\\^\\\\\  b   -  I %'$4YPY$Z$Z$Z$a!(::;PQQH,Q/ 
< 
<
&--c155b9??AA	 555+3>+;+;HZ( 555+3>+;+;HZ( 555+3>+;+;HZ(!)DIr9   c              #      K   |E d {V  d S r5   r<   )r7   rA   rB   s      r1   _generate_shardszWebDataset._generate_shardsl   s$      r9   c              #   6  K   d | j         j                                        D             }d | j         j                                        D             }t          | j         j                                                  }t          t          ||                    D ]}\  }\  }}t          |                     ||                    D ]Q\  }	}
|D ]}||
vrd |
|<   ||z   D ]$}|
|         |
d         dz   |z   |
|         d|
|<   %t          ||	          |
fV  R~d S )Nc                 L    g | ]!\  }}t          |t          j                  |"S r<   )
isinstancer   rc   r>   r.   features      r1   r@   z1WebDataset._generate_examples.<locals>.<listcomp>p   A     
 
 
.:wPZ[bdldrPsPs

 
 
r9   c                 L    g | ]!\  }}t          |t          j                  |"S r<   )rq   r   rd   rr   s      r1   r@   z1WebDataset._generate_examples.<locals>.<listcomp>s   rt   r9   r   r   )pathbytes)	rY   rZ   rV   r[   rF   	enumeratezipr2   r   )r7   rA   rB   image_field_namesaudio_field_namesall_field_namestar_idxr&   r'   example_idxrG   r.   s               r1   _generate_exampleszWebDataset._generate_exampleso   s~     
 
26)2D2J2J2L2L
 
 

 
26)2D2J2J2L2L
 
 
 ty16688991:3y-;X;X1Y1Y 	9 	9-G-h(1$2M2MhXd2e2e(f(f 
9 
9$W"1 3 3J!00.2
+"36G"G  Jz*6$+I$6$<z$I%,Z%8/ /
+ ';//88888
9	9 	9r9   N)__name__
__module____qualname__DEFAULT_WRITER_BATCH_SIZEr[   str__annotations__dictr   r   r\   classmethodr2   r   r6   r8   rl   rn   r   r<   r9   r1   r   r      s          #3i3i3i3#,,----*+'" " [">&x+ & & & &+ + +Z  9 9 9 9 9r9   r   c                     t          j        d|           }|sdS |                    d          |                    d          fS )z>Split off all file extensions.

    Returns base, allext.
    z^((?:.*/|)[^.]+)[.]([^/]*)$)NNrQ      )rematchgroup)rv   r   s     r1   r   r      sB    
 H3T::E z;;q>>5;;q>>))r9   )?blpbmpdibbufrcurpcxdcxddspsepsfitfitsfliflcftcftugbrgifgribh5hdfpngapngjp2j2kjpcjpfjpxj2cicnsicoimiimtiftiffjfifjpejpgjpegmpgmpegmsppcdpxrpbmpgmppmpnmpsdbwrgbrgbasgirastgaicbvdavstwebpwmfemfxbmxpm)aiffauavrcafflachtksvxmat4mat5mpc2koggpafpvfrawrf64sd2sdsircamvocw64wavnistwavexwveximp3opus)z.mkvz.mp4z.aviz.mpegz.movdatac                 ,    |                      d          S )Nzutf-8)decoder   s    r1   
text_loadsr     s    ;;wr9   c                 8    ddl m} |                    |           S )NrQ   )_tenbin) r   decode_buffer)r   r   s     r1   tenbin_loadsr     s(      &&&r9   c                 4    dd l }|                    |           S Nr   )msgpackunpackb)r   r   s     r1   msgpack_loadsr   #  s    NNN??4   r9   c                 t    dd l }t          j        |           }|j        j                            |d          S )Nr   Fallow_pickle)numpy.lib.formatioBytesIOlibformat
read_array)r   numpystreams      r1   	npy_loadsr  )  s;    ZF9&&vE&BBBr9   c                 R    t          j        t          j        |           d          S )NFr   )nploadr   r   r   s    r1   	npz_loadsr  0  s!    72:d##%8888r9   c                 4    dd l }|                    |           S r   )cborloads)r   r  s     r1   
cbor_loadsr	  4  s    KKK::dr9   c                 \    dd l }|                    t          j        |           d          S )Nr   T)weights_only)torchr  r   r   )r   r  s     r1   torch_loadsr  :  s+    LLL::bj&&T::::r9   txttext
transcriptr%   cls2indexinxidjsonjsntentbmpmsgnpynpzr  pth)+r   r  r   	itertoolsr   typingr   r   r   r   r  pyarrowrL   r   datasets.builderr   datasets.features.featuresr   datasets.utils.file_utilsr   r	   utilslogging
get_loggerr   loggerGeneratorBasedBuilderr   r   r   r   r   rw   r   r   r   r  r  r	  r  intr  r   r<   r9   r1   <module>r*     s   				  				                                             = = = = = = ^ ^ ^ ^ ^ ^ ^ ^ 
		*	*8	4	4n9 n9 n9 n9 n9/ n9 n9 n9d* * *.@ @ @ B /
     : /
     /
  U        'u ' ' ' '! ! ! ! !CE C C C C9E 9 9 9 9U    ;e ; ; ; ;	:
J * 
3	
 C S 
3 	# DJ 
4: 
< 	, 	- 
= 
9  
9!" J#$ 
;% ( 
   r9   