
    &`i1                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlmZ d dlmZ d dlmZ erd dlZdefdZd	eee	f         fd
Z	 d%dee
ee
         f         d	eee	f         de
fdZdedeeef         fdZ	 	 	 	 d&de	deeeeef                  deeeeef                  dedef
dZeddfdeeee	f                  dedeeeef                  defdZ d'd	eee	f         deeeef                  fdZ!ddiZ"d'd	eee	f         deeeef                  fd Z#d!efd"Z$ G d# d$e          Z%dS )(    N)partial)TYPE_CHECKINGAnyCallableDictListOptionalUnion)iterate_with_retry)BlockAccessor)FileBasedDatasourcepathc                     t          j        d|           }|sdS |                    d          |                    d          fS )zSplit off all file extensions.

    Returns base, allext.

    Args:
        path: path with extensions

    Returns:
        str: path with all extensions removed
    z^((?:.*/|)[^.]+)[.]([^/]*)$)NN      )rematchgroup)r   r   s     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/data/_internal/datasource/webdataset_datasource.py_base_plus_extr      sB     H3T::E z;;q>>5;;q>>))    samplec                     | duo]t          | t                    oHt          t          |                                                     dk    o|                     dd           S )zUCheck whether a sample is valid.

    Args:
        sample: sample to be checked
    Nr   __bad__F)
isinstancedictlenlistkeysget)r   s    r   _valid_sampler!   %   sc     	d 	-vt$$	-V[[]]##$$q(	- 

9e,,,	r   fdefaultc                     | |S t          | t                    s| g} | D ]/}| t          |          st          ||          } ||          }0|S )a  Apply a list of functions to a sample.

    Args:
        f: function or list of functions
        sample: sample to be modified
        default: default function to be applied to all keys.
            Defaults to None.

    Returns:
        modified sample
    Nformat)r   r   callabler   )r"   r   r#   gs       r   _apply_listr)   3   sm     	ya C  x{{***A6Mr   suffixsuffixesc                     |dS t          |          r ||           S |D ]8}d|v sd|v rt          j        d| z   |          r dS &| |k    s	d| z   |k    r dS 9dS )aC  Check whether a suffix is valid.

    Suffixes can be either None (=accept everything), a callable,
    or a list of patterns. If the pattern contains */? it is treated
    as a glob pattern, otherwise it is treated as a literal.

    Args:
        suffix: suffix to be checked
        suffixes: list of valid suffixes
    NT*?.F)r'   fnmatch)r*   r+   patterns      r   _check_suffixr2   L   s     t  x  '>>SG^^sV|W55 ttw#,'"9"944 #:5r   Ffileobj
fileselect
filerenameverbose_openmetac              #     K   |pi }t          j        | d          }|rt          d|            |D ]}|j        }|                                r| |                    |                                          }t          ||          }t          |t                    sJ t          ||          st          ||          }	|	V  |rt          d|            dS dS )a  Iterate over tar file, yielding filename, content pairs for the given tar stream.

    Args:
        fileobj: file object
        fileselect: patterns or function selecting
            files to be selected
        meta: metadata to be added to each sample
    zr|*)r3   modezstart N)fnamedatazdone )tarfileopenprintnameisregextractfilereadr)   r   strr2   r   )
r3   r4   r5   r6   r7   streamtarinfor:   r;   results
             r   _tar_file_iteratorrG   d   s      :2D\'666F otoo 
 
}} 	%-!!'**//11J..%%%%%%UJ// 	E--- ndnn r   r;   r   c           
   #   .  K   |pi }d}| D ]}t          |t                    sJ |d         |d         }} ||          \  }}	|:|||d         k    rGt          |          r|                    |           |V  t          |          }d|v r|d         |d<   |	|v r5t	          | d|	 d|                                 d	|d          z             |t          |	|          r|||	<   t          |          r|                    |           |V  dS dS )
a7  Return function over iterator that groups key, value pairs into samples.

    Args:
        data: iterator over key, value pairs
        keys: function that returns key, suffix for a given key
        suffixes: list of suffixes to be included in the sample
        meta: metadata to be added to each sample
    Nr:   r;   __key__)rI   __url__z": duplicate file name in tar file  z	, tar is )r   r   r!   update
ValueErrorr   r2   )
r;   r   r+   r7   current_sample
filesampler:   valueprefixr*   s
             r   _group_by_keysrR      s      :2DN + +
*d+++++!'*Jv,>ue>!V~i/H%H%H^,, %%%d+++$$$$!&111NJ&&,6y,Ay)^##<<<OOn1133OOd9oOOP   }VX>>%*N6"^$$ d### r   Tr&   c                    t          |           } |                                 D ]\  }}|                    d          d         }|                    d          r7|dv r|                    d          | |<   T|dv r&t          |                    d                    | |<   ~|dv rdd	l}dd	l}|d
k    r0|j        	                    t          j        |                    | |<   |                    |j        	                    t          j        |                              | |<   |dk    rt          j        |          | |<   #|dk    r0dd	l}|                    t          j        |                    | |<   Y|dk    r dd	l}|                    |d          | |<   |dv r0dd	l}|                    t          j        |                    | |<   |dv rdd	l}	|	                    |          | |<   | S )aT  A default decoder for webdataset.

    This handles common file extensions: .txt, .cls, .cls2,
        .jpg, .png, .json, .npy, .mp, .pt, .pth, .pickle, .pkl.
    These are the most common extensions used in webdataset.
    For other extensions, users can provide their own decoder.

    Args:
        sample: sample, modified in place
    r/   __)txttextutf-8clscls2)jpgpngppmpgmpbmpnmr   NPILjsonnpympF)rawptpthpicklepkl)r   itemssplit
startswithdecodeintnumpy	PIL.ImageImager=   ioBytesIOasarrayrc   loadsloadmsgpackunpackbtorchrk   )
r   r&   keyrP   	extensionnprb   rz   r|   rk   s
             r   _default_decoderr      s    &\\Fllnn !. !.
UIIcNN2&	>>$ 	./)),,w//F3KK/))ell73344F3KKDDD!innRZ->->??s jj
58I8I)J)JKKs&  *U++F3KK%''"*U"3"344F3KK$NNN!//%U/;;F3KK-''LLL**RZ%6%677F3KK+++MMM ,,u--F3KMr   r\   jpegc                 >   t          |           } |                                 D ]w\  }}|                    d          d         }|                    d          r7|dv r|                    d          | |<   T|dv r&t          |                              d          | |<   ~|dv rdd	l}dd	l}t          ||j	                  r|j
                            |          }t          ||j
        j
                  sJ t          j                    }|                    |t                              |                                |          
           |                                | |<   C|dk    r,t'          j        |                              d          | |<   u|dk    rFdd	l}t          j                    }|                    ||           |                                | |<   |dk    rdd	l}|                    |          | |<   |dv rFdd	l}	t          j                    }|	                    ||           |                                | |<   /|dv rDdd	l}
t          j                    }|
                    ||           |                                | |<   y| S )aQ  A default encoder for webdataset.

    This handles common file extensions: .txt, .cls, .cls2, .jpg,
        .png, .json, .npy, .mp, .pt, .pth, .pickle, .pkl
    These are the most common extensions used in webdataset.
    For other extensions, users can provide their own encoder.

    Args:
        sample (Dict[str, Any]): sample
    r/   rT   rU   )rV   rX   rY   )r\   r   r]   r^   r_   r`   ra   r   Nr%   rc   rd   re   rg   rj   )r   rm   rn   ro   encoderC   rr   rs   r   ndarrayrt   	fromarrayru   rv   saveextension_to_formatr    lowergetvaluerc   dumpsrz   r|   rk   dump)r   r&   r}   rP   r~   r   rb   rD   rz   r|   rk   s              r   _default_encoderr      s    &\\Fllnn +, +,
UIIcNN2&	>>$ )	,'!!,,w//F3KK/))e**++G44F3KKLLL%,, 3	++E22eSY_55555Z\\FJJ266y7H7H)TT     !//++F3KK&  *U++227;;F3KK%Z\\FGGFE""" //++F3KK$NNN!--..F3KK-''LLLZ\\FJJuf%%% //++F3KK+++MMMZ\\FKKv&&& //++F3KMr   blockc                 .    |                      d          S )zMake a block iterable.

    This is a placeholder for dealing with more complex blocks.

    Args:
        block: Ray Dataset block

    Returns:
        Iterable[Dict[str,Any]]: Iterable of samples
    F)public_row_format)	iter_rows)r   s    r   _make_iterabler     s     ??U?333r   c                        e Zd ZdZdgZ	 	 	 	 	 	 ddeeee         f         deee	ee
ef                  deee	e
ef                  d	eee	e
ef                  d
eee	e
ef                  de	de	f fdZdddefdZ xZS )WebDatasetDatasourcezJA Datasource for WebDataset datasets (tar format with naming conventions).tarTNFpathsdecoderr4   r5   r+   r6   expand_jsonc                      t                      j        |fi | || _        || _        || _        || _        || _        || _        d S N)super__init__r   r4   r5   r+   r6   r   )
selfr   r   r4   r5   r+   r6   r   file_based_datasource_kwargs	__class__s
            r   r   zWebDatasetDatasource.__init__1  sW     	??">???$$ (&r   rD   zpyarrow.NativeFiler   c              #   |   K   ddl } fd}t          |d j        j                  }t	          |t          |           j                  }|D ]h} j        t           j        |t                    } j
        rt          |d	         t                    r.t          j        |d	                             d
                    }nt          |d	         t                     rt          j        |d	                   }nJt          |d	         t
                    r	|d	         }n&t#          dt%          |d	                    d          |                                D ])\  }	}
|	|vrg ||	<   ||	                             |
           *|                    d |                                D                       V  jdS )a  Read and decode samples from a stream.

        Note that fileselect selects files during reading, while suffixes
        selects files during the grouping step.

        Args:
            stream: File descriptor to read from.
            path: Path to the data.
            decoder: decoder or list of decoders to be applied to samples
            fileselect: Predicate for skipping files in tar decoder.
                Defaults to lambda_:False.
            suffixes: List of suffixes to be extracted. Defaults to None.
            verbose_open: Print message when opening files. Defaults to False.

        Yields:
            List[Dict[str, Any]]: List of sample (list of length 1).
        r   Nc                  H    t           j         j         j                  S )N)r4   r5   r6   )rG   r4   r5   r6   )r   rD   s   r   get_tar_file_iteratorz@WebDatasetDatasource._read_stream.<locals>.get_tar_file_iteratorZ  s-    %??!.	   r   ziterate tar file)r   )rJ   )r7   r+   )r#   rc   rX   zUnsupported data type z for samplec                 p    i | ]3\  }}|t          |t                    rt          |          d k    r|n|g4S )r   )r   r   r   ).0kvs      r   
<dictcomp>z5WebDatasetDatasource._read_stream.<locals>.<dictcomp>}  sQ       1 Jq$//HCFFaKKqqaS  r   )pandasr   _data_contextretried_io_errorsrR   r   r+   r   r)   r   r   r   bytesrc   rx   rp   rC   	TypeErrortyperm   append	DataFrame)r   rD   r   pdr   filessamplesr   parsed_jsonr   r   s   ``         r   _read_streamz!WebDatasetDatasource._read_streamE  s     & 		 	 	 	 	 	 #!$6
 
 
 !T$-?-?-?$-XXX 	 	F|'$T\6CSTTT (fVne44 	"&*VF^-B-B7-K-K"L"LKKv44 "&*VF^"<"<KKv55 "(.KK#VT&.5I5IVVV   (--// ( (DAq$&q	1I$$Q'''',,  &       %	 	r   )TNNNFF)__name__
__module____qualname____doc___FILE_EXTENSIONSr
   rC   r   r	   boolr'   r   r   r   __classcell__)r   s   @r   r   r   ,  s       TTw
 ?C<@<@:>"!' 'S$s)^$' %c8T 9:;' U44#789	'
 U44#789' 5x!567' ' ' ' ' ' ' '(<#7 <s < < < < < < < <r   r   r   )NNFN)T)&r0   ru   rc   r   r<   	functoolsr   typingr   r   r   r   r   r	   r
   ray.data._internal.utilr   ray.data.blockr   )ray.data.datasource.file_based_datasourcer   pyarrowrC   r   r!   r)   r   r'   r2   r   r   rG   rR   r   r   r   r   r    r   r   <module>r      s8    				  				        L L L L L L L L L L L L L L L L L L 6 6 6 6 6 6 ( ( ( ( ( ( I I I I I I NNN* * * * *"$sCx.     UY XtH~%&04S#XIQ   2# tX~)>    4 9=8< tXt345 tXt345 	
    H $04	& &
tCH~
&
& uT8^,-& 	& & & &R. .T#s(^ .XeD#I>N5O . . . .b fo 8 8T#s(^ 8XeCI>N5O 8 8 8 8v4- 4 4 4 4U U U U U. U U U U Ur   