
    &`iH                        d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ erd dlZ ej         e!          Z"e G d
 d                      Z#e G d de#                      Z$e G d de$                      Z%e G d de%                      Z&de'dee(e	e(         f         de(fdZ)de	e(         ddde
e         de
e	e(                  de	ee(e*f                  f
dZ+de	e(         ddde
e         de
e	e(                  deee(e*f                  f
dZ,	 d0de	e(         ddde
e         de-deee(e*f                  f
dZ.	 d0de	e(         ddde-deee(e*f                  fd Z/	 d0de	e(         d!e(dd"de-deee(e*f                  f
d#Z0	 d0de	e(         ddde-deee(e*f                  fd$Z1 ed%          Z2 ed&          Z3d'e	e2         d(ee	e2         ge	e3         f         d)e*dee3         fd*Z4	 d0d+e(ddd,e-de	ee(e*f                  fd-Z5	 	 d1d+e(ddd.e
e	e(                  d,e-de	ee(e*f                  f
d/Z6dS )2    N)TYPE_CHECKINGCallableIteratorListOptionalTupleTypeVarUnion)ProgressBar)cached_remote_fn)RetryingPyFileSystem)BlockMetadata)PartitioningPathPartitionFilter)_has_file_extension)DeveloperAPIc                   J    e Zd ZdZdee         defdZdee         defdZdS )FileMetadataProviderzAbstract callable that provides metadata for the files of a single dataset block.

    Current subclasses:
        - :class:`BaseFileMetadataProvider`
    pathsreturnc                     t           )az  Resolves and returns block metadata for files in the given paths.

        All file paths provided should belong to a single dataset block.

        Args:
            paths: The file paths for a single dataset block.
            **kwargs: Additional kwargs used to determine block metadata.

        Returns:
            BlockMetadata aggregated across the given paths.
        NotImplementedErrorselfr   kwargss      z/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/data/datasource/file_meta_provider.py_get_block_metadataz(FileMetadataProvider._get_block_metadata*   s
      "!    c                      | j         |fi |S N)r   r   s      r   __call__zFileMetadataProvider.__call__<   s    
 (t'88888r   N)	__name__
__module____qualname____doc__r   strr   r   r"    r   r   r   r   "   sr         "Cy" 
	" " " "$9Cy9 
	9 9 9 9 9 9r   r   c                       e Zd ZdZdee         dee         deee                  defdZ		 	 ddee         d	ed
         dee
         dedeeeef                  f
dZdS )BaseFileMetadataProvideram  Abstract callable that provides metadata for
    :class:`~ray.data.datasource.file_based_datasource.FileBasedDatasource`
    implementations that reuse the base :meth:`~ray.data.Datasource.prepare_read`
    method.

    Also supports file and file size discovery in input directory paths.

    Current subclasses:
        - :class:`DefaultFileMetadataProvider`
    r   rows_per_file
file_sizesr   c                    t           )a  Resolves and returns block metadata for files of a single dataset block.

        Args:
            paths: The file paths for a single dataset block. These
                paths will always be a subset of those previously returned from
                :meth:`.expand_paths`.
            rows_per_file: The fixed number of rows per input file, or None.
            file_sizes: Optional file size per input file previously returned
                from :meth:`.expand_paths`, where `file_sizes[i]` holds the size of
                the file at `paths[i]`.

        Returns:
            BlockMetadata aggregated across the given file paths.
        r   )r   r   r+   r,   s       r   r   z,BaseFileMetadataProvider._get_block_metadataQ   s
    * "!r   NF
filesystemr   partitioningignore_missing_pathsc                     t           )a  Expands all paths into concrete file paths by walking directories.

        Also returns a sidecar of file sizes.

        The input paths must be normalized for compatibility with the input
        filesystem prior to invocation.

        Args:
            paths: A list of file and/or directory paths compatible with the
                given filesystem.
            filesystem: The filesystem implementation that should be used for
                expanding all paths and reading their files.
            ignore_missing_paths: If True, ignores any file paths in ``paths`` that
                are not found. Defaults to False.

        Returns:
            An iterator of `(file_path, file_size)` pairs. None may be returned for the
            file size if it is either unknown or will be fetched later by
            `_get_block_metadata()`, but the length of
            both lists must be equal.
        r   r   r   r.   r/   r0   s        r   expand_pathsz%BaseFileMetadataProvider.expand_pathsh   s
    8 "!r   NFr#   r$   r%   r&   r   r'   r   intr   r   r   boolr   r   r3   r(   r   r   r*   r*   D   s        	 	"Cy"  }	"
 #'" 
" " " "6 04%*" "Cy" 34" |,	"
 #" 
%S/	"" " " " " "r   r*   c                       e Zd ZdZdee         dee         deee                  defdZ		 	 ddee         d	d
dee
         dedeeeef                  f
dZdS )DefaultFileMetadataProvidera,  Default metadata provider for
    :class:`~ray.data.datasource.file_based_datasource.FileBasedDatasource`
    implementations that reuse the base `prepare_read` method.

    Calculates block size in bytes as the sum of its constituent file sizes,
    and assumes a fixed number of rows per file.
    r   r+   r,   r   c                    |d }nt          |          |z  }t          |d |v rd nt          t          |                    |d           S )N)num_rows
size_bytesinput_files
exec_stats)lenr   r6   sum)r   r   r+   r,   r;   s        r   r   z/DefaultFileMetadataProvider._get_block_metadata   s`      HH5zzM1H#z11tts3z??7K7K	
 
 
 	
r   NFr.   r   r/   r0   c              #   :   K   t          ||||          E d {V  d S r!   )_expand_pathsr2   s        r   r3   z(DefaultFileMetadataProvider.expand_paths   s5       !
LBVWWWWWWWWWWWr   r4   r5   r(   r   r   r9   r9      s         
Cy
  }	

 #'
 

 
 
 
, 04%*X XCyX +X |,	X
 #X 
%S/	"X X X X X Xr   r9   c                   h    e Zd ZdZ	 	 ddee         dddee         ded	e	e
eef                  f
d
ZdS )FastFileMetadataProvidera  Fast Metadata provider for
    :class:`~ray.data.datasource.file_based_datasource.FileBasedDatasource`
    implementations.

    Offers improved performance vs.
    :class:`DefaultFileMetadataProvider`
    by skipping directory path expansion and file size collection.
    While this performance improvement may be negligible for local filesystems,
    it can be substantial for cloud storage service providers.

    This should only be used when all input paths exist and are known to be files.
    NFr   r.   r   r/   r0   r   c           	   #      K   |rt          d          t                              dt          |           d           t	          |t          j        d t          |                              E d {V  d S )Nz`ignore_missing_paths` cannot be set when used with `FastFileMetadataProvider`. All paths must exist when using `FastFileMetadataProvider`.zSkipping expansion of z path(s). If your paths contain directories or if file size collection is required, try rerunning this read with `meta_provider=DefaultFileMetadataProvider()`.)
ValueErrorloggerwarningr?   zip	itertoolsrepeatr2   s        r   r3   z%FastFileMetadataProvider.expand_paths   s         	4   	HSZZ H H H	
 	
 	
 ui.tSZZ@@AAAAAAAAAAAr   r4   )r#   r$   r%   r&   r   r'   r   r   r7   r   r   r6   r3   r(   r   r   rD   rD      s         " 04%*B BCyB +B |,	B
 #B 
%S/	"B B B B B Br   rD   errorr   r   c                     d}t          j        |t          |                     r.t          |t                    rd| d}t	          d| d          | )Nz^(?:(.*)AWS Error \[code \d+\]: No response body\.(.*))|(?:(.*)AWS Error UNKNOWN \(HTTP status 400\) during HeadObject operation: No response body\.(.*))|(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response body\.(.*))$"z Failing to read AWS S3 file(s): ak  . Please check that file exists and has properly configured access. You can also run AWS CLI command to get more detailed error message (e.g., aws s3 ls <file-name>). See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html and https://docs.ray.io/en/latest/data/creating-datasets.html#reading-from-remote-storage for more information.)rematchr'   
isinstanceOSError)rL   r   aws_error_patterns      r   _handle_read_os_errorrT      s{    
	  
x!3u::..  eS!! 	! !LLLE(5 ( ( (

 

 
	
 r   r.   r   partition_filterfile_extensionsc                B    t          t          | |||                    S )N)rU   rV   )list_list_files_internal)r   r.   rU   rV   s       r   _list_filesrZ      s4     -+		
 	
 	
  r   c             #      K   t                      }|                    | |          D ]3\  }}|r|                    |          rt          ||          s-||fV  4d S r!   )r9   r3   applyr   )r   r.   rU   rV   default_meta_providerpath	file_sizes          r   rY   rY     s       8990==eZPP  i
 	$**400	 't_==	
 Io r   Fr/   r0   c              #   6  	K   ddl m} ddlm} ddlm}m} t          ||          }t          |t                    r"t          |	                                |          }t          |           |k     s|rt          | ||          E d{V  dS t          j                            |           	 |	          sK|	 ||j                  k    st!          	fd| D                       rt#          | 	||          E d{V  dS t%          | ||          E d{V  dS )z/Get the file sizes for all provided file paths.r   )LocalFileSystem))FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD)_is_http_url_unwrap_protocolNc              3   l   K   | ].}t          t          j        |          j                  k    V  /d S r!   )r'   pathlibPathparent).0r^   common_paths     r   	<genexpr>z _expand_paths.<locals>.<genexpr>J  s=      SST3w|D))011[@SSSSSSr   )
pyarrow.fsra   )ray.data.datasource.file_based_datasourcerb   ray.data.datasource.path_utilrc   rd   rQ   r   unwrapr?   _get_file_infos_serialosr^   
commonpathbase_dirall"_get_file_infos_common_path_prefix_get_file_infos_parallel)
r   r.   r/   r0   ra   rb   rc   rd   is_localrj   s
            @r   rB   rB     s      +*****      MLLLLLLL *o66H*233 Dj//11?CC 	E

>>> 	? *%=QRRRRRRRRRRR g((// |K(( 	Y(#3#3L4I#J#JJJSSSSUSSSSS K :{J0D           0zCWXXXXXXXXXXXr   c              #   B   K   | D ]}t          |||          E d {V  d S r!   _get_file_infos)r   r.   r0   r^   s       r   rp   rp   U  sM      
  K K"45IJJJJJJJJJJK Kr   rj   zpyarrow.fs.FileSystemc              #     K   d | D             }t          |||          D ]\  }}||v r|||<   d}| D ],}||         "t                              d| d           d} n-|rt          | ||          E d {V  d S | D ]}|||         fV  d S )Nc                     i | ]}|d S r!   r(   )ri   r^   s     r   
<dictcomp>z6_get_file_infos_common_path_prefix.<locals>.<dictcomp>d  s    1114D$111r   FzFinding path zX not have file size metadata. Fall back to get files metadata in parallel for all paths.T)rz   rG   debugrv   )r   rj   r.   r0   path_to_sizer^   r_   have_missing_paths           r   ru   ru   ^  s%      215111L*Z!5  + +i <!*L
   %LLM M M M   !%E &  	++E:?STTTTTTTTTTT  	+ 	+DT******	+ 	+r   c              #   :  K   ddl m}mm} t                              dt          |            d            |          dt          t                   dt          t          t          t          f                  ffd}t          | ||          E d {V  d S )Nr   )PATHS_PER_FILE_SIZE_FETCH_TASK#_unwrap_s3_serialization_workaround!_wrap_s3_serialization_workaroundz
Expanding z path(s). This may be a HIGH LATENCY operation on some cloud storage services. Moving all the paths to a common parent directory will lead to faster metadata fetching.r   r   c                                t          t          j                            fd| D                                 S )Nc              3   :   K   | ]}t          |          V  d S r!   ry   )ri   r^   fsr0   s     r   rk   zH_get_file_infos_parallel.<locals>._file_infos_fetcher.<locals>.<genexpr>  sA       * *DHb*>??* * * * * *r   )rX   rJ   chainfrom_iterable)r   r   r   r.   r0   s    @r   _file_infos_fetcherz5_get_file_infos_parallel.<locals>._file_infos_fetcher  sf    00<<O)) * * * * *LQ* * *  
 
 	
r   )rm   r   r   r   rG   rH   r?   r   r'   r   r6   _fetch_metadata_parallel)r   r.   r0   r   r   r   r   s    ``   @r   rv   rv     s     
          NN	SZZ 	 	 	   32:>>J
49 
eCHo1F 
 
 
 
 
 
 
 
 ("$B          r   UriMetauris
fetch_funcdesired_uris_per_taskc              +     K   t          |          }|r |j        di |}t          t          |           |z  d          }t	          d|d          }g }t          j        | |          D ]>}t          |          dk    r|                    |                    |                     ?|	                    |          }	t          j                            |	          E d{V  dS )z0Fetch file metadata in parallel using Ray tasks.   zMetadata Fetch Progresstask)totalunitr   Nr(   )r   optionsmaxr?   r   nparray_splitappendremotefetch_until_completerJ   r   r   )
r   r   r   ray_remote_argsremote_fetch_funcparallelismmetadata_fetch_barfetch_tasks	uri_chunkresultss
             r   r   r     s      )44 I5-5HHHH c$ii#88!<<K$!6   K^D+66 @ @	y>>Q,33I>>???? 55kBBG,,W55555555555r   r^   ignore_missing_pathc                    ddl m} g }	 |                    |           }n'# t          $ r}t	          ||            Y d}~nd}~ww xY w|j        |j        k    r.t          | |          D ]\  }}|                    ||f           nO|j        |j	        k    r|                    | |j
        f           n"|j        |j        k    r|rnt          |           |S )z>Get the file info for all files at or under the provided path.r   )FileTypeN)rl   r   get_file_inforR   rT   type	Directory_expand_directoryr   FilesizeNotFoundFileNotFoundError)	r^   r.   r   r   
file_infos	file_infoe	file_pathr_   s	            r   rz   rz     s    $#####J',,T22		 ' ' 'a&&&&&&&&'~+++$5dJ$G$G 	6 	6 Iyy)45555	6	8=	(	(401111	8,	,	,1D	,%%%s     
A?Aexclude_prefixesc                    |ddg}ddl m}  || d|          }|                    |          }|j        }g }|D ]v}	|	j        s
|	j        }
|
                    |          s'|
t          |          d         t          fd|D                       rZ|	                    |
|	j
        f           wt          |          S )	a  
    Expand the provided directory path to a list of file paths.

    Args:
        path: The directory path to expand.
        filesystem: The filesystem implementation that should be used for
            reading these files.
        exclude_prefixes: The file relative path prefixes that should be
            excluded from the returned file set. Default excluded prefixes are
            "." and "_".

    Returns:
        An iterator of (file_path, file_size) tuples.
    N._r   )FileSelectorT)	recursiveallow_not_foundc              3   B   K   | ]}                     |          V  d S r!   )
startswith)ri   prefixrelatives     r   rk   z$_expand_directory.<locals>.<genexpr>  s1      JJvx""6**JJJJJJr   )rl   r   r   rs   is_filer^   r   r?   anyr   r   sorted)r^   r.   r   r   r   selectorfiles	base_pathoutfile_r   r   s              @r   r   r     s   ( :''''''|DDBUVVVH$$X..E!I
C 	, 	,} 	J	##I.. 	S^^--.JJJJ9IJJJJJ 	

Iuz*++++#;;r   )Fr4   )7rJ   loggingrq   rf   rO   typingr   r   r   r   r   r   r	   r
   numpyr   ray.data._internal.progress_barr   ray.data._internal.remote_fnr   ray.data._internal.utilr   ray.data.blockr    ray.data.datasource.partitioningr   r   rn   r   ray.util.annotationsr   pyarrow	getLoggerr#   rG   r   r*   r9   rD   rR   r'   rT   r6   rZ   rY   r7   rB   rp   ru   rv   r   r   r   rz   r   r(   r   r   <module>r      s        				  					 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	     7 7 7 7 7 7 9 9 9 9 9 9 8 8 8 8 8 8 ( ( ( ( ( ( N N N N N N N N = = = = = = - - - - - - NNN 
	8	$	$ 9 9 9 9 9 9 9 9B ?" ?" ?" ?" ?"3 ?" ?" ?"D "X "X "X "X "X": "X "X "XJ "B "B "B "B "B: "B "B "BJ   sDI~1F  3        F9& 23	
 d3i( 
%S/   "9& 23	
 d3i( eCHo   6 "'	4Y 4Y94Y&4Y <(4Y 	4Y
 eCHo4Y 4Y 4Y 4Yt "'K K9K&K K eCHo	K K K K "'	#+ #+9#+#+ (#+ 	#+
 eCHo#+ #+ #+ #+R "'   9 &    eCHo	       F gennwv6
s)6$s)d4j016 6
 d^6 6 6 66 PU 
1HL	%S/   6 -1 %	( (
(&( tCy)( 	(
 
%S/( ( ( ( ( (r   