
    Pi2                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZ d dlmZ d dlmZmZ ddlmZ ddlmZ dd	lmZ  ee          Z G d
 d          Z G d de
          Z G d dee
          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z  G d de          Z! G d de          Z" G d de          Z# G d  d!e          Z$ G d" d#          Z%dS )$    N)ABCabstractmethod)Path)OptionalUnion   )config   )FileLock)
get_loggerc                   b    e Zd Zddee         fdZdedefdZdededefd	ZddededefdZ	dS )ExtractManagerN	cache_dirc                     |r*t           j                            |t          j                  nt          j        | _        t          | _        d S N)	ospathjoinr	   EXTRACTED_DATASETS_DIREXTRACTED_DATASETS_PATHextract_dir	Extractor	extractor)selfr   s     j/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/datasets/utils/extract.py__init__zExtractManager.__init__   s;    FOsBGLLF$ABBBU[Us 	 #    r   returnc                     ddl m} t          j                            |          }t          j                            | j         ||                    S )Nr
   )hash_url_to_filename)
file_utilsr    r   r   abspathr   r   )r   r   r    abs_paths       r   _get_output_pathzExtractManager._get_output_path   sN    444444 7??4((w||D,.B.B8.L.LMMMr   output_pathforce_extractc                     |pSt           j                            |           o3t           j                            |          ot          j        |           S r   )r   r   isfileisdirlistdir)r   r%   r&   s      r   _do_extractzExtractManager._do_extract%   sK     
{+++lRW]];5O5O5kTVT^_jTkTk0l	
r   F
input_pathc                     | j                             |          }|s|S |                     |          }|                     ||          r| j                             |||           |S r   )r   infer_extractor_formatr$   r+   extract)r   r,   r&   extractor_formatr%   s        r   r/   zExtractManager.extract*   ss    >@@LL 	++J77K77 	NN"":{<LMMMr   r   F)
__name__
__module____qualname__r   strr   r$   boolr+   r/    r   r   r   r      s        # #(3- # # # #NS NS N N N N
s 
4 
D 
 
 
 

 # d s      r   r   c                       e Zd Zeedeeef         defd                        Z	e
edeeef         deeef         ddfd                        ZdS )BaseExtractorr   r   c                     d S r   r7   clsr   kwargss      r   is_extractablezBaseExtractor.is_extractable5   s    GJsr   r,   r%   Nc                     d S r   r7   )r,   r%   s     r   r/   zBaseExtractor.extract9   s    VYVYr   )r2   r3   r4   classmethodr   r   r   r5   r6   r>   staticmethodr/   r7   r   r   r9   r9   4   s        J%c	"2JJJJ ^ [JYE$),Y5s;KYPTYYY ^ \YYYr   r9   c                       e Zd ZU g Zee         ed<   edee	e
f         defd            Zed
dee	e
f         dedefd            Zd	S )MagicNumberBaseExtractormagic_numbersr   magic_number_lengthc                     t          | d          5 }|                    |          cd d d            S # 1 swxY w Y   d S )Nrb)openread)r   rE   fs      r   read_magic_numberz*MagicNumberBaseExtractor.read_magic_numberA   s    $ 	/66-..	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/s   377r   magic_numberr   c                     sGt          d | j        D                       }	 |                     ||          n# t          $ r Y dS w xY wt	          fd| j        D                       S )Nc              3   4   K   | ]}t          |          V  d S r   )len).0cls_magic_numbers     r   	<genexpr>z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>I   s,      %f%f@Pc*:&;&;%f%f%f%f%f%fr   Fc              3   B   K   | ]}                     |          V  d S r   )
startswith)rP   rQ   rL   s     r   rR   z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>N   s3      ggAQ<**+;<<ggggggr   )maxrD   rK   OSErrorany)r<   r   rL   rE   s     ` r   r>   z'MagicNumberBaseExtractor.is_extractableF   s     	"%%f%fTWTe%f%f%f"f"f"44T;NOO   uuggggUXUfggggggs   : 
AANr   )r2   r3   r4   rD   listbytes__annotations__rA   r   r   r5   intrK   r@   r6   r>   r7   r   r   rC   rC   >   s         !#M4;###/dCi 0 /s / / / \/ h h%c	"2 h% hRV h h h [h h hr   rC   c                       e Zd Zedeeef         defd            Ze	d             Z
e	deeef         deeef         ddfd            ZdS )	TarExtractorr   r   c                 *    t          j        |          S r   )tarfile
is_tarfiler;   s      r   r>   zTarExtractor.is_extractableR   s    !$'''r   c              #   P  K   dt           dt           fddt           dt           dt          ffddt           dt          ffd} |          }| D ]} |j        |          r$t                              d|j         d           7|                                r7 |||          r+t                              d|j         d	|j                    |                                r7 |||          r+t                              d|j         d
|j                    |V  dS )a  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309
        r   r   c                 z    t           j                            t           j                            |                     S r   )r   r   realpathr"   )r   s    r   resolvedz*TarExtractor.safemembers.<locals>.resolvedb   s&    7##BGOOD$9$9:::r   basec                 ~     t           j                            ||                                         |           S r   )r   r   r   rT   )r   rf   re   s     r   badpathz)TarExtractor.safemembers.<locals>.badpathe   s4    xT4 8 899DDTJJJJr   c                      t           j                            |t           j                            | j                                      } | j        |          S )N)rf   )r   r   r   dirnamenamelinkname)inforf   tiprh   re   s      r   badlinkz)TarExtractor.safemembers.<locals>.badlinki   sI    (27<<bgoodi.H.HIIJJC74=s3333r   zExtraction of z is blocked (illegal path)z is blocked: Symlink to z is blocked: Hard link to N)r5   r6   rk   loggererrorissymrl   islnk)membersr%   ro   rf   finforh   re   s        @@r   safememberszTarExtractor.safemembersV   s     	;3 	;3 	; 	; 	; 	;	K# 	KS 	KT 	K 	K 	K 	K 	K 	K	4 	4 	4 	4 	4 	4 	4 	4 	4
 x$$ 	 	Ewuz4(( TejTTTUUUU 775$#7#7 bejbbRWR`bbcccc 775$#7#7 dejddTYTbddeeee	 	r   r,   r%   Nc                     t          j        |d           t          j        |           }|                    |t
                              ||                     |                                 d S )NTexist_ok)rt   )r   makedirsr`   rH   
extractallr^   rv   close)r,   r%   tar_files      r   r/   zTarExtractor.extractz   sf    
K$////<
++K1I1I(T_1`1`aaar   )r2   r3   r4   r@   r   r   r5   r6   r>   rA   rv   r/   r7   r   r   r^   r^   Q   s        (%c	"2 ( ( ( ( [( ! ! \!F E$), 5s;K PT    \  r   r^   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )GzipExtractors   r,   r%   r   Nc                     t          j        | d          5 }t          |d          5 }t          j        ||           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S NrG   wb)gziprH   shutilcopyfileobj)r,   r%   	gzip_fileextracted_files       r   r/   zGzipExtractor.extract   s    Yz4(( 	>Ik4(( >N"9n===> > > > > > > > > > > > > > >	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>3   A!A	A!	A	A!A	A!!A%(A%	r2   r3   r4   rD   rA   r   r   r5   r/   r7   r   r   r   r      sa         MM>E$), >5s;K >PT > > > \> > >r   r   c                        e Zd Zg dZeddeeef         dede	f fd            Z
edeeef         deeef         dd	fd
            Z xZS )ZipExtractor)s   PKs   PKs   PKr   r   rL   r   c                    t                                          ||          rdS 	 ddlm}m}m}m}m}m}m	}	m
}
m}m} t          |d          5 } |	|          }|r||         dk    r&||         dk    r||         dk    r	 d d d            dS ||         ||         k    r|                    ||                    |                                ||         k    rc||         |
k    rW|                    |
          }t#          |          |
k    r/t%          j        ||          }||         |k    r	 d d d            dS d d d            n# 1 swxY w Y   dS # t(          $ r Y dS w xY w)NrL   Tr   )
_CD_SIGNATURE_ECD_DISK_NUMBER_ECD_DISK_START_ECD_ENTRIES_TOTAL_ECD_OFFSET	_ECD_SIZE_EndRecDatasizeCentralDirstringCentralDirstructCentralDirrG   F)superr>   zipfiler   r   r   r   r   r   r   r   r   r   rH   seektellrI   rO   structunpack	Exception)r<   r   rL   r   r   r   r   r   r   r   r   r   r   fpendrecdatacentdir	__class__s                    r   r>   zZipExtractor.is_extractable   sk   77!!$\!BB 	4	                        dD!! 0R$R 
001Q666);LPQ;Q;QV\]hVimnVnVn#	0 0 0 0 0 0 0 0
   01VO5LLL{ 34447799{(;;;y@QUc@c@c#%77>#:#:D"4yyN::*0-8H$*O*O#*=#9=M#M#M+/0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 	 	 	55	sH   (E 3E
E B!E
1E >E 
EE EE 
E%$E%r,   r%   Nc                     t          j        |d           t          j        | d          5 }|                    |           |                                 d d d            d S # 1 swxY w Y   d S )NTrx   r)r   rz   r   ZipFiler{   r|   )r,   r%   zip_files      r   r/   zZipExtractor.extract   s    
K$////_Z-- 	,,,NN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   *A##A'*A'rX   )r2   r3   r4   rD   r@   r   r   r5   rZ   r6   r>   rA   r/   __classcell__)r   s   @r   r   r      s          M " "%c	"2 "% "RV " " " " " ["H E$), 5s;K PT    \    r   r   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )XzExtractors   7zXZ r,   r%   r   Nc                     t          j        |           5 }t          |d          5 }t          j        ||           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )Nr   )lzmarH   r   r   r,   r%   compressed_filer   s       r   r/   zXzExtractor.extract   s   Yz"" 	Dok4(( DN"?NCCCD D D D D D D D D D D D D D D	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	Ds3   A AA A	A A	A  A$'A$r   r7   r   r   r   r      sk        01MDE$), D5s;K DPT D D D \D D Dr   r   c                   Z    e Zd ZddgZedeeef         deeef         ddfd            ZdS )RarExtractors   Rar! s   Rar! r,   r%   r   Nc                     t           j        st          d          dd l}t	          j        |d           |                    |           }|                    |           |                                 d S )NzPlease pip install rarfiler   Trx   )	r	   RARFILE_AVAILABLEImportErrorrarfiler   rz   RarFiler{   r|   )r,   r%   r   rfs       r   r/   zRarExtractor.extract   sn    ' 	<:;;;
K$////__Z((
k"""





r   r   r7   r   r   r   r      se        (*ABME$), 5s;K PT    \  r   r   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )ZstdExtractors   (/r,   r%   r   Nc                 :   t           j        st          d          dd l}|                                }t          | d          5 }t          |d          5 }|                    ||           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )NzPlease pip install zstandardr   rG   r   )r	   ZSTANDARD_AVAILABLEr   	zstandardZstdDecompressorrH   copy_stream)r,   r%   zstddctxifhofhs         r   r/   zZstdExtractor.extract   s!   ) 	><===    $$&&*d## 	'sDd,C,C 	'sS#&&&	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	's6   BA8,B8A<	<B?A<	 BBBr   r7   r   r   r   r      sb        ()M'E$), '5s;K 'PT ' ' ' \' ' 'r   r   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )Bzip2Extractors   BZhr,   r%   r   Nc                     t          j        | d          5 }t          |d          5 }t          j        ||           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S r   )bz2rH   r   r   r   s       r   r/   zBzip2Extractor.extract   s   Xj$'' 	D?k4(( DN"?NCCCD D D D D D D D D D D D D D D	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	Dr   r   r7   r   r   r   r      sk        $%MDE$), D5s;K DPT D D D \D D Dr   r   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )SevenZipExtractors   7z'r,   r%   r   Nc                     t           j        st          d          dd l}t	          j        |d           |                    | d          5 }|                    |           d d d            d S # 1 swxY w Y   d S )NzPlease pip install py7zrr   Trx   r   )r	   PY7ZR_AVAILABLEr   py7zrr   rz   SevenZipFiler{   )r,   r%   r   archives       r   r/   zSevenZipExtractor.extract   s    % 	:8999
K$////
C00 	,G{+++	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	,s   A//A36A3r   r7   r   r   r   r      sb        01M,E$), ,5s;K ,PT , , , \, , ,r   r   c                   X    e Zd ZdgZedeeef         deeef         ddfd            ZdS )Lz4Extractors   "Mr,   r%   r   Nc                 &   t           j        st          d          dd l}|j                            | d          5 }t          |d          5 }t          j        ||           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )NzPlease pip install lz4r   rG   r   )r	   LZ4_AVAILABLEr   	lz4.frameframerH   r   r   )r,   r%   lz4r   r   s        r   r/   zLz4Extractor.extract  s7   # 	86777Y^^J-- 	Dk4(( DN"?NCCCD D D D D D D D D D D D D D D	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	D 	Ds5   BA."B.A2	2B5A2	6BB
B
r   r7   r   r   r   r      sk        ()MDE$), D5s;K DPT D D D \D D Dr   r   c            
       N   e Zd ZU eeeeeee	e
ed	Zeeee         f         ed<   ed             Zedeeef         defd            Zeddeeef         ded	efd
            Zedeeef         d	ee         fd            Zedeeef         deeef         ded	dfd            ZdS )r   )	tarr   zipxzrarr   r   7zr   
extractorsc                 b    t          d | j                                        D                       S )Nc              3   r   K   | ]2}t          |t                    |j        D ]}t          |          V  3d S r   )
issubclassrC   rD   rO   )rP   r   extractor_magic_numbers      r   rR   z9Extractor._get_magic_number_max_length.<locals>.<genexpr>  sn       
 
)%=>>
 +4*A	
 
 ' &''
 
 
 
 
 
 
r   )rU   r   values)r<   s    r   _get_magic_number_max_lengthz&Extractor._get_magic_number_max_length  s>     
 
 ^2244
 
 
 
 
 	
r   r   rE   c                 ^    	 t                               | |          S # t          $ r Y dS w xY w)N)rE   r   )rC   rK   rV   )r   rE   s     r   _read_magic_numberzExtractor._read_magic_number$  sC    	+==dXk=lll 	 	 	33	s    
,,Freturn_extractorr   c                     t          j        dt                     |                     |          }|r|sdnd| j        |         fS |sdndS )Nz{Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'infer_extractor_format' instead.)categoryTF)FN)warningswarnFutureWarningr.   r   )r<   r   r   r0   s       r   r>   zExtractor.is_extractable+  sm    4"	
 	
 	
 	

 55d;; 	^/]44dCNK[<\5]],?uu-?r   c                     |                                  }|                     ||          }| j                                        D ] \  }}|                    ||          r|c S !d S )Nr   )r   r   r   itemsr>   )r<   r   magic_number_max_lengthrL   r0   r   s         r   r.   z Extractor.infer_extractor_format7  s    "%"B"B"D"D--d4KLL+.>+?+?+A+A 	( 	('i''<'HH (''''(	( 	(r   r,   r%   r0   Nc                    t          j        t           j                            |          d           t	          t          |                              d                    }t          |          5  t          j	        |d           | j
        |         }|                    ||          cd d d            S # 1 swxY w Y   d S )NTrx   z.lock)ignore_errors)r   rz   r   rj   r5   r   with_suffixr   r   rmtreer   r/   )r<   r,   r%   r0   	lock_pathr   s         r   r/   zExtractor.extract?  s     	BGOOK004@@@@[))55g>>??	i   	> 	>M+T::::'78I$$Z==	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   29B88B<?B<r1   )r2   r3   r4   r^   r   r   r   r   r   r   r   r   r   dictr5   typer9   r[   r@   r   rA   r   r   r\   r   r6   r>   r   r.   r/   r7   r   r   r   r     s         
2 
2JS$}--. 
 
 
 
 
 [
 tSy!1     \ 	@ 	@%c	"2 	@d 	@W[ 	@ 	@ 	@ [	@ (%c	*: (x} ( ( ( [( >$)$> 49%> 	>
 
> > > [> > >r   r   )&r   r   r   r   r   r   r`   r   r   abcr   r   pathlibr   typingr   r    r	   	_filelockr   loggingr   r2   rp   r   r9   rC   r^   r   r   r   r   r   r   r   r   r   r7   r   r   <module>r      s;   



   				      # # # # # # # #       " " " " " " " "                   
H		       <Z Z Z Z ZC Z Z Zh h h h h}c h h h&. . . . .= . . .b> > > > >, > > >1 1 1 1 1+ 1 1 1hD D D D D* D D D    +   ' ' ' ' ', ' ' 'D D D D D- D D D, , , , ,0 , , ,D D D D D+ D D D?> ?> ?> ?> ?> ?> ?> ?> ?> ?>r   