
    Pi]9                        d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
mZmZ d dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ ej         j!        "                    e#          Z$e
rd dl%Z%d dl&Z%e G d dej'                              Z(ddde)e*         fdZ+	 dddde)e*         dee,         fdZ- G d de          Z. G d dej/                  Z0dS )    N)Iterable)	dataclass)islice)TYPE_CHECKINGOptionalUnion)ArrowWriterParquetWriter)MAX_SHARD_SIZE)is_remote_filesystemrename)_BaseExamplesIterable)experimental)convert_file_size_to_intc                   H     e Zd ZU dZdZeej                 ed<    fdZ	 xZ
S )SparkConfigzBuilderConfig for Spark.Nfeaturesc                 H    t                                                       d S N)super__post_init__)self	__class__s    y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/datasets/packaged_modules/spark/spark.pyr   zSparkConfig.__post_init__%   s        )__name__
__module____qualname____doc__r   r   datasetsFeatures__annotations__r   __classcell__r   s   @r   r   r      sX         "",0Hhx()000                 r   r   dfpyspark.sql.DataFramenew_partition_orderc                    |                      d                              d|d                    }|dd          D ]B}|                      d                              d|           }|                    |          }C|S )N*z
part_id = r      )selectwhereunion)r%   r'   df_combinedpartition_idpartition_dfs        r   _reorder_dataframe_by_partitionr1   )   s    ))C..&&'L4G4J'L'LMMK+ABB/ 6 6yy~~++,G,G,GHH!''55r   partition_order
state_dictc              #      K   dd l }|                     d|j        j                                                            d                    }|r|d         nd}t          |||d                    }|                    d          }d }|r|d         nd}	t          ||	d           D ]h}
|
	                                }|d         }|
                    d           ||k    r|r||dxx         dz  cc<   |}d}	|r|	dz   |d<   ||	f|fV  |	dz  }	id S )	Nr   r)   part_idpartition_idxT)prefetchPartitionspartition_example_idxr*   )pysparkr+   sql	functionsspark_partition_idaliasr1   toLocalIteratorr   asDictpop)r%   r2   r3   r9   df_with_partition_idpartition_idx_startr0   rowscurr_partitionrow_idrowrow_as_dictr5   s                r   _generate_iterable_examplesrH   1   sj     
 NNN99S'+*?*R*R*T*T*Z*Z[d*e*eff9CJ*_5523GYlYmYmInooL''4'@@DN4>EZ/00AFdFD))  jjlli(	"""W$$ 1n8?+++q0+++$NF 	=281*J./,,,,! r   c                        e Zd Z	 d	 d fdZdefdZededef fd            Zd	 Zd
e	j
        j        dd fdZddededd fdZedefd            Z xZS )SparkExamplesIterableNr%   r&   c                     t                                                       || _        |p*t          | j        j                                                  | _        d S r   )r   __init__r%   rangerddgetNumPartitionsr2   )r   r%   r2   r   s      r   rL   zSparkExamplesIterable.__init__O   sM    
 	.W%8T8T8V8V2W2Wr   returnc                 $    ddd| _         | j         S )Nr   )r6   r8   )_state_dictr   s    r   _init_state_dictz&SparkExamplesIterable._init_state_dictX   s    -.KKr   r3   c                 F    t                                          |          S r   )r   load_state_dict)r   r3   r   s     r   rV   z%SparkExamplesIterable.load_state_dict\   s    ww&&z222r   c              #   V   K   t          | j        | j        | j                  E d {V  d S r   )rH   r%   r2   rR   rS   s    r   __iter__zSparkExamplesIterable.__iter__`   s8      .tw8LdN^___________r   	generatorc                     t          t          | j        j                                                            }|                    |           t          | j        |          S )Nr2   )listrM   r%   rN   rO   shufflerJ   )r   rY   r2   s      r   shuffle_data_sourcesz*SparkExamplesIterable.shuffle_data_sourcesc   sP    uTW[%A%A%C%CDDEE/***$TWoNNNNr   T
num_shardsindexc                 ^    |                      |||          }t          | j        |          S )N)r_   r`   
contiguousr[   )split_shard_indices_by_workerrJ   r%   )r   r_   r`   rb   r2   s        r   shard_data_sourcesz(SparkExamplesIterable.shard_data_sourcesh   s3    <<
Z_lv<ww$TWoNNNNr   c                 *    t          | j                  S r   )lenr2   rS   s    r   r_   z SparkExamplesIterable.num_shardsl   s    4'(((r   r   )r%   r&   )T)r   r   r   rL   dictrT   r   rV   rX   nprandom	Generatorr^   intrd   propertyr_   r#   r$   s   @r   rJ   rJ   N   sB        X#X X X X X X $         3$ 34 3 3 3 3 3 \3` ` `Obi.A OF] O O O O
O OS O ORi O O O O )C ) ) ) X) ) ) ) )r   rJ   c                       e Zd ZeZ	 	 ddddedef fdZd Zd Zd	e	j
        j        j        fd
Zd Zdedededeeeeeeef         f                  fdZ	 	 	 ddddedeeeef                  dee         fdZdddefdZ xZS )SparkNr%   r&   	cache_dirworking_dirc                     dd l }|j        j        j                                        | _        || _        || _         t                      j	        d|t          | j                                                  d| d S )Nr   )ro   config_name )r9   r:   SparkSessionbuildergetOrCreate_sparkr%   _working_dirr   rL   strsemanticHash)r   r%   ro   rp   config_kwargsr9   r   s         r   rL   zSpark.__init__t   s     	k.6BBDD' 	
DG002233	
 	
 	
 	
 	
 	
 	
r   c                    | j         fd}| j        j                            dd                              d          rd S | j         ry| j        j                            t          d          d                              |          	                                }t          j                            |d                   rd S t          d          )Nc                     t          j        d           t           j                            dt	          j                    j        z             }t          |d           |gS )NT)exist_okfs_testa)osmakedirspathjoinuuiduuid4hexopen)context
probe_filero   s     r   create_cache_and_write_probez?Spark._validate_cache_dir.<locals>.create_cache_and_write_probe   sW     K	D1111iTZ\\=M1MNNJ S!!!<r   zspark.master localr*   r   ztWhen using Dataset.from_spark on a multi-node cluster, the driver and all workers should be able to access cache_dir)
_cache_dirrw   confget
startswithsparkContextparallelizerM   mapPartitionscollectr   r   isfile
ValueError)r   r   probero   s      @r   _validate_cache_dirzSpark._validate_cache_dir   s     O		  	  	  	  	  ;33>>wGG 	F
 ? 	(44U1XXqAAOOPlmmuuww  w~~eAh''  C
 
 	
r   c                 @    t          j        | j        j                  S )N)r   )r    DatasetInfoconfigr   rS   s    r   _infozSpark._info   s    #T[-ABBBBr   
dl_managerc                 L    t          j        t           j        j                  gS )N)name)r    SplitGeneratorSplitTRAIN)r   r   s     r   _split_generatorszSpark._split_generators   s    'X^-ABBBCCr   c                 :   dd l }d }| j                                        }|dk    r|nd}| j                            |                              d                              |d                              |j        j        	                    d          
                    d                                                    d         j        |z  }||z  }||k    rAt          |t          ||z                      }| j                            |          | _        d S d S )Nr   c              3   f   K   | D ]+}t           j                            d|j        gi          V  ,d S )Nbatch_bytes)paRecordBatchfrom_pydictnbytes)itbatchs     r   get_arrow_batch_sizez=Spark._repartition_df_if_needed.<locals>.get_arrow_batch_size   sK       R Rn00-%,1PQQQQQQR Rr   d   r*   zbatch_bytes: longr   sample_bytes)r9   r%   countlimitrepartition
mapInArrowaggr:   r;   sumr=   r   r   minrk   )	r   max_shard_sizer9   r   df_num_rowssample_num_rowsapprox_bytes_per_rowapprox_total_sizenew_num_partitionss	            r   _repartition_df_if_neededzSpark._repartition_df_if_needed   s   	R 	R 	R gmmoo)4););++ GMM/**[^^Z,.ABBS&**=99??OOPPWYYq	
  	 1;>~--!$[#6G.6X2Y2Y!Z!Zg))*<==DGGG .-r   fpathfile_formatr   rP   c           	   #     	
K   dd l 	|dk    rt          nt          | j        rBt          j                            | j        t          j                                                n|dk    | j        j	        | j
        | j        j        
	
f	d}| j                            |d                              d                              	j        j                            d                              d          	j        j                            d                              d	          	j        j                            d                              d
          	j        j                            d                              d                                                    }|D ]%}|j        |j        |j        |j        |j        ffV  &d S )Nr   parquetc           	   3     	K    j                                                     }t          | d           }|)t          j                            |gdgdggg d          S d}                     d|d                              d|d                    }t          j                            |g          }|	                    |           | D ]}|j
        k    r|                                \  }}|                                 t          j                            |g|g|ggg d          V  |dz  } |j                            d|d                              d|d                    }t          j                            |g          }|	                    |           |j
        dk    rV|                                \  }}|                                 t          j                            |g|g|ggg d          V  k    rt          j        t          j                                                D ]y}	t          j                            t          j                                      t          j                            |	                    }
t(                              |	|
           xd S d S )	Nr   )task_idnum_examples	num_bytes)namesSSSSS05dTTTTT)r   r   writer_batch_sizestorage_optionsembed_local_filesr*   )TaskContexttaskAttemptIdnextr   r   from_arraysreplaceTablefrom_batcheswrite_table
_num_bytesfinalizeclose	_featuresr   listdirr   dirnamer   basenameshutilmove)r   r   first_batchshard_idwritertabler   r   r   filedestr   r   r   r   r9   r   working_fpathr   writer_classs              r   write_arrowz0Spark._prepare_split_single.<locals>.write_arrow   s     )g)++99;;Gr4..K"~11YaS)BBB 2    H!\!"**7x4E4EFFNNw[bXhXhii"3 /"3  F H));-88Eu%%% * *!-&2C~2U2U.4oo.?.?+L)LLNNN.44!\NYK@FFF 5      MH)\!'!1*227x<M<MNNVVW^cj`p`pqq*;(7*;  F --ug66""5)))) 1$$*0//*;*;'in00Y<BBB 1     
 %%Jrw}'E'EFF , ,D7<<(>(>@P@PQU@V@VWWDKKd++++ &%, ,r   z2task_id: long, num_examples: long, num_bytes: longr   r   total_num_examplesr   total_num_bytesr_   shard_lengths)r9   r
   r	   rx   r   r   r   r   r   r   _writer_batch_size_fsr   r%   r   groupByr   r:   r;   r   r=   r   collect_listr   r   r   r   r_   r   )r   r   r   r   r   statsrF   r   r   r9   r   r   r   r   s    ` `   @@@@@@@r   _prepare_split_singlezSpark._prepare_split_single   s      	(3y(@(@}}kTXTepT%68H8H8O8OPPPkp'94 ;' 3(22	, 2	, 2	, 2	, 2	, 2	, 2	, 2	, 2	, 2	, 2	, 2	, 2	,j G{,`aaWYS%)).99??@TUU%))+66<<=NOO%++K88>>|LL%22>BBHHYY	  WYY 	  	p 	pC+ 68KS^]`]noooooo	p 	pr   arrowsplit_generatorzdatasets.SplitGeneratornum_procc                 0   |                                   t          |pt                    }|                     |           t	          | j                   }|rt          j        j        nt          j        }d}| j
         d|j
         | d| }	 || j        |	          d}
d}dg }g }|                     ||          D ]M\  }}|\  }}}}|dk    r;|
|z  }
||z  }|z  |                    ||f           |                    |           N|
|j        _        ||j        _        t$                              d d           dk    r||j        _        | j        dt*          d	t*          d
t*          ffdg }d}t-          t/          |                    D ]<}||         \  }}t-          |          D ]}|                    |||g           |dz  } =| j        j                            |t/          |                                        fd                                           d S d}|d         d         }|                                         d|d                              d|d                              |d                     d S )Nz-TTTTT-SSSSS-of-NNNNN-.r   z	Renaming z shards.r*   r   r   global_shard_idc           	          t                              d|d                              d| d                              d|d                              dd                     d S )Nr   r   r   zTTTTT-SSSSSNNNNN)r   r   )r   r   r   r   fstotal_shardss      r   _rename_shardz+Spark._prepare_split.<locals>._rename_shardO  s    
 MM'h+<+<==EEgRYO_O_``MM-O1I1IJJRRSZ_k\q\qrr    r   c                      |  S r   rs   )argsr   s    r   <lambda>z&Spark._prepare_split.<locals>.<lambda>a  s    S`S`bfSg r   r   r   r   r   )r   r   r   r   r   r   r   r   r   	posixpathr   _output_dirr   appendextend
split_infor   r   loggerdebugr   rk   rM   rf   rw   r   r   mapr   _renamer   )r   r   r   r   r   kwargsis_local	path_joinSUFFIXfnamer   r   task_id_and_num_shardsall_shard_lengthsr   contentr   r   r_   r   r   r   ir   r   r   r   r   s                           @@@@r   _prepare_splitzSpark._prepare_split  s&    	  """1.2RNSS&&~666+DH555$,@BGLL).	(9KK3KVKKkKK	$*E22!# $ : :5+~ ^ ^ 	8 	8GW 1}}"l2"9,
*&--w
.CDDD!((7772D"//>", 	7777888!7HO&4 B			 "%	 	 	 	 	 	 	 	 DO356677 ) )&<Q&?# %j 1 1 ) )HKK(O DEEE#q(OO) K$00s4yyAAEEFgFgFgFghhpprrrrr H,Q/2GLLg('8'899AA'gK[K[\\fb))    r   c                 *    t          | j                  S r   )rJ   r%   )r   r   s     r    _get_examples_iterable_for_splitz&Spark._get_examples_iterable_for_splitk  s     %TW---r   )NN)r   NN)r   r   r   r   BUILDER_CONFIG_CLASSry   rL   r   r   r    downloaddownload_managerDownloadManagerr   r   rk   r   tupleboolr   r   r   r  rJ   r  r#   r$   s   @r   rn   rn   q   s       &
 	
 
#
 
 	
 
 
 
 
 
&
 
 
BC C CDH,=,N,^ D D D D> > >2RpRp Rp 	Rp
 
%T5e#445	6Rp Rp Rp Rpn #48"&N N2N N !sCx1	N
 3-N N N N`.2. 
. . . . . . . .r   rn   r   )1r   r   r   collections.abcr   dataclassesr   	itertoolsr   typingr   r   r   numpyrh   pyarrowr   r    datasets.arrow_writerr	   r
   datasets.configr   datasets.filesystemsr   r   datasets.iterable_datasetr   datasets.utilsr   datasets.utils.py_utilsr   utilslogging
get_loggerr   r  r9   pyspark.sqlBuilderConfigr   r\   rk   r1   rg   rH   rJ   DatasetBuilderrn   rs   r   r   <module>r-     si   				      $ $ $ $ $ $ ! ! ! ! ! !       1 1 1 1 1 1 1 1 1 1          < < < < < < < < * * * * * *        < ; ; ; ; ; ' ' ' ' ' ' < < < < < < 
		*	*8	4	4 NNN          ((      (? VZ[^V_     "& #Y    : )  )  )  )  )1  )  )  )F~. ~. ~. ~. ~.H# ~. ~. ~. ~. ~.r   