
    &`iZ                         d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZmZmZmZ d dlmZ d dlmZmZ  ed	           G d
 d                      ZdS )    N)defaultdict)DictListLiteralOptionalUnion)ActorHandle)DataIteratorDatasetExecutionOptions	NodeIdStr)ExecutionResources)DeveloperAPI	PublicAPIstable)	stabilityc                   F   e Zd ZdZ	 	 	 ddeed         ee         f         deee	e
ee	f         f                  defdZd	ed
efdZdede	fdZede
eef         dedeee                  deee                  dee
eef                  f
d            Zede	fd            ZdS )
DataConfigzClass responsible for configuring Train dataset preprocessing.

    For advanced use cases, this class can be subclassed and the `configure()` method
    overriden for custom data preprocessing.
    allNTdatasets_to_splitexecution_optionsenable_shard_localityc                    t          |t                    s|dk    r|| _        n(t          dt	          |          j         d| d          t                                          t          |t                    r|t          fd          | _
        t          |t                    r| j
                            |           || _        d| _        d| _        dS )a  Construct a DataConfig.

        Args:
            datasets_to_split: Specifies which datasets should be split among workers.
                Can be set to "all" or a list of dataset names. Defaults to "all",
                i.e. split all datasets.
            execution_options: The execution options to pass to Ray Data. Can be either:
                1. A single ExecutionOptions object that is applied to all datasets.
                2. A dict mapping dataset names to ExecutionOptions. If a dataset name
                is not in the dict, it defaults to ``DataConfig.default_ingest_options()``.
                By default, the options are optimized for data ingest. When overriding,
                base your options off ``DataConfig.default_ingest_options()``.
            enable_shard_locality: If true, dataset sharding across Train workers will
                consider locality to minimize cross-node data transfer. Enabled by default.
        r   zV`datasets_to_split` should be a 'all' or a list of strings of dataset names. Received z with value .c                  ,    t          j                   S )N)copydeepcopy)default_execution_optionss   s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/train/_internal/data_config.py<lambda>z%DataConfig.__init__.<locals>.<lambda>9   s    DM";<<     g        N)
isinstancelist_datasets_to_split	TypeErrortype__name__r   default_ingest_optionsr   r   _execution_optionsdictupdate_enable_shard_locality_num_train_cpus_num_train_gpus)selfr   r   r   r   s       @r   __init__zDataConfig.__init__   s   . '.. 	2Cu2L2L&7D##V)**3V VARV V V   %/$E$E$G$G!')9:: 	:(9%?J<<<<@
 @
 '.. 	>#**+<===&;#""r!   num_train_cpusnum_train_gpusc                 "    || _         || _        dS )zSet the total number of CPUs and GPUs used by training.

        If CPU or GPU resource limits are not set, they will be set to the
        total cluster resources minus the resources used by training.
        N)r-   r.   )r/   r1   r2   s      r   set_train_total_resourcesz$DataConfig.set_train_total_resourcesC   s      .-r!   dataset_namereturnc                 @    t          j        | j        |                   S )zKReturn a copy of the configured execution options for a given dataset name.)r   r   r)   )r/   r5   s     r   _get_execution_optionsz!DataConfig._get_execution_optionsM   s    }T4\BCCCr!   datasets
world_sizeworker_handlesworker_node_idsc                 H   d t          |          D             }|                                D ]!\  }}|j        |                    |           "| j        dk    r"t          |                                          }	nt          | j                  }	| j        r|nd}
|                                D ]\  }}|                     |          }|	                                r8|j
                            t          | j        | j                            |_
        |                    |          }||j        _        ||	v r7t%          |                    |d|
                    D ]\  }}|||         |<   t          |          D ]}|                                ||         |<    |S )a  Configure how Train datasets should be assigned to workers.

        Args:
            datasets: The datasets dict passed to Train by the user.
            world_size: The number of Train workers in total.
            worker_handles: The actor handles of the Train workers.
            worker_node_ids: The node ids of the Train workers.
            kwargs: Forwards compatibility placeholder.

        Returns:
            A list of dataset splits for each worker. The size of the list must be
            equal to `world_size`. Each element of the list contains the assigned
            `DataIterator` instances by name for the worker.
        c                     g | ]}i S  r?   ).0_s     r   
<listcomp>z(DataConfig.configure.<locals>.<listcomp>h   s    000"000r!   Nr   )cpugpuT)equallocality_hints)rangeitemsnameset_namer$   setkeysr,   r8   is_resource_limits_defaultexclude_resourcesaddr   r-   r.   r   contextr   	enumeratestreaming_splititerator)r/   r9   r:   r;   r<   kwargsoutputr5   datasetr   rF   rI   dsr   isplits                   r   	configurezDataConfig.configureQ   s   . 10eJ//000%-^^%5%5 	/ 	/!L'|#  ..."e++ #HMMOO 4 4 #D$; < <,0,GQT (( 	4 	4HD" $ ; ;D A A ;;== 	 &7;;* $ 4$:N    "3 B+<BJ(((( )&&"$~ '  ! ! , ,HAu
 ',F1IdOO, z** 4 4A&(kkmmF1IdOO4 r!   c                      t           j        j                                        } t	          | j        j        | j        j        | j        j        | j        j	        | j        j
                  S )zThe default Ray Data options used for data ingest.

        By default, configurations are carried over from what is already set
        in DataContext.
        )locality_with_outputresource_limitsrN   preserve_orderverbose_progress)raydataDataContextget_currentr   r   r\   r]   rN   r^   r_   )ctxs    r   r(   z!DataConfig.default_ingest_options   s`     h"..00 "%!6!K1A!3E0? 2C
 
 
 	
r!   )r   NT)r'   
__module____qualname____doc__r   r   r   strr   r   r   boolr0   floatr4   r8   r   r   intr	   r   r
   rZ   staticmethodr(   r?   r!   r   r   r      s         ?D &*-# -# c!:;-# $"D.>)>$??@
-#  $-# -# -# -#^. .u . . . .D3 D;K D D D D >sG|$> > !k!23	>
 "$y/2> 
d3$%	&> > > \>@ 
$4 
 
 
 \
 
 
r!   r   )r   collectionsr   typingr   r   r   r   r   r`   	ray.actorr	   ray.datar
   r   r   r   9ray.data._internal.execution.interfaces.execution_optionsr   ray.util.annotationsr   r   r   r?   r!   r   <module>rs      s	    # # # # # # 7 7 7 7 7 7 7 7 7 7 7 7 7 7 



 ! ! ! ! ! ! G G G G G G G G G G G G X X X X X X 8 8 8 8 8 8 8 8 XU
 U
 U
 U
 U
 U
 U
 U
 U
 U
r!   