
    &`iw                     T   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZ d dlZd dlmc mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z: d dl;m<Z<m=Z=  ed          Z> e j?        e@          ZA G d deB          ZC G d deB          ZDe G d d                      ZE G d d          ZF G d deB          ZG G d d           ZHd!eIfd"ZJdS )#    N)defaultdict)	dataclass)AnyCallableDictListOptionalTupleTypeTypeVar)HIP_VISIBLE_DEVICES_ENV_VAR)NEURON_RT_VISIBLE_CORES_ENV_VAR)!ASCEND_RT_VISIBLE_DEVICES_ENV_VAR)CUDA_VISIBLE_DEVICES_ENV_VAR)env_integer)Dataset)RayActorError)
Checkpoint
DataConfig)	TrialInfo_TrainingResultget_sessioninit_sessionshutdown_session)StorageContext)check_for_failure)WorkerGroup)BackendConfig)&ENABLE_DETAILED_AUTOFILLED_METRICS_ENV%ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV$ENABLE_SHARE_HIP_VISIBLE_DEVICES_ENV)ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV'ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENVRAY_TRAIN_ENABLE_STATE_TRACKINGTRAIN_ENABLE_WORKER_SPREAD_ENV#TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV)get_current_placement_groupremove_placement_groupTc                       e Zd ZdZdS )TrainBackendErrorz?Errors with BackendExecutor that should not be exposed to user.N__name__
__module____qualname____doc__     x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/train/_internal/backend_executor.pyr+   r+   .   s        IIIIr2   r+   c                       e Zd ZdZdS )TrainingWorkerErrorz)Raised if a worker fails during training.Nr,   r1   r2   r3   r5   r5   2   s        3333r2   r5   c                   2    e Zd ZU dZeed<   eed<   eed<   dS )ResourceConfiga  
    Resource configuration for resource_ids to share between workers.

    Args:
        resource_name: The name of the resource to configure
         (Example: "neuron_cores" or "gpu").
        resource_enable_sharing_env_var: The environment variable to
         check if the resource should be shared.
        share_resource_ids_env_var: The environment variable to configure for
         sharing the resources with other workers.
    resource_nameresource_enable_sharing_env_varshare_resource_ids_env_varN)r-   r.   r/   r0   str__annotations__r1   r2   r3   r7   r7   6   sB         
 
 %(((( ######r2   r7   c                       e Zd ZdZ	 	 	 	 d2dedee         dedeee	e
f                  d	ef
d
Z	 	 	 	 d3deeg df                  dee         dee         dee         fdZd Zd Zde	de	fdZde	de	fdZdee         fdZ	 d4deg ef         dee	ef         dee	ef         dededee         ddfd Zdeee                  fd!Zd" Zd# Z 	 	 	 d5d%e!d&ee         d'ee	         fd(Z"d) Z#d6d+e!fd,Z$d- Z%d. Z&d/ Z'd0 Z(d1 Z)dS )7BackendExecutora  Main execution class for training backends.

    This class holds a worker group and is responsible for executing the
    training function on the workers, and collecting intermediate results
    from ``session.report()``.

    Args:
        backend_config: The configurations for this
            specific backend.
        num_workers: Number of workers to use for training.
        resources_per_worker (Optional[Dict[str, float]]):
            Dictionary specifying the resources that will be
            requested for each worker. Defaults to {"CPU": 1}.
        max_retries: Number of retries when Ray actors fail.
            Defaults to 3. Set to -1 for unlimited retries.
    N      backend_config
trial_infonum_workersresources_per_workermax_retriesc                    |
ddi| _         n|                                | _         || _        |                                | _        || _        || _        | j        dk     rt          d          | _        d| _        d | _	        d | _
        d | _        || _        t                      | _        d | _        t!          t"          j        t&          t(                    t!          t"          j        t,          t.                    t!          t"          j        t2          t4                    g| _        t9          t;          j                    dz            | _        t?          t@          d          | _!        d S )NCPUr?   r   inf  )"_resources_per_workercopy_backend_configbackend_cls_backend_num_workers_max_failuresfloat_num_failures_last_failure_initialization_hook_placement_group_trial_infoInactiveWorkerGroupworker_groupdataset_shardsr7   ray_constantsNEURON_CORESr"   r   NPUr#   r   GPUr!   r   _resource_configsinttime_start_time_msr   r$   state_tracking_enabled)selfrA   rB   rC   rD   rE   s         r3   __init__zBackendExecutor.__init__[   sD     '*/D&&)=)B)B)D)DD&-&2244'(!!!&uD!$(! $%/11" *9/ 
 !71  !4+ "
* "$)++"455&12QST&U&U###r2   initialization_hook	train_clstrain_cls_argstrain_cls_kwargsc                    |                                   | j        pd}t          | j        | j        ||||          | _        | j        r| j        j        nd}| j                            |           	 |r!|| _	        | j        
                    |           ddlm dffd}| j        
                    |                                           t          t          t           | j        j                            }| j                            dd          dk    r|r|                                  | j        D ]B}	|                     |	j        |	j                  r |                     |	j        |	j                   C| j                            | j        | j                   n# t:          $ rs}
t<                              tA          |
                     t<          !                    d	           | "                                 | #                                 Y d}
~
nd}
~
ww xY w| j$        r'dd
l%m&} ddl'm(}  | |                      | _)        dS dS )zStarts the worker group.default)rC   rD   	actor_clsactor_cls_argsactor_cls_kwargsplacement_groupNr   )DataContextctxc                 2                         |            d S N)_set_current)rp   ro   s    r3   _set_driver_dataset_contextz:BackendExecutor.start.<locals>._set_driver_dataset_context   s    ((-----r2   r]   zXFailure occurred during startup. Restarting all workers and attempting to startup again.)TrainRunStateManager)get_state_actor)state_actor)*_create_placement_grouprU   r   rO   rJ   rX   rV   driver_node_id"sort_workers_by_node_id_and_gpu_idrT   executeray.dataro   get_currentboolr   r    rN   share_cuda_visible_devicesget_share_cuda_visible_devicesr^   _is_share_resources_enabledr8   r9   _share_resource_idsr:   on_startrL   r   logger	exceptionr;   warning_increment_failures_restartrb   ray.train._internal.stateru   %ray.train._internal.state.state_actorrv   state_manager)rc   re   rf   rg   rh   rn   trial_driver_node_idrt   "share_cuda_visible_devices_enabledresource_configexcru   rv   ro   s                @r3   startzBackendExecutor.start   s    	$$&&&/<9')!%!;)-+
 
 
" 04/?ID++T 	 	<<=QRRR-	" ?,?)!))*=>>> -,,,,,. . . . . . . %%+''))  
 269M< 2 2. *..ua881<<6 = 00222#'#9  33#1#C   ,,'5'B   M""4#4d6JKKKK 	 	 	SXX&&&NN/   $$&&&MMOOOOOOOO	 & 	UFFFFFFMMMMMM!5!5//BSBS!T!T!TD		U 	Us   3D.F" "
H,A)HHc                     t                      }t          j        j        j        }|j        }|du p| }|r# fdt           j                  D             }t          t          t          d                    }|rdnd}t          j                            ||          }t                              d           t          t          d          }	t          j        |                                g|		          \  }
}|
rt                              d
           n9t%          d                    t          j                    |j                            | _        dS dS )a  Creates a placement group if it does not exist.

        If a placement group is already detected (Tune) this will be a no-op.

        By default the placement group will be created with PACK strategy.
        This is optimized for colocating GPUs on a minimal number of nodes.
        This behavior can be overridden to use the SPREAD strategy by defining
        ``TRAIN_ENABLE_WORKER_SPREAD_ENV``

        If a placement group is created it will be stored as
        self._placement_group.
        Nc                 B    g | ]}j                                         S r1   )rJ   rK   ).0_rc   s     r3   
<listcomp>z;BackendExecutor._create_placement_group.<locals>.<listcomp>   s5       67*//11  r2   r   SPREADPACK)strategyz%Waiting for placement group to start.d   )timeoutzPlacement group has started.a  Placement group creation timed out. Make sure your cluster either has enough resources or use an autoscaling cluster. If you are running on a cluster, make sure you specify an address in `ray.init()`, for example, `ray.init("auto")`. You can also increase the timeout by setting the TRAIN_PLACEMENT_GROUP_TIMEOUT_S environment variable. Current resources available: {}, resources requested by the placement group: {})r'   ray_privateworkerglobal_worker-should_capture_child_tasks_in_placement_grouprangerO   r~   r   r%   utilrn   r   debugr&   waitreadyTimeoutErrorformatavailable_resourcesbundle_specsrU   )rc   current_placement_groupr   r   should_create_placement_groupbundles
use_spreadr   rn   r   r   r   s   `           r3   rx   z'BackendExecutor._create_placement_group   s~    #>"?"?$2@ 	6 $t+ A@@ 	&
 ) 	4   ;@AR;S;S  G k*H!LLMMJ#-9xx6H!h66w6RROLL@AAA!"EsKKGx!6!6!8!8 97KKKHE1 ;<<<<"* +1&/11?3O+ +   %4D!!!7	4 	4r2   c                 P    |                      t          j        t                     dS )aF  Sets CUDA_VISIBLE_DEVICES on all workers.

        For each worker, CUDA_VISIBLE_DEVICES will be set to the GPU IDs
        visible to all workers on that worker's node.

        This allows GPU workers on the same node to communicate with one
        another.

        Example:

            Setup:
            - Node1:
                - Worker1: {0, 1}
                - Worker2: {2, 3}
            - Node2:
                - Worker3: {0, 1}

            CUDA_VISIBLE_DEVICES:
            - Worker1: "0,1,2,3"
            - Worker2: "0,1,2,3"
            - Worker3: "0,1"

        N)r   rZ   r]   r   rc   s    r3   r   z+BackendExecutor._share_cuda_visible_devices  s$    0 	  !24PQQQQQr2   resourceenv_varc                 N   fd| j         j        D             }t          t                    }t          t                    }t	          |          D ]>\  }\  }}||                             |           ||                             |           ?g }	|                                D ]h\  }}t          |          }d	                    |          fd}
||         D ]0}|	
                    | j                             ||
                     1it          j        |	           dS )a  Sets the given env_var on all workers.

        For each worker, the cores/devices are visible to all the
        workers on that worker's node.This allows workers on the
        same node to communicate with one another.

        Example:

            Setup:
            - Node1:
                - Worker1: {0, 1}
                - Worker2: {2, 3}
            - Node2:
                - Worker3: {0, 1}

            NEURON_RT_VISIBLE_CORES/TPU_VISIBLE_CHIPS/...:
            - Worker1: "0,1,2,3"
            - Worker2: "0,1,2,3"
            - Worker2: "0,1"

        Args:
            resource: The name of the resource/accelerator.
            env_var: The name of the environment variable to set.
        c                 L    g | ] }|j         j        |j         j                 f!S r1   )metadatanode_idresource_ids)r   wr   s     r3   r   z7BackendExecutor._share_resource_ids.<locals>.<listcomp>M  sB     %
 %
 %

  
"
'1%
 %
 %
r2   ,c                  &     t           j        <   d S rr   )osenviron)all_resource_idsr   s   r3   set_resource_idsz=BackendExecutor._share_resource_ids.<locals>.set_resource_ids`  s    &6
7###r2   N)rX   workersr   set	enumerateaddupdateitemssortedjoinappendexecute_single_asyncr   r   )rc   r   r   node_ids_and_resource_idsnode_id_to_worker_idnode_id_to_resource_ids	worker_idr   r   futuresr   r   s    ``        @r3   r   z#BackendExecutor._share_resource_ids4  st   2%
 %
 %
 %

 &.%
 %
 %
!  +3//"-c"2"22;<U2V2V 	B 	B.I. )--i888#G,33LAAAA%<%B%B%D%D 
	 
	!G\!,//L"xx557 7 7 7 7 7 2':  	%::9FVWW    	r2   r8   enable_sharing_envc                 n    | j                             |d          dk    }|ot          j        |d          S )a  Whether to share resource IDs on all workers
        based on enable_sharing_env.

        This will return true if resources are requested and greater than 0.
        Also, user can disable by configuring the `enable_sharing_env` to "0".

        Args:
            resource_name: The name of the resource/accelerator.
            enable_sharing_env: The name of the environment variable
                to check.
        r   T)rJ   r   rZ   env_bool)rc   r8   r   has_resource_requesteds       r3   r   z+BackendExecutor._is_share_resources_enabledi  sD     "&!;!?!?q!Q!QTU!U% 
-*@+
 +
 	
r2   returnc                 ~  	
 i 	i }i 
i }d}t          t                    }t          t          | j                            D ]T}| j        j        |         }|j        j        }||         	|<   ||xx         dz  cc<   ||vr
|||<   |dz  }||         
|<   Ut          t          | j                            D ]+}| j        j        |         }|j        j        }||         ||<   ,d                    	
fdt          | j        j                  D                       }t                              d|            	|
fS )ao  Create rank and world size mappings for workers.
        There are three maps returned:
            - local_rank_map, which maps from worker world_rank to local_rank.
            - local_world_size_map, which maps from world_rank to local_world_size
            - node_rank_map, which maps from world rank to node rank

        Example:
            Worker 0: node 0
            Worker 1: node 0
            Worker 2: node 1
            Worker 3: node 0
            Worker 4: node 1

            Workers 0, 1, 3 are on node 0.
            Workers 2, 4 are on node 1.

            Expected local_rank_map:
            {
                0 -> 0,
                1 -> 1,
                2 -> 0,
                3 -> 2,
                4 -> 1
            }

            Expected local_world_size_map:
            {
                0 -> 3,
                1 -> 3,
                2 -> 2,
                3 -> 3,
                4 -> 2
            }

            Expected node_rank_map:
            {
                0 -> 0,
                1 -> 0,
                2 -> 1,
                3 -> 0,
                4 -> 1
            }

        r   r?   
c                     g | ]C\  }}d |j         j         d|j         j         d|j         j         d| d|          d|          DS )z- (node_id=z, ip=z, pid=z) world_rank=z, local_rank=z, node_rank=)r   r   node_ippid)r   ir   local_rank_mapnode_rank_maps      r3   r   zDBackendExecutor._create_rank_world_size_mappings.<locals>.<listcomp>  s        AqPaj0 P Pqz7I P Pz~P P45P P,Q/P P=J1=MP P  r2   z'Started distributed worker processes: 
)r   r_   r   lenrX   r   r   r   r   r   r   info)rc   local_world_size_mapnode_idsnode_cntnode_id_dict
world_rankr   r   workers_infor   r   s            @@r3    _create_rank_world_size_mappingsz0BackendExecutor._create_rank_world_size_mappingsz  s   Z !"
 
  D$5 6 677 		: 		:J&.z:Fo-G)5g)>N:&!!!Q&!!!h&&$,!A(0(9M*%%D$5 6 677 	E 	EJ&.z:Fo-G/;G/D ,,yy     &d&7&?@@	  
 
 	M|MMNNN3]BBr2   
train_funcdatasetsr   data_configstorage
checkpointc                 N   t          t          d          fd}| j        \d | j        j        D             }d | j        j        D             }	|                    |t          | j                  ||	          | _        |                                 \  }
}}g }t          t          | j                            D ]p}|	                    | j        
                    ||||
|         ||         ||         t          | j                  | j        || j        |         |||                     q| j                            | j        | j                   |                     |           | j        rddlm} t&          j                                        }| j                            | j        j        | j        j        |                                |                                || j        | j        |j        | j        g| j        z  		  	         d
 }| j                             |           dS )aW  Executes a training function on all workers in a separate thread.

        ``finish_training`` should be called after this.

        Args:
            train_func: The training function to run on each worker.
            datasets: The base datasets.
            data_config: The config object for creating dataset shards for workers.
            checkpoint: The checkpoint data that
                should be loaded onto each worker and accessed by the
                training function via ``session.get_checkpoint()``. If this
                is ``None`` then no checkpoint will be loaded.
        r   c                 z    	 t          | ||||||||	||
           d S # t          $ r t          d          w xY w)N)training_funcr   
local_rank	node_ranklocal_world_size
world_sizerB   dataset_shardr   r   detailed_autofilled_metricsr   zAttempting to start training but a previous training run is still ongoing. You must call `finish_training` before calling `start_training` again.)r   
ValueErrorr+   )r   r   r   r   r   r   rB   r   r   r   r   use_detailed_autofilled_metricss              r3   initialize_sessionz:BackendExecutor.start_training.<locals>.initialize_session  s    ",))'%5))"/%)0O#         '6  s     :Nc                     g | ]	}|j         
S r1   )actorr   r   s     r3   r   z2BackendExecutor.start_training.<locals>.<listcomp>  s    KKKvflKKKr2   c                 &    g | ]}|j         j        S r1   )r   r   r   s     r3   r   z2BackendExecutor.start_training.<locals>.<listcomp>  s    XXXF/XXXr2   )r   worker_handlesworker_node_ids)r   r   r   r   r   rB   r   r   r   r   r   )RunStatusEnum)	run_idrun_namejob_idcontroller_actor_idr   rX   start_time_ms
run_status	resourcesc                  J    t                      } |                                  d S rr   )r   r   sessions    r3   train_asyncz3BackendExecutor.start_training.<locals>.train_asyncF  s    !mmGMMOOOOOr2   )!r   r   rY   rX   r   	configurer   r   r   r   r   rV   rN   on_training_startrL   get_with_failure_handlingrb    ray.train._internal.state.schemar   r   runtime_contextget_runtime_contextr   register_train_runr   experiment_name
get_job_idget_actor_idra   RUNNINGrJ   rO   execute_async)rc   r   r   r   r   r   r   r   actorsr   r   r   r   r   indexr   core_contextr  r   s                     @r3   start_trainingzBackendExecutor.start_training  sz   , +62A+
 +
'
"	 "	 "	 "	 "	H &KK1B1JKKKFXXd>O>WXXXH"-"7"7t011% (	 #8 # #D 1133		
  3t01122 	 	ENN!66&$-e4+E2%9%%@"4#455#/)"&"5e"<%)# 7     $ 	''(94;OPPP&&w/// & 	FFFFFF.BBDDL11'.)9#..00$0$=$=$?$?!!."1(0569JJ 2 
 
 
	 	 	 	''44444r2   c                     d }| j                             |          }|                     |          }t          d |D                       r*t	          d |D                       st          d          dS |S )az  Fetches the next ``_TrainingResult`` from each worker.

        Each ``_TrainingResult`` is expected to correspond to the same step from
        each worker (e.g. the same call to ``train.report()``).

        Returns:
            A list of ``_TrainingResult``s or ``None`` if there are no more results
            since the training function has exited on all workers.
        c                      t          d          } 	 |                                 }n# t          $ r t          d          w xY w|S )Nget_next_resultszs`get_next_results` has been called before `start_training`. Please call `start_training` before `get_next_results`.)_get_sessionget_nextRuntimeErrorr+   )r  results     r3   r  z2BackendExecutor.get_next_results.<locals>.get_nextW  sa    "#566G	 ))++   '*   Ms	   & A c              3      K   | ]}|d u V  	d S rr   r1   r   rs     r3   	<genexpr>z3BackendExecutor.get_next_results.<locals>.<genexpr>k  s&      **QqDy******r2   c              3      K   | ]}|d u V  	d S rr   r1   r  s     r3   r   z3BackendExecutor.get_next_results.<locals>.<genexpr>m  s&      22QqDy222222r2   zSome workers returned results while others didn't. Make sure that `session.report()` are called the same number of times on all workers.N)rX   r  r  anyallr  )rc   r  r   resultss       r3   r  z BackendExecutor.get_next_resultsL  s    	 	 	  #11(;;0099 **'***** 	22'22222 	";   tr2   c                 j    d }| j                             |          }|                     |           dS )zDisable workers from enqueuing results from ``session.report()``.

        Note: Already reported results may still be enqueued at this point,
              and should be handled appropriately.
        c                  H    t          d          } |                                 S )Npause_reporting)r  r'  r  s    r3   pause_session_reportingz@BackendExecutor.pause_reporting.<locals>.pause_session_reporting  s"    "#455G**,,,r2   NrX   r  r  )rc   r(  r   s      r3   r'  zBackendExecutor.pause_reportingz  sD    	- 	- 	- #112IJJ&&w/////r2   c                 j    d }| j                             |          }|                     |          }|S )at  Finish training and return final results. Propagate any exceptions.

        Blocks until training is finished on all workers.

        Assumes `start_training` has already been called.

        Returns:
            A list of return values from calling ``train_func`` on each worker.
                Each item corresponds to the return value from a single worker.
        c                      t          d          } 	 |                                 }t                       n# t                       w xY w|S )Nfinish_training)r  finishr   )r  outputs     r3   end_trainingz5BackendExecutor.finish_training.<locals>.end_training  sM    "#455G# )) !"""" """"Ms	   4 Ar)  )rc   r/  r   r$  s       r3   r,  zBackendExecutor.finish_training  sA    
	 
	 
	 #11,??0099r2   Ferroredfailed_rankstack_tracec           	         | j         rddlm}m} |r%|j        }d}|	|d| dz  }|||| d         z  }n	|j        }d}| j                            | j        j	        ||t          t          j                    dz                       dS dS )	zJReport the final train run status, error, and end time to TrainStateActor.r   )MAX_ERROR_STACK_TRACE_LENGTHr    NzRank z worker raised an error. 
rI   )r   r   status_detailend_time_ms)rb   r	  r4  r   ERROREDFINISHEDr   end_train_runrV   r   r_   r`   )rc   r0  r1  r2  r4  r   r   r6  s           r3   report_final_run_statusz'BackendExecutor.report_final_run_status  s     & 	       
  
#*2
 "*!%U[%U%U%UUM*![2N1N1O1O%PPM*3
 ",,'.%+	d 233	 -     %	 	r2   c                     t          |          \  }}|rt          j        |          S || _        |                                  t
                              d           |                                  t          )a  Gets the remote values while handling for worker failures.

        This method should be called instead of ``ray.get()`` directly in
        order to handle worker failures.

        If a worker failure is identified, backend specific failure handling
        is executed and a ``TrainingWorkerError`` is raised.

        Args:
            remote_values: List of object refs representing functions
                that may fail in the middle of execution. For example, running
                a Train training loop in multiple parallel actor calls.
        Returns:
            The resolved objects represented by the passed in ObjectRefs.
        zjFailure identified during training. Restarting all workers and continuing training from latest checkpoint.)	r   r   r   rS   r   r   r   r   r5   )rc   remote_valuessuccessr   s       r3   r  z)BackendExecutor.get_with_failure_handling  sx      /}== 
	&7=)))!*D$$&&&NN>   MMOOO%%r2   Tgraceful_terminationc                    |rQ	 | j                             | j        | j                   n*# t          $ r t
                              d           Y nw xY w|r| j                                         n| j                            d           t                      | _        | j	        rt          | j	                   d| _	        d| _        dS )zShuts down the workers in the worker group.

        Args:
            graceful_termination: If set to True, attempt to clean up the backend
                before terminating the Ray actors.

        zXGraceful shutdown of backend failed. This is expected if one of the workers has crashed.r   )
patience_sN)rN   on_shutdownrX   rL   r   r   r   shutdownrW   rU   r(   rY   )rc   r?  s     r3   rC  zBackendExecutor.shutdown  s       	))$*;T=QRRRR    B       	5&&((((&&!&444/11  	)"4#8999$(D!"s   %* $AAc                 8    t          | j        t                     S rr   )
isinstancerX   rW   r   s    r3   
is_startedzBackendExecutor.is_started  s    d/1DEEEEr2   c                     | j                                          | j        | j        }nd }| j        rt	          | j                   d | _        |                     |           d S )N)re   )rX   rC  rT   rU   r(   r   )rc   re   s     r3   r   zBackendExecutor._restart  sq    ""$$$$0"&";"&  	)"4#8999$(D!

':
;;;;;r2   c                     | xj         dz  c_         | j         | j        k    rI| j        }d | _        | j        dk    r.t          d| j          d          }|                    d           ||d S )Nr?   r   zTraining has failed after z
 attempts.)rR   rP   rS   r  with_traceback)rc   failurer   s      r3   r   z#BackendExecutor._increment_failures  s    a!333(G!%D!A%%"O1COOO  ((..G; 43r2   c                     | j         S rr   )rX   r   s    r3   get_worker_groupz BackendExecutor.get_worker_group  s      r2   c                     | j         S rr   )rR   r   s    r3   _get_num_failuresz!BackendExecutor._get_num_failures  s    !!r2   )Nr?   Nr@   )NNNNrr   )FNN)T)*r-   r.   r/   r0   r   r	   r   r_   r   r;   rQ   rd   r   r   r
   r   rx   r   r   r   r   r   r)   r   r   r   r   r   r  r   r  r'  r,  r~   r;  r  rC  rF  r   r   rL  rN  r1   r2   r3   r>   r>   I   s7        * +/;?5V 5V%5V Y'	5V
 5V 'tCJ'785V 5V 5V 5V 5Vr =A$(*.+/RU RU%hr4x&89RU D>RU !	RU
 #4.RU RU RU RUh24 24 24hR R R43C 3# 3 3 3 3j
 
RU 
 
 
 
"PC$t* PC PC PC PCr ,0~5 ~5RUO~5 sG|$~5 sCx.	~5
  ~5  ~5 Z(~5 
~5 ~5 ~5 ~5@,(4+@"A , , , ,\0 0 0  < %)%)	  c] c]	   @& & &:# #T # # # #:F F F	< 	< 	<  ! ! !" " " " "r2   r>   c                       e Zd ZdZdS )InactiveWorkerGroupErrorz0Raised when underlying worker group is inactive.Nr,   r1   r2   r3   rP  rP     s        ::::r2   rP  c                   &    e Zd Zd Zd Zd Zd ZdS )rW   c                      t          |           S rr   )varsr   s    r3   __getstate__z InactiveWorkerGroup.__getstate__)  s    Dzzr2   c                 J    t          |                               |           d S rr   )rS  r   )rc   states     r3   __setstate__z InactiveWorkerGroup.__setstate__,  s"    T

%     r2   c                     t                      rr   rP  )rc   names     r3   __getattr__zInactiveWorkerGroup.__getattr__/      &(((r2   c                     t                      rr   rY  r   s    r3   __len__zInactiveWorkerGroup.__len__2  r\  r2   N)r-   r.   r/   rT  rW  r[  r^  r1   r2   r3   rW   rW   $  sP        
  ! ! !) ) )) ) ) ) )r2   rW   method_namec                 R    t                      }|st          d|  d|  d          |S )N`zP` has been called before `start_training`. Please call `start_training` before `z`.)r   r+   )r_  r  s     r3   r  r  6  sQ    mmG 
           
 
 	
 Nr2   )Kloggingr   r`   collectionsr   dataclassesr   typingr   r   r   r   r	   r
   r   r   r   ray._private.ray_constantsr   rZ   !ray._private.accelerators.amd_gpur    ray._private.accelerators.neuronr   ray._private.accelerators.npur   $ray._private.accelerators.nvidia_gpur   r   r|   r   ray.exceptionsr   	ray.trainr   r   ray.train._internal.sessionr   r   r   r   r   ray.train._internal.storager   ray.train._internal.utilsr    ray.train._internal.worker_groupr   ray.train.backendr   ray.train.constantsr   r    r!   r"   r#   r$   r%   r&   ray.util.placement_groupr'   r(   r)   	getLoggerr-   r   	Exceptionr+   r5   r7   r>   rP  rW   r;   r  r1   r2   r3   <module>rv     s    				  # # # # # # ! ! ! ! ! ! L L L L L L L L L L L L L L L L L L L L 



 2 2 2 2 2 2 2 2 2 I I I I I I L L L L L L K K K K K K M M M M M M 2 2 2 2 2 2       ( ( ( ( ( ( , , , , , , , ,              7 6 6 6 6 6 7 7 7 7 7 7 8 8 8 8 8 8 + + + + + +	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 Y X X X X X X XGCLL		8	$	$J J J J J	 J J J4 4 4 4 4) 4 4 4 $ $ $ $ $ $ $ $$T" T" T" T" T" T" T" T"n; ; ; ; ;y ; ; ;) ) ) ) ) ) ) )$c      r2   