
    &`it                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
mZmZmZ d dlZd dlmc mZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lm Z  d d
l!m"Z"m#Z#m$Z$m%Z% d dl&m'Z'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>  ej?        e@          ZAdeBfdZC G d d          ZDdS )    N)AnyAsyncIteratorDictOptionalUnion)Timerrun_background_task)'NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR)"NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR)get_event_logger)	GcsClient)ActorHandle)Event)!DEFAULT_JOB_START_TIMEOUT_SECONDS,RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR%RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR4RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG_ENV_VAR)JOB_ACTOR_NAME_TEMPLATESUPERVISOR_ACTOR_RAY_NAMESPACEJobInfoJobInfoStorageClient)JobLogStorageClient)JobSupervisor)get_head_node_id)close_logger_file_descriptor)ActorDiedErrorActorUnschedulableErrorRuntimeEnvSetupError)JobErrorType	JobStatus)RuntimeEnvConfig)NodeAffinitySchedulingStrategySchedulingStrategyTreturnc                      t          j                    } t          t          t          j        t          j        z             h dz
            }d                    |                     |d                    }d| S )zReturns a job_id of the form 'raysubmit_XYZ'.

    Prefixed with 'raysubmit' to avoid confusion with Ray JobID (driver ID).
    >   0IOlo    )k
raysubmit_)	randomSystemRandomlistsetstringascii_lettersdigitsjoinchoices)randpossible_charactersid_parts      y/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/dashboard/modules/job/job_manager.pygenerate_job_idr<   /   sw    
   DF 6=011
#
#
#	$  ggdll#6"l==>>G!!!!    c                      e Zd ZdZdZdZdZ	 d+dedede	fdZ
d	ed
ej        fdZd Zd	ed
ee         fdZ	 d+d	edee         fdZ	 d+d	edee         fdZd	edee         fdZ	 d,deeef         deded
eeef         fdZded
efdZddddddddddedee         deeeef                  deeeef                  deeeef                  deeeef                  dee         d eeeef                  d!ee         d
efd"Zd
efd#Z d$ Z!d
e"fd%Z#d	ed
ee$         fd&Z%d	ed
ee&         fd'Z'd
eee&f         fd(Z(d	ed
efd)Z)d	ed
e*e         fd*Z+dS )-
JobManagerzProvide python APIs for job submission and management.

    It does not provide persistence, all info will be lost if the cluster
    goes down.
       g?N
gcs_clientlogs_dirtimeout_check_timerc                 >   || _         || _        t          ||          | _        |j        | _        |j                                        | _        t                      | _
        t          j        t                    | _        |pt                      | _        t#                      | _        	 t'          t(          j        j        |          | _        n# t0          $ r
 d | _        Y nw xY wt3          j                    | _        t7          |                                            d S N)_gcs_client	_logs_dirr   _job_info_clientaddress_gcs_address
cluster_idhex_cluster_id_hexr   _log_clientrayremoter   _supervisor_actor_clsr   _timeout_check_timerr2   monitored_jobsr   r   
SourceTypeJOBSevent_logger	Exceptionasyncio_recover_running_jobs_eventr	   _recover_running_jobs)selfrA   rB   rC   s       r;   __init__zJobManager.__init__J   s     &! 4Z J J&.)488::.00%(Z%>%>"$7$B577!!ee	% 01A1F Q QD 	% 	% 	% $D	% ,3=??(D668899999s   ($C C! C!job_idr$   c                 <   t          j        t           d|           }|j        sv| j                            |          }t          j        |          }t          j        t          j	                  }|
                    |           |                    |           |S )zReturn job driver logger to log messages to the job driver log file.

        If this function is called for the first time, configure the logger.
        z.driver-)logging	getLogger__name__handlersrN   get_log_file_pathFileHandler	Formatterray_constantsLOGGER_FORMATsetFormatter
addHandler)r[   r]   job_driver_loggerjob_driver_log_pathjob_driver_handlerjob_driver_formatters         r;   _get_job_driver_loggerz!JobManager._get_job_driver_logger^   s    
 $-.K.K6.K.KLL !) 	="&"2"D"DV"L"L!(!45H!I!I#*#4]5P#Q#Q ++,@AAA(();<<<  r=   c                 d  K   	 | j                                          d{V }|                                D ]@\  }}|j                                        s"t          |                     |                     A	 | j                                         dS # | j                                         w xY w)zRecovers all running jobs from the status client.

        For each job, we will spawn a coroutine to monitor it.
        Each will be added to self._running_jobs and reconciled.
        N)	rH   get_all_jobsitemsstatusis_terminalr	   _monitor_jobrY   r2   )r[   all_jobsr]   job_infos       r;   rZ   z JobManager._recover_running_jobso   s      		3!2??AAAAAAAAH$,NN$4$4 C C 2244 C'(9(9&(A(ABBBC ,0022222D,002222s   A4B B/c                     	 t          j        t          j        |          t                    S # t
          $ r Y d S w xY w)Nr]   )	namespace)rO   	get_actorr   formatr   
ValueErrorr[   r]   s     r;   _get_actor_for_jobzJobManager._get_actor_for_job   sX    	='.f===8     	 	 	44	s   -0 
>>job_supervisorc                   K   || j         v r t                              d| d           dS | j                             |           	 |                     ||           d{V  n0# t
          $ r#}t                              d|           |d}~ww xY w	 | j                             |           dS # | j                             |           w xY w)zMonitors the specified job until it enters a terminal state.

        This is necessary because we need to handle the case where the
        JobSupervisor dies unexpectedly.
        zJob z is already being monitored.Nz&Unhandled exception in job monitoring!exc_info)rS   loggerdebugadd_monitor_job_internalrW   errorremove)r[   r]   r   es       r;   rt   zJobManager._monitor_job   s       T(((LLDDDDEEEF'''	/,,V^DDDDDDDDDD 	 	 	LLAALNNNG	 E
 &&v.....D&&v....s*   A$ #B1 $
B.BBB1 1Cc                 ~
  K   t          t          j                            t          t
                              }d }d }d }	 	 t          j        | j                   d {V  | j	        
                    |d            d {V }|t          j        k    r|"| j	                            |d            d {V }| j                                        |j        dz  z
  |k    rd| dt           d}|j        d uo
|j        dk    pH|j        d uo
|j        dk    p4|j        d uo
|j        dk    p |j        d uot+          |j                  dk    }|r|dz  }| j	                            |t          j        |t0          j        d 	           d {V  t4                              |           ne||                     |          }|k|t          j        k    rt4                              d
| d           | j	                            |t          j        dt0          j        d 	           d {V  n|-|j                            d                                           }tC          j"        |gd          \  }	}
|	rtC          j        |           d }nOny# tF          $ rk}| j	        
                    |d            d {V }d}d }||$                                rnDtK          |tL                    r0t4                              d| d           d| }t0          j'        }ntK          |tP                    r2t4                              d| d|            d| }t0          j)        }n{tK          |tT                    r2t4                              d| d|            d| }t0          j+        }n4t4                              d| d| d|           d| }t0          j,        }t          j        }| j	                            ||||pt0          j,        d 	           d {V  |r| j-        .                    |          }t          j/        t          j0        1                    |          d           te          |d          5 }|3                    |           d d d            n# 1 swxY w Y   | j4        rLd| d| d}|r%|d| z  }| j4                            ||            n| j4        5                    ||            Y d }~n
d }~ww xY w||                     |          }|tC          j6        |d!           d S d S )"NT)timeout  z,Job supervisor actor failed to start within zM seconds. This timeout can be configured by setting the environment variable .r   a)   This may be because the job entrypoint's specified resources (entrypoint_num_cpus, entrypoint_num_gpus, entrypoint_resources, entrypoint_memory)aren't available on the cluster. Try checking the cluster's available resources with `ray status` and specifying fewer resources for the job entrypoint.)message
error_typer   z%Failed to get job supervisor for job z8Unexpected error occurred: failed to get job supervisor.)max_task_retriesr+   z%Failed to set up runtime_env for job zruntime_env setup failed: zFailed to schedule job z6 because the supervisor actor could not be scheduled: z-Job supervisor actor could not be scheduled: zJob supervisor actor for z died: zJob supervisor actor died: zJob monitoring for job z failed unexpectedly: r   zUnexpected error occurred: )exist_okazCompleted a ray job z with a status  submission_id)
no_restart)7floatosenvirongetr   r   rX   sleepJOB_MONITOR_LOOP_PERIOD_SrH   
get_statusr    PENDINGget_inforR   time
start_timeentrypoint_num_cpusentrypoint_num_gpusentrypoint_memoryentrypoint_resourceslen
put_statusFAILEDr   "JOB_SUPERVISOR_ACTOR_START_TIMEOUTr   r   r~   "JOB_SUPERVISOR_ACTOR_START_FAILUREpingoptionsrP   rO   waitrW   rs   
isinstancer   RUNTIME_ENV_SETUP_FAILUREr   "JOB_SUPERVISOR_ACTOR_UNSCHEDULABLEr   JOB_SUPERVISOR_ACTOR_DIED$JOB_SUPERVISOR_ACTOR_UNKNOWN_FAILURErN   rc   makedirspathdirnameopenwriterV   infokill)r[   r]   r   r   
job_statusrv   ping_obj_referr_msgresources_specifiedready_r   target_job_error_messagetarget_job_error_typelog_pathlog_file	event_logs                    r;   r   z JobManager._monitor_job_internal   s      JNN51 
 
 
u	t
 mD$BCCCCCCCCC#'#8#C#CD $D $ $      
 !222  ')-)>)G)G"D *H * * $ $ $ $ $ $
 166888;NQU;UU!" "Q&Q Q )NQ Q Q   !) <D H !E$,$@1$D
 !) <D H !E$,$@1$D !) :$ F !C$,$>$B !) =T I !K$'(E$F$F$J ,$ / 	#!2G #3>>"%,$+'3'V$( ?          W---!)%)%<%<V%D%DN!)!Y%666 ! %VV%V%V%VWWW"3>>"%,!@ (4'V$( ? 	 	 	 	 	 	 	 	 	   '#1#6#>#>)+ $? $ $fhh ! 8\NA>>>q GL)))#'LL !  K K K#'#8#C#CD $D $ $      
 ,.(@D%)j.D.D.F.F) !!%9:: "%VV%V%V%VWWW3SPQ3S3S00<0V--#A'>?? Af A A=>A A   PAOO 1 )K .- $A~66 %S%S%SPQ%S%STTT3TQR3T3T00<0V-- 2f 2 2-.2 2 2%& %    4UQR3T3T0(M . "+!1J/::" 8#8 $M'L $ ;          , A#/AA&IIHK 9 9DIIIIh,, A '?@@@A A A A A A A A A A A A A A A $ PSvSSjSSS  0 P!%C)A%C%CC	)//	/PPPP)..y.OOOWKUu	p !!44V<<N%H^555555 &%sS   E#J ')J AJ +A"J TG!T=RTR#	#T&R#	'ATTresultc                     |dS dS )zHandle the result of starting a job supervisor actor.

        If started successfully, result should be None. Otherwise it should be
        an Exception.

        On failure, the job will be marked failed with a relevant error
        message.
        N )r[   r]   r   s      r;   _handle_supervisor_startupz%JobManager._handle_supervisor_startupi  s     >F >r=   Fuser_runtime_envr   r   c                    |t          j        |          ni }|                    d          }|i }d|t          j        <   |sd|t
          <   d|t          <   ||d<   t          j        t          d          dk    rH|                    d          }|t                      }| j                            |          g|d<   ||d<   |S )aP  Configure and return the runtime_env for the supervisor actor.

        Args:
            user_runtime_env: The runtime_env specified by the user.
            resources_specified: Whether the user specified resources in the
                submit_job() call. If so, we will skip the workaround introduced
                in #24546 for GPU detection and just use the user's resource
                requests, so that the behavior matches that of the user specifying
                resources for any other actor.

        Returns:
            The runtime_env for the supervisor actor.
        Nenv_varsr&   1config	log_files)copydeepcopyr   rf   RAY_WORKER_NICENESSr   r
   r   getenvr   r!   rN   rc   )r[   r   r   r   runtime_envr   r   s          r;   _get_supervisor_runtime_envz&JobManager._get_supervisor_runtime_envu  s    * 0@/KDM*+++QS 	 ??:..H6923" 	D <?H78@CH<="*J9I3OOSVVV __X..F~)++#'#3#E#Em#T#T"UF;$*K!r=   c                 z  K   |rdS t           j                            t          d          dk    r$t                              t           d           dS t          | j                   d{V }|t                              d           d}n.t                              d|            t          |d	          }|S )
a  Get the scheduling strategy for the job.

        If resources_specified is true, or if the environment variable is set to
        allow the job to run on worker nodes, we will use Ray's default actor
        placement strategy. Otherwise, we will force the job to use the head node.

        Args:
            resources_specified: Whether the job specified any resources
                (CPUs, GPUs, or custom resources).

        Returns:
            The scheduling strategy to use for the job.
        DEFAULTr&   r   zw was set to 1. Using Ray's default actor scheduling strategy for the job driver instead of running it on the head node.NzHead node ID not found in GCS. Using Ray's default actor scheduling strategy for the job driver instead of running it on the head node.z>Head node ID found in GCS; scheduling job driver on head node F)node_idsoft)	r   r   r   r   r   r   r   rF   r"   )r[   r   head_node_idscheduling_strategys       r;   _get_scheduling_strategyz#JobManager._get_scheduling_strategy  s        	9:>>FLLPSSSKK? A A A  
 9
 .d.>????????KK'  
 #,KK,), ,   #A$5# # # #"r=   )r   r   metadatar   r   r   r   _start_signal_actor
entrypointr   r   r   r   r   r   r   c       	           K   |d}|d}|d}|t                      }| j                                         d{V  t                              d|            t          |t          j        t          t          j	                    dz            ||||||	  	        }
| j
                            ||
d           d{V }|st          d| d	          |                     |          }	 t          |duo|dk    |duo|dk    |duo|dk    |di fvg          }|                     |           d{V }| j        r | j                            d
| d|           |                    d           | j                            dt'          j        |          ||||||                     |||          t,          d
  
                            |||pi | j        | j        | j                  }|j                            |	|           t9          |                     ||                     n# t<          $ r|}t?          j                     }|!                    d| d| d|            | j
        "                    |t          j#        d| d| d| tH          j%                   d{V  Y d}~nd}~ww xY wtM          |           n# tM          |           w xY w|S )a^  
        Job execution happens asynchronously.

        1) Generate a new unique id for this job submission, each call of this
            method assumes they're independent submission with its own new
            ID, job supervisor actor, and child process.
        2) Create new detached actor with same runtime_env as job spec

        Actual setting up runtime_env, subprocess group, driver command
        execution, subprocess cleaning up and running status update to GCS
        is all handled by job supervisor actor.

        Args:
            entrypoint: Driver command to execute in subprocess shell.
                Represents the entrypoint to start user application.
            runtime_env: Runtime environment used to execute driver command,
                which could contain its own ray.init() to configure runtime
                env at ray cluster, task and actor level.
            metadata: Support passing arbitrary data to driver command in
                case needed.
            entrypoint_num_cpus: The quantity of CPU cores to reserve for the execution
                of the entrypoint command, separately from any tasks or actors launched
                by it. Defaults to 0.
            entrypoint_num_gpus: The quantity of GPUs to reserve for
                the entrypoint command, separately from any tasks or actors launched
                by it. Defaults to 0.
            entrypoint_memory: The amount of total available memory for workers
                requesting memory the entrypoint command, separately from any tasks
                or actors launched by it. Defaults to 0.
            entrypoint_resources: The quantity of various custom resources
                to reserve for the entrypoint command, separately from any tasks or
                actors launched by it.
            _start_signal_actor: Used in testing only to capture state
                transitions between PENDING -> RUNNING. Regular user shouldn't
                need this.

        Returns:
            job_id: Generated uuid for further job management. Only valid
                within the same ray cluster.
        Nr   z!Starting job with submission_id: r   )	r   rr   r   r   r   r   r   r   r   F)	overwritezJob with submission_id z6 already exists. Please use a different submission_id.zStarted a ray job r   r   zRuntime env is setting up.detachedrx   )
lifetimenamenum_cpusnum_gpusmemory	resourcesr   r   ry   enable_task_events)r   r   )r   z)Failed to start supervisor actor for job z: 'z'. Full traceback:
z!Failed to start supervisor actor )r   r   )'r<   rY   r   r   r   r   r    r   intr   rH   put_infor|   rn   anyr   rV   rQ   r   r   r{   r   r   rP   rJ   rM   rG   runr	   rt   rW   	traceback
format_excwarningr   r   r   r   r   )r[   r   r   r   r   r   r   r   r   r   rv   new_key_addeddriver_loggerr   r   
supervisorr   tb_strs                     r;   
submit_jobzJobManager.submit_job  s_     j &"#&"#$ ! +--M .33555555555GGGHHH!$49;;-..# 3 3/!5

 

 

 #3<<8u = 
 
 
 
 
 
 
 
  	8- 8 8 8  
 33MBBC	8"%'t3O8Ka8O't3O8Ka8O%T1K6G!6K(r
:	# # )-(E(E#) ) # # # # # #   !&&9999 '    ;<<<3;;#,3=III,,(.$7 <<0C  9 $) <   fB!$  . N!!$7$7 "     !!-
!KK     	 	 	)++F!!/M / /a / /&,/ /   '22 3 3 3! 3 3*03 3 (J 3              	  )7777(7777s2   ,E H- ,K -
J37A2J.)K .J33K Kc                 j    |                      |          }||j                                         dS dS )zeRequest a job to exit, fire and forget.

        Returns whether or not the job was running.
        NTF)r~   stoprP   )r[   r]   job_supervisor_actors      r;   stop_jobzJobManager.stop_joby  s?    
  $66v>>+ !%,,...45r=   c                    K   | j                             |           d{V }||                                st          d| d| d          | j                             |           d{V  dS )z2Delete a job's info and metadata from the cluster.NzAttempted to delete job 'z%', but it is in a non-terminal state r   T)rH   r   rs   RuntimeErrordelete_info)r[   r]   r   s      r;   
delete_jobzJobManager.delete_job  s      0;;FCCCCCCCC
Z%;%;%=%=CF C C5?C C C  
 #//777777777tr=   c                     | j         S rE   )rH   r[   s    r;   job_info_clientzJobManager.job_info_client  s    $$r=   c                 F   K   | j                             |           d{V S )zGet latest status of a job.N)rH   r   r}   s     r;   get_job_statuszJobManager.get_job_status  s/      *55f=========r=   c                 F   K   | j                             |           d{V S )zGet latest info of a job.N)rH   r   r}   s     r;   get_job_infozJobManager.get_job_info  s/      *33F;;;;;;;;;r=   c                 D   K   | j                                          d{V S )zGet info for all jobs.N)rH   rp   r   s    r;   	list_jobszJobManager.list_jobs  s-      *77999999999r=   c                 6    | j                             |          S )zGet all logs produced by a job.)rN   get_logsr}   s     r;   get_job_logszJobManager.get_job_logs  s    ((000r=   c                  K   |                      |           d{V t          d| d          d}| j                            |          2 3 d{V }|V|r dS |                      |           d{V }|                                rd}t          j        | j                   d{V  _d                    |          W V  x6 dS )z/Return an iterator following the logs of a job.NzJob 'z' does not exist.FTr+   )	r  r   rN   	tail_logsrs   rX   r   LOG_TAIL_SLEEP_Sr6   )r[   r]   job_finishedlinesrr   s        r;   tail_job_logszJobManager.tail_job_logs  s)     $$V,,,,,,,,4@v@@@AAA+55f== 	% 	% 	% 	% 	% 	% 	%%} , FF#'#6#6v#>#>>>>>>>F))++ ,'+ mD$9::::::::::ggenn$$$$$ >==s   CrE   )F),ra   
__module____qualname____doc__r  r   WAIT_FOR_ACTOR_DEATH_TIMEOUT_Sr   strr   r\   r_   Loggerrn   rZ   r   r   r~   rt   r   rW   r   r   r   boolr   r#   r   r   r   r   r   r   r   r   r   r    r  r   r  r  r	  r   r  r   r=   r;   r?   r?   =   s          !%(" RV: :#:/2:IN: : : :(!S !W^ ! ! ! !"3 3 3" +1F     DH/ //+3K+@/ / / /, DHI6 I6I6+3K+@I6 I6 I6 I6V
 
hy>Q 
 
 
 
  %*	0 0sCx.0 0 "	0
 
c3h0 0 0 0d.##'.#	.# .# .# .#h (,04-1;?;?+/;?59` ` ` `  }	`
 d38n-` 4S>*` &eCJ&78` &eCJ&78` $C=` 'tCJ'78` &k2` 
` ` ` `D$      %!5 % % % %>3 >8I3F > > > >< <'1B < < < <:c7l!3 : : : :13 13 1 1 1 1%# %-2D % % % % % %r=   r?   )ErX   r   r_   r   r/   r3   r   r   typingr   r   r   r   r   rO   ray._private.ray_constants_privaterf   ray._common.utilsr   r	   ray._private.accelerators.npur
   $ray._private.accelerators.nvidia_gpur   ray._private.event.event_loggerr   ray._rayletr   	ray.actorr   ray.core.generated.event_pb2r   ray.dashboard.constsr   r   r   r    ray.dashboard.modules.job.commonr   r   r   r   0ray.dashboard.modules.job.job_log_storage_clientr   (ray.dashboard.modules.job.job_supervisorr   ray.dashboard.modules.job.utilsr   ray.dashboard.utilsr   ray.exceptionsr   r   r   ray.job_submissionr   r    ray.runtime_envr!   ray.util.scheduling_strategiesr"   r#   r`   ra   r   r  r<   r?   r   r=   r;   <module>r+     s      				        < < < < < < < < < < < < < < 



 2 2 2 2 2 2 2 2 2 8 8 8 8 8 8 8 8 Q Q Q Q Q Q S S S S S S < < < < < < ! ! ! ! ! ! ! ! ! ! ! ! . . . . . .                       Q P P P P P B B B B B B < < < < < < < < < < < < X X X X X X X X X X 6 6 6 6 6 6 6 6 , , , , , ,       
 
	8	$	$" " " " "	% 	% 	% 	% 	% 	% 	% 	% 	% 	%r=   