
    &`iR                     2   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZmZmZ d dlZd dlmc mZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/Z/	 e j0        Z0n# e1$ r
 e j2        Z0Y nw xY wej3        dk    rP	 d dl4Z4d dl5Z5d dl6Z6nB# e7e8f$ r8Z9dZ4dZ5dZ6 ej:        e;          Z<e<=                    de9            Y dZ9[9ndZ9[9ww xY w G d d          Z>dS )    N)FIRST_COMPLETED)AnyDictListOptional)CoreContextFilter)JSONFormatterTextFormatter)build_address)'NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR)"NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR)RAY_JOB_CONFIG_JSON_ENV_VAR)"remove_ray_internal_flags_from_env)	GcsClient)ActorHandle)JOB_ID_METADATA_KEYJOB_NAME_METADATA_KEYJobInfoStorageClient)JobLogStorageClient)JobErrorType	JobStatuswin32z`Failed to Import win32api. For best usage experience run 'conda install pywin32'. Import error: c                   f   e Zd ZdZdZdZddgZ	 d!deded	eeef         d
edede	e         fdZ
d"dZ	 d#dedeeef         fdZd Zdededej        fdZdedeeef         fdZdej        defdZdeej                 fdZdeej                 dej        fdZ	 	 d$de	e         defdZd  Z dS )%JobSupervisoram  
    Ray actor created by JobManager for each submitted job, responsible to
    setup runtime_env, execute given shell command in subprocess, update job
    status, persist job logs and manage subprocess group cleaning.

    One job supervisor actor maps to one subprocess, for one job_id.
    Job supervisor actor should fate share with subprocess it created.
       g?SIGINTSIGTERMNjob_id
entrypointuser_metadatagcs_addresscluster_id_hexlogs_dirc                    || _         t          ||          }t          ||          | _        t	                      | _        || _        t          |t          |i| _	        | j	        
                    |           t          j                    | _        d | _        t          j        t"           d|           | _        |                                  d S )N)address
cluster_idz.supervisor-)_job_idr   r   _job_info_clientr   _log_client_entrypointr   r   	_metadataupdateasyncioEvent_stop_event_win32_job_objectlogging	getLogger__name___logger_configure_logger)selfr   r   r    r!   r"   r#   
gcs_clients           |/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/dashboard/modules/job/job_supervisor.py__init__zJobSupervisor.__init__G   s     {~NNN
 4Z J J.00% .v7LfUm,,, #=?? "& (H)J)J&)J)JKK         returnc                    t           j                            t          j        j        j                                        d| j         d          }t          j	        t           j        
                    |          d           | j                            t                                 t          j                    }t          j        |          }t#                      }t%          j        t$          j        d          rt+                      }|                    |           |                    |           | j                            |           | j                            |           d| j        _        dS )zx
        Configure self._logger object to write logs to file based on job
        submission ID and to console.
        zjobs/supervisor-z.logT)exist_okFN)ospathjoinray_privateworker_global_nodeget_logs_dir_pathr'   makedirsdirnamer4   	addFilterr   r1   StreamHandlerFileHandlerr
   ray_constantsenv_boolRAY_BACKEND_LOG_JSON_ENV_VARr	   setFormatter
addHandler	propagate)r6   supervisor_log_file_namestream_handlerfile_handler	formatters        r8   r5   zJobSupervisor._configure_loggere   s1   
 $&7<<L,>>@@1t|111$
 $
  	BGOO$<==MMMM022333 .00*+CDD!OO	!-"LeTT 	(%I##I...!!),,,///---!&r:   Fresources_specifiedc                 4   t          t          j                    j                  }|r|S |                    di           }|                    t                     |                    t                     |                    t          j	                   ||d<   |S )a  Get the runtime env that should be set in the job driver.

        Args:
            resources_specified: Whether the user specified resources (CPUs, GPUs,
                custom resources) in the submit_job request. If so, we will skip
                the workaround for GPU detection introduced in #24546, so that the
                behavior matches that of the user specifying resources for any
                other actor.

        Returns:
            The runtime env that should be set in the job driver.
        env_vars)
dictrA   get_runtime_contextruntime_envgetpopr   r   rK   RAY_WORKER_NICENESS)r6   rU   curr_runtime_envrW   s       r8   _get_driver_runtime_envz%JobSupervisor._get_driver_runtime_env{   s        7 9 9 EFF 	$## $''
B777888<===]6777'/$r:   c                     dS )z&Used to check the health of the actor.N r6   s    r8   pingzJobSupervisor.ping   s    r:   env	logs_pathc                 4   t          |d          5 }|                    d| j         d| j         d           t	          j        | j        dd|t          j        |t          j        dk    r&t          j
                            d          dk    rd	 nd
          }t          j                    }|j        }t          j        dk    rg	 t          j        |          }n# t          $ r |cY cd
d
d
           S w xY wt	          j        d| d| dt          j        t          j                   nt          j        dk    rt"          rt%          j        d
d          | _        t%          j        | j        t$          j                  }t$          j        |d         d<   t%          j        | j        t$          j        |           t#          j        t4          j        t4          j        z  d|          }	t%          j        | j        |	           |cd
d
d
           S # 1 swxY w Y   d
S )a  
        Runs the entrypoint command as a child process, streaming stderr &
        stdout to given log files.

        Unix systems:
        Meanwhile we start a demon process and group driver
        subprocess in same pgid, such that if job actor dies, entire process
        group also fate share with it.

        Windows systems:
        A jobObject is created to enable fate sharing for the entire process group.

        Args:
            logs_path: File path on head node's local disk to store driver
                command's stdout & stderr.
        Returns:
            child_process: Child process that runs the driver command. Can be
                terminated or killed upon user calling stop().
        azRunning entrypoint for job z: 
Tr   RAY_JOB_STOP_SIGNALr   c                  V    t          j        t           j        t           j        h          S N)signalpthread_sigmaskSIG_UNBLOCKr   ra   r:   r8   <lambda>z0JobSupervisor._exec_entrypoint.<locals>.<lambda>   s     6".! ! r:   N)shellstart_new_sessionstdoutstderrrd   
preexec_fnzwhile kill -s 0 z; do sleep 1; done; kill -9 -)rp   rr   rs    BasicLimitInformation
LimitFlagsF)openwriter'   r*   
subprocessPopenSTDOUTsysplatformr>   environr[   getpidpidgetpgidProcessLookupErrorDEVNULLwin32apiwin32jobCreateJobObjectr0   QueryInformationJobObject!JobObjectExtendedLimitInformation"JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSESetInformationJobObjectOpenProcesswin32conPROCESS_TERMINATEPROCESS_SET_QUOTAAssignProcessToJobObject)
r6   rd   re   	logs_filechild_process
parent_pid	child_pid
child_pgidwin32_job_infochild_handles
             r8   _exec_entrypointzJobSupervisor._exec_entrypoint   s   , )S!! F	!YOORdlRRd>NRRR   ', "& !( |w..
'<==II   #  M( J%)I |w&&)!#I!6!6JJ) ) ) )(((CF	! F	! F	! F	! F	! F	! F	! F	!>)  \z\\PZ\\%-%-     ((X(
 *2)A$)K)K&!)!C*H,V" "
 ? 67  0*>"  
  (3.1KK   
 1$2H,WWW MF	! F	! F	! F	! F	! F	! F	! F	! F	! F	! F	! F	! F	! F	! F	! F	! F	! F	!s7   B1HCHC3$H2C33DHHHc                 z   t           j        t          j        v rdt          j        t           j        <   t          j        j                            dt          j        j        j	        j
        j                  }|J t          t          j        |                     |          | j        d          t           j        |ddiS )z?Returns environment variables that should be set in the driver.autoN)rZ   metadataPYTHONUNBUFFERED1)rK    RAY_ADDRESS_ENVIRONMENT_VARIABLEr>   r   rA   rB   services%canonicalize_bootstrap_address_or_dierC   rD   _ray_paramstemp_dirr   jsondumpsr_   r+   )r6   rU   ray_addrs      r8   _get_driver_env_varsz"JobSupervisor._get_driver_env_vars   s     9RZGGIOBJ}EF<(NNCL'4@I
 
 ### (#'#?#?@S#T#T $ * * :H !
 	
r:   r   c                    K   |;|                                 }||S t          j        | j                   d {V  |9d S d S rk   )pollr-   sleepSUBPROCESS_POLL_PERIOD_S)r6   r   return_codes      r8   _pollingzJobSupervisor._polling  sd      '',,..K&"" mD$ABBBBBBBBB '''''r:   	processesc                    K   	 t          j        |d          \  }}t          |          dk    rdS t          j        | j                   d{V  N)z'Poll processes until all are completed.Tr   )timeoutN)psutil
wait_procslenr-   r   r   )r6   r   _alives       r8   	_poll_allzJobSupervisor._poll_all#  sd      	C*9a@@@JQ5zzQmD$ABBBBBBBBB	Cr:   sigc                 f    |D ]-}	 t          j        |j        |           # t          $ r Y *w xY wdS )z>Ensure each process is already finished or send a kill signal.N)r>   killr   r   )r6   r   r   procs       r8   _kill_processeszJobSupervisor._kill_processes,  sW     	 	D#&&&&%   	 	s   !
.._start_signal_actorc           
        K   | j                             | j                   d{V }|t          d| j         d          |j        }|j        }|t          j        k    rt          d| j         d          |t          j        k    rt          d| j         d| d| d          |r|j	        
                                 d{V  t          j        j        j        j        }dt!          |j        |j                   }t          j                                                    }| j                             | j        t          j        ||d	
           d{V  	 t,          j                                        }	t3          |	           |	                    |                     |                     | j                            d|	t<          j                             | j         !                    | j                  }
| "                    |	|
          }|j#        }tI          | %                    |                    }tM          j	        |tI          | j'        	                                          gtP                     d{V \  }}| j'        )                                r|*                                 tV          j,        dk    r#| j-        rt]          j/        | j-        d           ntV          j,        dk    rt,          j        0                    dd          }|| j1        vr| j        2                    | d           d}tg          j4        |          }|g|5                    d          z   }| 6                    |to          tp          |                     	 ts          t,          j        0                    d| j:                            }tI          | ;                    |                    }tM          j<        ||           d{V  | j                            d| j         d| d           n^# tL          j=        $ rL | j        2                    d| j         d| d| d           | 6                    |tp          j>                   Y nw xY w| j                             | j        t          j?                   d{V  nt          |          dk    s
J d            |\  }|A                                }| j                            d| j         d|            |dk    r3| j                             | j        t          jB        |           d{V  n{| j         C                    | j                   d{V }||dk    r
d | d!|z   }nd | d"}| j                             | j        t          jD        ||t          jF        #           d{V  n# t          $ r | j        H                    d$t          jJ                                	 | j                             | j        t          jD        t          jJ                    t          jK        %           d{V  n># t          $ r1 | j        H                    d&t          jJ                                Y nw xY wY nw xY wt          jL        M                                 dS # t          jL        M                                 w xY w)'aA  
        Stop and start both happen asynchronously, coordinated by asyncio event
        and coroutine, respectively.

        1) Sets job status as running
        2) Pass runtime env and metadata to subprocess as serialized env
            variables.
        3) Handle concurrent events of driver execution and
        Nz&Status could not be retrieved for job .zJob zN is already in RUNNING state. JobSupervisor.run() should only be called once. z, is not in PENDING state. Current status is z with message zhttp://)driver_agent_http_addressdriver_node_id)jobinfo_replace_kwargsz"Submitting job with RAY_ADDRESS = )return_whenr   ri   r   z7 not a valid stop signal. Terminating job with SIGTERM.T)	recursiveRAY_JOB_STOP_WAIT_TIME_Sz% has been terminated gracefully with z$Attempt to gracefully terminate job z	 through z has timed out after z5 seconds. Job is now being force-killed with SIGKILL.   z#Should have only one coroutine donez% entrypoint command exited with code r   )driver_exit_coderu   z-Job entrypoint command failed with exit code z3, last available logs (truncated to 20,000 chars):
z. No logs available.)messager   
error_typezAGot unexpected exception while trying to execute driver command. )r   r   z2Failed to update job status to FAILED. Exception: )Nr(   get_infor'   RuntimeErrorstatusr   r   RUNNINGPENDINGwaitremoterA   rB   rC   global_workernoder   node_ip_addressdashboard_agent_listen_portrY   get_node_id
put_statusr>   r   copyr   r,   r   r4   inforK   r   r)   get_log_file_pathr   r   create_taskr   r-   r/   r   is_setcancelr}   r~   r0   r   TerminateJobObjectr[   VALID_STOP_SIGNALSwarningr   Processchildrenr   getattrrl   int DEFAULT_RAY_JOB_STOP_WAIT_TIME_Sr   wait_forTimeoutErrorSIGKILLSTOPPEDr   result	SUCCEEDEDget_last_n_log_linesFAILEDr   JOB_ENTRYPOINT_COMMAND_ERROR	Exceptionerror	traceback
format_exc"JOB_ENTRYPOINT_COMMAND_START_ERRORactor
exit_actor)r6   r   rU   	curr_infocurr_statuscurr_messager   r   r   rd   log_pathr   r   polling_taskfinishedr   stop_signaljob_processproc_to_killstop_job_wait_timepoll_job_stop_taskchild_process_taskr   log_tailr   s                            r8   runzJobSupervisor.run5  s      /88FFFFFFFF	WWWWXXX& ()+++Dt| D D D   )+++Pt| P P%0P P@LP P P  
  	4%*11333333333|"05$umD<PRVRr.s.s$u$u!022>>@@#..L-F"0$ $ / 
 
 	
 	
 	
 	
 	
 	
 	
z	#*//##C /s333 JJt001DEEFFFLI}EFI I   '99$,GGH 11#x@@M%)I&t}}]'C'CDDL '{4+;+@+@+B+BCCD+! ! !      KHa
 &&(( N##%%%<7**t/E*/0FKKKK\W,,"$*..1F	"R"RK"$*AAA,,* 0 0 0   '0"(.";";K$/=;3G3GRV3G3W3W#WL ((wv{7S7STTTK-0JNN : $ E . .* .99U9U-V-V*%./ACUVVVVVVVVV))34< 3 3$/3 3 3    #/ K K K,,94< 9 9'29 919 9 9   ,,\6>JJJJJK +66t|YEVWWWWWWWWWW 8}})))+P)))'/$#07799!!64< 6 6(36 6   !##/::!+)4 ;           &*%5%J%J4<%X%XXXXXXXH+BQ5@Q Q Q ''  W5@W W W   /::!( ')4#/#L ;           	 	 	L5%0225 5  +66L$%022+N	 7              "";"+"6"8"8; ;    	& I  """""CI  """"sw   H;W BP, +W ,ARW REW 
Z7 8ZAYZ8ZZZZZ7 ZZ7 7 [c                 8    | j                                          dS )zCSet step_event and let run() handle the rest in its asyncio.wait().N)r/   setrb   s    r8   stopzJobSupervisor.stop  s    r:   rk   )r;   N)F)NF)!r3   
__module____qualname____doc__r   r   r   strr   r   r9   r5   boolr   r_   rc   rX   rz   r{   r   r   r   r   r   r   r   r   rl   Signalsr   r   r  r  ra   r:   r8   r   r   9   s!         ()$""I. #'! !! ! CH~	!
 ! ! 3-! ! ! !<' ' ' '. +0   #' 	c3h       8  \!D \!S \!Z=M \! \! \! \!|
 
c3h 
 
 
 
@CJ,< C C C C CCfn)= C C C Cfn)= FN     6:$)	j# j# &k2j# "	j# j# j# j#X    r:   r   )?r-   r   r1   r>   rl   rz   r}   r   asyncio.tasksr   typingr   r   r   r   rA   ray._private.ray_constantsrB   rK   ray._common.filtersr   ray._common.formattersr	   r
   ray._common.network_utilsr   ray._private.accelerators.npur   $ray._private.accelerators.nvidia_gpur   "ray._private.runtime_env.constantsr   ray._private.utilsr   ray._rayletr   	ray.actorr    ray.dashboard.modules.job.commonr   r   r   0ray.dashboard.modules.job.job_log_storage_clientr   ray.job_submissionr   r   r   r   AttributeErrorensure_futurer~   r   r   r   ModuleNotFoundErrorImportErrorer2   r3   loggerr   r   ra   r:   r8   <module>r      s      				      



     ) ) ) ) ) ) , , , , , , , , , , , , 



 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 ? ? ? ? ? ? ? ? 3 3 3 3 3 3 Q Q Q Q Q Q S S S S S S J J J J J J A A A A A A ! ! ! ! ! ! ! ! ! ! ! !         
 Q P P P P P 6 6 6 6 6 6 6 6 (%KK ( ( ('KKK( <7
- 	
 	
 	
""8,,:67: :	
 	
 	
 	
 	
 	
 	
 	
	
j j j j j j j j j js*   B B,+B,;C D.DD