
    &`in              	          d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	m
Z
 ddlmZ ddlmZmZmZmZmZ ddlZddlmc mZ ddlmZmZ ddlmZmZ ddlmZ dd	lm Z  dd
l!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z< ddl=m>Z>m?Z?m@Z@mAZAmBZB 	 ddlCZCn# eD$ r dZCY nw xY w ejE        eF          ZGd ZHi dddddddi dddd d!d"d#e-d#id$i d%d&i i dd'id(d&d)i d*g d+d,d-g d.g d/g d0g d1g g g g d2ZI G d3 d4          ZJd5e9jK        d6dfd7ZLeFd8k    r ejM        d9:          ZNeNO                    d;d,ePd<=           eNO                    d>d,ePd?=           eNO                    d@d,ePejQ        ejR        ejS        A           eNO                    dBd,ePejT        ejU        C           eNO                    dDd,ePejV        dEejV         dFC           eNO                    dGd#ePdH=           eNO                    dId,eWedJe dKC           eNO                    dLd,eWedMe dNC           eNO                    dOd,ePddPC           eNO                    dQd,ePdRdSC           eNO                    dTd,ePdRdUC           eNX                                ZYejZ        dVk    reYj[        ndZ\ejZ        dVk    reYj]        ndWZ^ e"eYj_        eYj`        eYja        eYjb        e\e^X            ejc        eYjd        eYje        e\e^           eGf                    dYejg                    eGf                    dZejh                    eGf                    d[eji                    eGf                    d\ejj                    eYjk        r ejl        m                    eYjk                  ZkndZkeYjn        Zoeo epd]           eJeoekeYja        eYjq        ^          Zrers                                 dS dS )_z"Autoscaler monitoring loop daemon.    N)Counter)asdict)AnyCallableDictOptionalUnion)build_addressparse_address)LOGGING_ROTATE_BACKUP_COUNTLOGGING_ROTATE_BYTES)logging_utils)get_event_logger)setup_component_logger)	GcsClient)StandardAutoscaler)teardown_cluster)*AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZEAUTOSCALER_METRIC_PORTAUTOSCALER_UPDATE_INTERVAL_SDISABLE_LAUNCH_CONFIG_CHECK_KEY)EventSummarizer)LoadMetrics)AutoscalerPrometheusMetrics)format_readonly_node_type)get_cluster_resource_state)gcs_pb2)Event)_initialize_internal_kv_internal_kv_del_internal_kv_get_internal_kv_initialized_internal_kv_putc                 "   g g }}	 t          | j                  D ]}t          |j                  }t	          |j                  D ]}|                    |           t	          |j                  D ]}|                    |           |j        dk    r|}n|}t	          |j                  D ]}|                    |           t          ||z             t          k    r nn*# t          $ r t                              d           Y nw xY w||fS )a4  Handle the message.resource_load_by_shape protobuf for the demand
    based autoscaling. Catch and log all exceptions so this doesn't
    interfere with the utilization based autoscaler until we're confident
    this is stable. Worker queue backlogs are added to the appropriate
    resource demand vector.

    Args:
        resource_load_by_shape (pb2.gcs.ResourceLoad): The resource demands
            in protobuf form or None.

    Returns:
        List[ResourceDict]: Waiting bundles (ready and feasible).
        List[ResourceDict]: Infeasible bundles.
    r   z!Failed to parse resource demands.)listresource_demandsdictshaperangenum_ready_requests_queuedappendnum_infeasible_requests_queuedbacklog_sizelenr   	Exceptionlogger	exception)resource_load_by_shapewaiting_bundlesinfeasible_bundlesresource_demand_pbrequest_shape_backlog_queues          s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/autoscaler/_private/monitor.pyparse_resource_demandsr:   :   sc    +-b'O>"&'='N"O"O 	 	 !3!9::M-GHH 6 6&&}5555-LMM 9 9"))-8888 "@1DD 2 /-:;; 4 4$$]3333O&8899<= = =  > > ><=====> ...s   CC# #$D
	D
cluster_namedefaultmax_workersupscaling_speedg      ?dockeridle_timeout_minutesprovidertypereadonlyuse_node_id_as_ipTauthavailable_node_typeszray.head.default	resourcesnode_configr=   head_node_typefile_mountscluster_synced_filesfile_mounts_sync_continuouslyFrsync_excludersync_filterinitialization_commandssetup_commandshead_setup_commands)worker_setup_commandshead_start_ray_commandsworker_start_ray_commandsc                       e Zd ZdZ	 	 	 	 ddedeeeg eeef         f         f         dede	d	e
e         d
e	fdZd Zd Zdede
e         fdZd Zd Zd Zd Zd Zd Zd Zd ZdS )MonitorzyAutoscaling monitor.

    This process periodically collects stats from the GCS and triggers
    autoscaler updates.
    NFTaddressautoscaling_configlog_dirprefix_cluster_info
monitor_ipretry_on_failurec                 f   || _         t          j        j        j        }t          | j                   | _        t          | j                   |rDt          |t                    }| j        
                    d|                                dd            |                     | j                  | _        t                              d| j                    d|_        t#          | j                   d         }	t%                      | _        d | _        t+                      | _        || _        || _        || _        d | _        d | _        |r=	 t9          t:          j        j        |          | _         n# tB          $ r
 d | _         Y nw xY wd | _         tE          | j                  | _#        |rtH          r	 t                              d%                    t                               |	dk    rd	dini }
tI          j&        dt          | j#        j'        d
|
 | j#        j(        )                                 | j#        j*        )                                 nK# tB          $ r t          +                    d           Y n%w xY wtH          st          ,                    d           t                              d           d S )N)rX   s   AutoscalerMetricsAddressTzsession_name: r   )session_namez-Starting autoscaler metrics server on port {}z	127.0.0.1addr)portregistryz8An exception occurred while starting the metrics server.z?`prometheus_client` not found, so metrics will not be exported.zMonitor: Started )-gcs_addressray_privateworkerglobal_workerr   
gcs_clientr   r
   r   internal_kv_putencodeget_session_name_session_namer0   infomoder   r   load_metricslast_avail_resourcesr   event_summarizerr[   r]   rY   
autoscalerreadonly_configr   RayEvent
SourceType
AUTOSCALERevent_loggerr/   r   prom_metricsprometheus_clientformatstart_http_serverrb   pending_nodesclearactive_nodesr1   warning)selfrX   rY   rZ   r[   r\   r]   rg   monitor_addrhead_node_ipkwargss              r9   __init__zMonitor.__init__   s    #$2#D,<===000 	(5KLLLO+++\-@-@-B-BD$   "224?CC9T%799:::$T%566q9'MM$(! / 1 1#6  0"4  $ 	%)$4'2G% %!!  ) ) )$(!!!) !%D7TEWXXX 	+ 	CJJ.   
 3?+2M2M&+..SU!3 /!.7     !/55777!.446666     N     # 	NNQ   	&'''''s%   $E, ,F ?F .BI $I32I3c           	            j         r j         }nt           _         fd}|}t          | j         j         j         j         j         j	                   _
        d S )Nc                       j         S N)rt   r   s   r9   get_latest_readonly_configzBMonitor._initialize_autoscaler.<locals>.get_latest_readonly_config   s    ++    )r[   rr   ry   )rY   BASE_READONLY_CONFIGrt   r   rp   ri   rm   r[   rr   ry   rs   )r   rY   r   s   `  r9   _initialize_autoscalerzMonitor._initialize_autoscaler   s    " 	<!%!8 $8D , , , , , "<,O $ 8!2*
 
 
r   c                    | j                             d          }|j        }t          |           t	          | j                   }|j        }d |D             }| j        rmg }t          |j                  D ]7}|j        	                                }|
                    ||j        f           8| j        j                            |           i }	|j        D ]}
|
j        }| j        rPt          |	                                          }i }|
j                                        D ]
\  }}|||<   |i dd|	|<   t%          |
j                  }t%          |
j                  }t)          |j                  \  }}t          |j        j                  }| j        duo%| j        j        d                             dd	          }|rI|                    d
          }|t5          t7          |                    }n|	                                }n|
j        }d}||v r||         dz  }nt8                              d| d           | j                            ||||||||           | j        r"| j        d                             |	           dS dS )z>Fetches resource usage data from GCS and updates load metrics.<   timeoutc                 (    i | ]}|j         |j        S rc   )node_ididle_duration_ms).0nodes     r9   
<dictcomp>z/Monitor.update_load_metrics.<locals>.<dictcomp>  s,     ,
 ,
 ,
48DL$/,
 ,
 ,
r      rG   NrA   rD   FNODE_ID_AS_RESOURCEg        i  znode_id z. not found in ray_nodes_idle_duration_ms_by_idrF   ) ri   get_all_resource_usageresource_usage_data"log_resource_batch_data_if_desiredr   node_statesrt   r%   r   hexr+   node_ip_addressrs   rA   
_set_nodesr   total_resourcesitemsr'   available_resourcesr:   r2   placement_group_loadplacement_group_dataconfiggetstrintr0   r   rp   update)r   responseresources_batch_datacluster_resource_stateray_node_states ray_nodes_idle_duration_ms_by_id	new_nodesmsgr   mirror_node_typesresource_message	node_typerH   kvr   r   r3   r4   pending_placement_groupsrD   
peloton_idipidle_duration_ss                           r9   update_load_metricszMonitor.update_load_metrics   s    ?99"9EE';*+?@@@
 "<DO!L!L0<,
 ,
<K,
 ,
 ,
(
  	;I2>?? A A+//++  '3+>!?@@@@O$//	::: 6 B <	 <	&.G# 
5gkkmmDD		,<BBDD % %DAq#$IaLL!*#%#$0 0!),
 ##3#CDDO"&'7'K"L"L2H$;3 3/O/ (,$9N( ($ !%t ; !.@VAc%u--  ! 6,001FGG
)S__--BB BB%5!O:::"B7"Kd"RVwVVV   $$#"(	 	 	 	  	S !78??@QRRRRR	S 	Sr   ri   returnc                     t                      sdS |                    dt          j        d          }|r|                                }|S )zObtain the session name from the GCS.

        If the GCS doesn't respond, session name is considered None.
        In this case, the metrics reported from the monitor won't have
        the correct session name.
        Ns   session_name
   r   )r"   internal_kv_getray_constantsKV_NAMESPACE_SESSIONdecode)r   ri   r_   s      r9   rl   zMonitor.get_session_nameO  s^     ()) 	4!11. 2 
 
  	1'..00Lr   c                 (   t                      sdS t          t          j        j        j                  }|r\	 t          j        |          }| j        	                    |           dS # t          $ r t                              d           Y dS w xY wdS )z@Fetches resource requests from the internal KV and updates load.NzError parsing resource requests)r"   r!   re   rf   r   #AUTOSCALER_RESOURCE_REQUEST_CHANNELjsonloadsrp   set_resource_requestsr/   r0   r1   )r   dataresource_requests      r9   update_resource_requestsz Monitor.update_resource_requestsd  s    ')) 	FL&J
 
  	DD#':d#3#3 !778HIIIII D D D  !BCCCCCCD		D 	Ds   .A' '$BBc                    	 	 t          j                     }|                                  t          j                     |z
  }|                                  |                                  | j                                        }|t          j                     t          j                    d}| j        r#| j        st          
                    d           n| j        ryt          j                     }| j                                         t          j                     |z
  |d<   | j                                        }	 |                     ||| j        j                   n*# t          $ r t                              d           Y nw xY w|r&t!          |          |d<   | j        j        j        |d<   | j                                        D ]s}|                    d          D ][}t          
                    d	                    t,          j        |                     | j        r| j        
                    |           \t| j                                         t!          |          |d
<   t5          j        |          }	t9                      rt;          t,          j        |	d           n3# t          $ r& | j        rt                              d           n Y nw xY wt          j         tB                     )zRun the monitor loop.T)gcs_request_timetimemonitor_pidz6Autoscaler has not yet received load metrics. Waiting.autoscaler_update_timezError emitting metricsautoscaler_reportnon_terminated_nodes_time
z{}{}load_metrics_report	overwritez-Monitor: Execution exception. Trying again...)"r   r   r   update_event_summaryrp   summaryosgetpidrs   r0   rn   r   emit_metricsall_node_typesr/   r1   r   non_terminated_nodesr   rr   splitr{   r   LOG_PREFIX_EVENT_SUMMARYrx   r~   r   dumpsr"   r#   DEBUG_AUTOSCALING_STATUSr]   sleepr   )
r   gcs_request_start_timer   load_metrics_summarystatusupdate_start_timeautoscaler_summaryr   lineas_jsons
             r9   _runzMonitor._runr  s/   K	5F)-&((***#'9;;1G#G --///))+++'+'8'@'@'B'B$(8 IKK#%9;;  ? ,24+< ,2 KKP    _ #2(,	%O**,,,7;y{{EV7VF34)-)@)@)B)B&C))0. O:   
 % C C C(()ABBBBBC * 6<=O6P6P23 !O@Z 7  $4<<>> 
= 
= %(IIdOO = =D"KK &$1$JD!" !"  
  $0 = $ 1 6 6t < < <= )//111067K0L0L,-*V,,+-- $%>SW       ( $$%TUUUU VU J3444WK	5s7   D6J% :!E J% $F J% FD!J% %-KKc                    |d S dD ]}|j                             |d          \  }}|j                            |d          }| j        j                            || j        j                                      |           | j        j                            || j        j                                      |           t                      }|j	        D ]\  }}	}||	xx         dz  cc<   |j
                                        D ]\  }	}
||	xx         |
z  cc<   |D ]H}	||	         }
| j        j	                            | j        j        |	                              |
           I|D ][}	|j                            |	d          }
| j        j                            | j        j        |	                              |
           \t                      }|j        D ]\  }}	||	xx         dz  cc<   |                                D ]C\  }	}
| j        j                            | j        j        |	                              |
           Dd S )N)CPUGPUTPU)r   r   r   )resourceSessionNamer   )r   NodeType)usager   pending_resourcesry   cluster_resourceslabelsr_   setr   r}   pending_launchesr   r   failed_nodesrecently_failed_nodes)r   r   r   
node_typesresource_namer7   totalpendingpending_node_countr   countfailed_node_countss               r9   r   zMonitor.emit_metrics  s   %42 
	 
	M+155mVLLHAu(:>>}aPPG/66& -: 7   c%jjj/66& -: 7   c'llll$YY1? 	/ 	/OAy!y)))Q.)))) 2 C I I K K 	3 	3Iuy)))U2))))# 	 	I&y1E+22 -:" 3   c%jjjj# 	 	I&377	1EEE*11 -:" 2   c%jjjj$YY.; 	/ 	/LAyy)))Q.)))) !3 8 8 : : 	 	Iu3:: -:" ;   c%jjjj		 	r   c                     | j                                         }| j        s2|| j        k    r)| j                            d|d            || _        dS dS dS )a  Report the current size of the cluster.

        To avoid log spam, only cluster size changes (CPU, GPU or TPU count change)
        are reported to the event summarizer. The event summarizer will report
        only the latest cluster size per batch.
        zResized to {}.c                     |S r   rc   )oldnews     r9   <lambda>z.Monitor.update_event_summary.<locals>.<lambda>  s    3 r   )quantity	aggregateN)rp   resources_avail_summaryrt   rq   rr   add)r   avail_resourcess     r9   r   zMonitor.update_event_summary  s|     +CCEE# 	84;T(T(T!%% (.. &   
 )8D%%%	8 	8(T(Tr   c                    | j         dS | j        t                              d           dS t                              d           d}|sy	 t          | j        dddd           d}t                              d           n># t          $ r1 t                              d           t          j        d	           Y nw xY w|wdS dS )
a  Cleanup the autoscaler, in case of an exception in the run() method.

        We kill the worker nodes, but retain the head node in order to keep
        logs around, keeping costs minimal. This monitor process runs on the
        head node anyway, so this is more reliable.Nz9Monitor: Cleanup failed due to lack of autoscaler config.z1Monitor: Exception caught. Taking down workers...FT)config_fileyesworkers_onlyoverride_cluster_namekeep_min_workerszMonitor: Workers taken down.z+Monitor: Cleanup exception. Trying again...   )	rs   rY   r0   errorrn   r   r/   r   r   )r   cleans     r9   destroy_autoscaler_workersz"Monitor.destroy_autoscaler_workers  s     ?"F"*LLTUUUFGHHH 	  $ 7!%*.%)    :;;;;   JKKK
1  	 	 	 	 	s   5B 8B=<B=c                 Z   | j         Qt          j                            dd          dk    r-| j                                          |                                  d| }t                      rt          t          j	        |d           ddl
m}  |t          j        || j        	           d S )
N RAY_AUTOSCALER_FATESHARE_WORKERS 1z0The autoscaler failed with the following error:
Tr   r   )publish_error_to_driver)ri   )rs   r   environr   kill_workersr  r"   r#   r   DEBUG_AUTOSCALING_ERRORray._private.utilsr  MONITOR_DIED_ERRORri   )r   r  messager  s       r9   _handle_failurezMonitor._handle_failure%  s    O'
A2FF#MMO((***++--- NeMM#%% 	5w$    	?>>>>>,	
 	
 	
 	
 	
 	
r   c                    	 |                      d| dd                    t          j        |                    z              n*# t          $ r t
                              d           Y nw xY wt          j        |dz              d S )NzTerminated with signal r   r  z#Monitor: Failure in signal handler.   )	r  join	tracebackformat_stackr/   r0   r1   sysexit)r   sigframes      r9   _signal_handlerzMonitor._signal_handler=  s    	D  1#111'')077889     	D 	D 	DBCCCCC	Dss   AA $A+*A+c                    t          j         t           j        | j                   t          j         t           j        | j                   	 t	                      rt          t          j                   |                                  | 	                                 d S # t          $ rB t                              d           |                     t          j                                w xY w)NzError in monitor loop)signalSIGINTr)  SIGTERMr"   r    r   r  r   r   r/   r0   r1   r  r#  
format_excr   s    r9   runzMonitor.runG  s     	fmT%9:::fnd&:;;;		')) H !FGGG'')))IIKKKKK 	 	 	4555  !5!7!7888	s   
AB AC')NFNT)__name__
__module____qualname____doc__r   r	   r   r   r   boolr   r   r   r   r   rl   r   r   r   r   r  r  r)  r/  rc   r   r9   rW   rW      sw         $)$(!%O( O(O( "#xDcN0B'C"CDO( 	O(
 "O( SMO( O( O( O( O(b
 
 
.ZS ZS ZSx9 #    *D D DN5 N5 N5`1 1 1f8 8 8   @
 
 
0      r   rW   r   r   c                     t          j        d          dk    rPt                              d           t                              |            t                              d           d S d S )N"AUTOSCALER_LOG_RESOURCE_BATCH_DATAr  z-Logging raw resource message pulled from GCS.z"Done logging raw resource message.)r   getenvr0   rn   )r   s    r9   r   r   X  sa     
y566#==CDDD()))899999 >=r   __main__z/Parse GCS server for the monitor to connect to.)descriptionz--gcs-addresszThe address (ip:port) of GCS.)requiredrB   helpz--autoscaling-configz'the path to the autoscaling config filez--logging-level)r:  rB   r<   choicesr;  z--logging-format)r:  rB   r<   r;  z--logging-filenamezFSpecify the name of log file, log to stdout if set empty, default is ""z
--logs-dirzBSpecify the path of the temporary directory used by Ray processes.z--logging-rotate-bytesz8Specify the max bytes for rotating log file, default is z bytes.z--logging-rotate-backup-countz9Specify the backup count of rotated log file, default is .z--monitor-ipz:The IP address of the machine hosting the monitor process.z--stdout-filepathr  z$The filepath to dump monitor stdout.z--stderr-filepathz$The filepath to dump monitor stderr.win32r   )logging_levellogging_formatrZ   filename	max_bytesbackup_countz)Starting monitor using ray installation: zRay version: zRay commit: zMonitor started with command: z--gcs-address must be set!)rZ   r\   )tr3  argparser   loggingr   r+  r%  r   r#  collectionsr   dataclassesr   typingr   r   r   r   r	   re   ray._private.ray_constantsrf   r   ray._common.network_utilsr
   r   ray._common.ray_constantsr   r   ray._privater   ray._private.event.event_loggerr   ray._private.ray_loggingr   ray._rayletr   "ray.autoscaler._private.autoscalerr    ray.autoscaler._private.commandsr   !ray.autoscaler._private.constantsr   r   r   r   (ray.autoscaler._private.event_summarizerr   $ray.autoscaler._private.load_metricsr   $ray.autoscaler._private.prom_metricsr   ray.autoscaler._private.utilr   ray.autoscaler.v2.sdkr   ray.core.generatedr   ray.core.generated.event_pb2r   ru   ray.experimental.internal_kvr   r    r!   r"   r#   rz   ImportError	getLoggerr0  r0   r:   r   rW   ResourceUsageBatchDatar   ArgumentParserparseradd_argumentr   LOGGER_LEVELLOGGER_LEVEL_CHOICESLOGGER_LEVEL_HELPLOGGER_FORMATLOGGER_FORMAT_HELPMONITOR_LOG_FILE_NAMEr   
parse_argsargsplatformlogging_rotate_byteslogging_rotation_byteslogging_rotate_backup_countlogging_rotation_backup_countr@  rA  logs_dirlogging_filename redirect_stdout_stderr_if_neededstdout_filepathstderr_filepathrn   __file____version__
__commit__argvrY   path
expanduserrd   bootstrap_address
ValueErrorr\   monitorr/  rc   r   r9   <module>r}     sa   ( (    				  



                  7 7 7 7 7 7 7 7 7 7 7 7 7 7 



 2 2 2 2 2 2 2 2 2 B B B B B B B B        ' & & & & & < < < < < < ; ; ; ; ; ; ! ! ! ! ! ! A A A A A A = = = = = =            E D D D D D < < < < < < L L L L L L B B B B B B < < < < < < & & & & & & : : : : : :                 
	8	$	$(/ (/ (/XI1 s b	
 A 
T' B "RPQRR (  2!" B#$ $U%& R'( B)* r+, b-. 2/0  !!#5   <Q Q Q Q Q Q Q Qh:!8:	: : : : z$X$F  F %c8W     6	     *2,     +-     33/3 3 3     Q	      $)) ) )     '+*&* * *     I     3     3     D ;>,':Q:QT66WX,/LG,C,C(( " (*&(2    3M2%	   KKJCLJJKKK
KK111222
KK/s~//000
KK;;;<<< "W//0GHH!( j5666g?	  G KKMMMMM s   C CC