
    &`i)O                     p   d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
mZ  ej        e          ZdZeZeZeZ G d dej                  Z G d de
          Z G d	 d
e
          Z G d de
          Z G d dej                  Z G d de          Z G d de          Z G d d          ZdS )zGPU providers for monitoring GPU usage in Ray dashboard.

This module provides an object-oriented interface for different GPU providers
(NVIDIA, AMD) to collect GPU utilization information.
    N)DictListOptional	TypedDictUnioni   c                       e Zd ZdZdZdZdS )GpuProviderTypezEnum for GPU provider types.nvidiaamdN)__name__
__module____qualname____doc__NVIDIAAMD     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/dashboard/modules/reporter/gpu_providers.pyr	   r	      s        &&F
CCCr   r	   c                   >    e Zd ZU dZeed<   eed<   ee         ed<   dS )ProcessGPUInfoz1Information about GPU usage for a single process.pidgpu_memory_usagegpu_utilizationN)	r   r   r   r   int__annotations__	Megabytesr   
Percentager   r   r   r   r       s>         ;;	HHHj))))))r   r   c                       e Zd ZU dZeed<   eed<   eed<   ee         ed<   e	ed<   e	ed<   ee
eef                  ed<   d	S )
GpuUtilizationInfoz4GPU utilization information for a single GPU device.indexnameuuidutilization_gpumemory_usedmemory_totalprocesses_pidsN)r   r   r   r   r   r   strr   r   r   r   r   r   r   r   r   r   (   sw         >>JJJ
III
IIIj))))T#~"56777777r   r   c                   j    e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed	<   d
S )TpuUtilizationInfor    r!   tpu_typetpu_topologytensorcore_utilizationhbm_utilization
duty_cycler$   r%   N)r   r   r   r   r   r'   r   Bytesr   r   r   r)   r)   5   su         JJJ
IIIMMM&&&&r   r)   c                      e Zd ZdZd Zej        defd            Zej        de	fd            Z
ej        de	fd            Zej        d             Zej        dee         fd            Zed	eeef         defd
            ZdS )GpuProviderz&Abstract base class for GPU providers.c                     d| _         d S )NF)_initializedselfs    r   __init__zGpuProvider.__init__D   s    !r   returnc                     dS )z$Return the type of the GPU provider.Nr   r4   s    r   get_provider_namezGpuProvider.get_provider_nameG   	     	r   c                     dS )z6Check if the GPU provider is available on this system.Nr   r4   s    r   is_availablezGpuProvider.is_availableL   r:   r   c                     dS )z8Initialize the GPU provider. Returns True if successful.Nr   r4   s    r   _initializezGpuProvider._initializeQ   r:   r   c                     dS )z1Shutdown the GPU provider and clean up resources.Nr   r4   s    r   	_shutdownzGpuProvider._shutdownV   r:   r   c                     dS )z7Get GPU utilization information for all available GPUs.Nr   r4   s    r   get_gpu_utilizationzGpuProvider.get_gpu_utilization[   r:   r   bc                 Z    t          | t                    r|                     d          S | S )z2Decode bytes to string for Python 3 compatibility.zutf-8)
isinstancebytesdecode)rC   s    r   _decodezGpuProvider._decode`   s-     a 	%88G$$$r   N)r   r   r   r   r6   abcabstractmethodr	   r9   boolr<   r>   r@   r   r   rB   staticmethodr   r'   rF   rH   r   r   r   r1   r1   A   s1       00" " " 	?     	d     	T     	   	T*<%=     5e$     \  r   r1   c                        e Zd ZdZ fdZdefdZdefdZdefdZ	d Z
dee         fdZdee         fd	Zd
edee         fdZd
ededee         fdZd
edee         fdZ xZS )NvidiaGpuProviderz!NVIDIA GPU provider using pynvml.c                 d    t                                                       d | _        i | _        d S N)superr6   _pynvml_gpu_process_last_sample_tsr5   	__class__s    r   r6   zNvidiaGpuProvider.__init__k   s.    ;=(((r   r7   c                     t           j        S rP   )r	   r   r4   s    r   r9   z#NvidiaGpuProvider.get_provider_nameq   s    %%r   c                     	 ddl mc mc m} |                                 |                                 dS # t          $ r(}t                              d|            Y d}~dS d}~ww xY w)z#Check if NVIDIA GPUs are available.r   NTzNVIDIA GPU not available: F)	ray._private.thirdparty.pynvml_private
thirdpartypynvmlnvmlInitnvmlShutdown	Exceptionloggerdebugr5   r[   es      r   r<   zNvidiaGpuProvider.is_availablet   s    	;;;;;;;;;;;;OO!!!4 	 	 	LL9a99:::55555	   48 
A*A%%A*c                     | j         rdS 	 ddlmc mc m} || _        | j                                         d| _         dS # t          $ r(}t          	                    d|            Y d}~dS d}~ww xY w)z#Initialize the NVIDIA GPU provider.Tr   Nz*Failed to initialize NVIDIA GPU provider: F)
r3   rX   rY   rZ   r[   rR   r\   r^   r_   r`   ra   s      r   r>   zNvidiaGpuProvider._initialize   s     	4		;;;;;;;;;;;;!DLL!!### $D4 	 	 	LLIaIIJJJ55555	   3A   
A2
A--A2c                     | j         rk| j        rf	 | j                                         n4# t          $ r'}t                              d|            Y d}~nd}~ww xY wd| _         dS # d| _         w xY wdS dS )z!Shutdown the NVIDIA GPU provider.z)Error shutting down NVIDIA GPU provider: NF)r3   rR   r]   r^   r_   r`   r5   rb   s     r   r@   zNvidiaGpuProvider._shutdown   s     	* 	**))++++ N N NLLLMMMMMMMMN %*!!!E!))))	* 	* 	* 	*+   * A' 
AAA' AA' '	A0c                 *    |                                  S )zDGet GPU utilization information for all NVIDIA GPUs and MIG devices.)_get_pynvml_gpu_usager4   s    r   rB   z%NvidiaGpuProvider.get_gpu_utilization   s     ))+++r   c                    | j         s|                                 sg S g }	 | j                                        }t	          |          D ]}| j                            |          }	 | j                            |          }|d         r,|                     ||          }|                    |           kn# | j        j	        t          f$ r Y nw xY w|                     ||          }|r|                    |           n4# t          $ r'}t                              d|            Y d }~nd }~ww xY w|                                  n# |                                  w xY w|S )Nr   z&Error getting NVIDIA GPU utilization: )r3   r>   rR   nvmlDeviceGetCountrangenvmlDeviceGetHandleByIndexnvmlDeviceGetMigMode_get_mig_devicesextend	NVMLErrorAttributeError_get_gpu_infoappendr^   r_   warningr@   )	r5   gpu_utilizationsnum_gpusi
gpu_handlemig_modemig_devicesgpu_inforb   s	            r   rj   z'NvidiaGpuProvider._get_pynvml_gpu_usage   s     	##%% 		|6688H8__ 6 6!\DDQGG
	#|@@LLH{ !&*&;&;J&J&J(//<<< 	!
 .?   D
  --j!<< 6$++H555%6(  	I 	I 	INNGAGGHHHHHHHH	I NNDNNsU   AD  &AB53D  5CD  C1D  ?E	  
D1
D,'E	 ,D11E	 	E	gpu_indexc                    g }	 | j                             |          }t          |          D ]a}	 | j                             ||          }|                     |||          }|r|                    |           M# | j         j        $ r Y ^w xY wn@# | j         j        t          f$ r'}t          	                    d|            Y d}~nd}~ww xY w|S )z6Get MIG device information for a GPU with MIG enabled.zError getting MIG devices: N)
rR   nvmlDeviceGetMaxMigDeviceCountrm   #nvmlDeviceGetMigDeviceHandleByIndex_get_mig_device_inforu   rr   rs   r_   r`   )	r5   rz   r~   r|   	mig_countmig_idx
mig_handlemig_inforb   s	            r   rp   z"NvidiaGpuProvider._get_mig_devices   s   	<CCJOOI ++  !%!Q!Q"G" "J
  $88YPWXXH 5#**8444|-   H  &7 	< 	< 	<LL:q::;;;;;;;;	< s;   +B A	A:9B :B	B BB C&CC	mig_indexc           
         	 | j                             |          }d}	 | j                             |          }t          |j                  }n9# | j         j        $ r'}t                              d|            Y d}~nd}~ww xY wi }	 | j                             |          }	| j         	                    |          }
|	|
z   D ]^}t          t          |j                  |j        rt          |j                  t          z  ndd          |t          |j                  <   _n9# | j         j        $ r'}t                              d|            Y d}~nd}~ww xY w	 |                     | j                             |                    }|                     | j                             |                    }n# | j         j        $ ry 	 |                     | j                             | j                             |                              }| d| }d| d	| }n # t$          $ r d
| d| }d| d	| }Y nw xY wY nw xY wt'          |dz  |z   |||t          |j                  t          z  t          |j                  t          z  |          S # t$          $ r(}t                              d|            Y d}~dS d}~ww xY w)z-Get utilization info for a single MIG device.z+Failed to retrieve MIG device utilization: Nr   r   r   r   z)Failed to retrieve MIG device processes: z MIG zMIG-GPU--zNVIDIA MIG Device .zMIG-  r    r!   r"   r#   r$   r%   r&   zError getting MIG device info: )rR   nvmlDeviceGetMemoryInfonvmlDeviceGetUtilizationRatesr   gpurr   r_   r`   $nvmlDeviceGetComputeRunningProcesses%nvmlDeviceGetGraphicsRunningProcessesr   r   usedGpuMemoryMBrH   nvmlDeviceGetUUIDnvmlDeviceGetNamern   r^   r   usedtotal)r5   r   r~   r   memory_infoutilizationutilization_inforb   r&   nv_comp_processesnv_graphics_processes
nv_processmig_uuidmig_nameparent_names                  r   r   z&NvidiaGpuProvider._get_mig_device_info   s|   C	,>>zJJK KP#'<#M#M$ $  ""2"677<) P P PN1NNOOOOOOOOP  NN$(L$U$U% %! LFFzRR & #46K"K 
 
J:H
//  *7#C
 899R??!" )-	; 	; 	;N3z~#6#677
 <) N N NLLLMMMMMMMMN><<(F(Fz(R(RSS<<(F(Fz(R(RSS<) > > >
>"&,,66 LCCINN # #K
 #.??I??HA)AAiAAHH  > > >KIKK	KKH=i==)==HHH>> &$&2 + 011R7 !233r9-     	 	 	LL>1>>???44444	s   J .A J BA?:J ?BJ 
BD# "J #E2EJ EJ AF8 7J 8I AHI H:7I 9H::I =J ?I  AJ 
KK  Kc           
         	 | j                             |          }d}	 | j                             |          }t          |j                  }n9# | j         j        $ r'}t                              d|            Y d}~nd}~ww xY wi }	 t          t          j                    dz            }| j	        
                    |d          }	| j                             ||	          }
|| j	        |<   |
D ]}t          t          |j                  t          |j                  dz  t          |j                  z  t           z  t          |j                            |t          |j                  <   n# | j         j        $ r}t                              d|            	 | j                             |          }| j                             |          }||z   D ]^}t          t          |j                  |j        rt          |j                  t           z  ndd          |t          |j                  <   _n9# | j         j        $ r'}t                              d	|            Y d}~nd}~ww xY wY d}~nd}~ww xY wt+          ||                     | j                             |                    |                     | j                             |                    |t          |j                  t           z  t          |j                  t           z  |
          S # t4          $ r(}t                              d|            Y d}~dS d}~ww xY w)z1Get utilization info for a regular (non-MIG) GPU.r   z$Failed to retrieve GPU utilization: Nr   r   d   r   zFailed to retrieve GPU processes using `nvmlDeviceGetProcessesUtilizationInfo`, fallback to `nvmlDeviceGetComputeRunningProcesses` and `nvmlDeviceGetGraphicsRunningProcesses`: z{Failed to retrieve GPU processes using `nvmlDeviceGetComputeRunningProcesses` and `nvmlDeviceGetGraphicsRunningProcesses`: r   zError getting GPU info: )rR   r   r   r   r   rr   r_   r`   timerS   get%nvmlDeviceGetProcessesUtilizationInfor   r   memUtilr   r   smUtilr   r   r   r   rH   r   r   r   r^   )r5   rz   r~   r   r   r   rb   r&   current_ts_ms
last_ts_msnv_processesr   r   r   
fallback_es                  r   rt   zNvidiaGpuProvider._get_gpu_info*  s   J	,>>zJJK KI#'<#M#M$ $  ""2"677<) I I IGAGGHHHHHHHHI  N- #DIKK$$6 7 7!=AA)QOO
#|QQ
    ?L0;".  J:H
//),Z-?)@)@*k/00*1 * ),J,=(>(>; ; ;N3z~#6#677 <)    K  HI  K  K  II*UU & JJ:VV * '8:O&O 	 	
>L #JN 3 3 $.#;!'J$< = = C C%&,0? ? ?s:>':':;;	 |-   LL c  Wa  c  c       /8 &\\$,"@"@"L"LMM\\$,"@"@"L"LMM + 011R7 !233r9-     	 	 	LL7A7788844444	s   L .A L BA?:L ?BL 
C%E1 /L 1I: I5BH76I57I-I(#I5(I--I50L 5I::B"L 
M'M

M)r   r   r   r   r6   r	   r9   rK   r<   r>   r@   r   r   rB   rj   r   rp   r   r   rt   __classcell__rU   s   @r   rN   rN   h   su       ++> > > > >&? & & & &
d 
 
 
 
T     * * *,T*<%= , , , ,
# t,>'? #  #  #  # Jc dCU>V    :G%(G58G	$	%G G G GRL3 L8DV;W L L L L L L L Lr   rN   c                   d     e Zd ZdZ fdZdefdZdefdZdefdZ	d Z
dee         fdZ xZS )	AmdGpuProviderz AMD GPU provider using pyamdsmi.c                 V    t                                                       d | _        d S rP   )rQ   r6   	_pyamdsmirT   s    r   r6   zAmdGpuProvider.__init__|  s$    r   r7   c                     t           j        S rP   )r	   r   r4   s    r   r9   z AmdGpuProvider.get_provider_name  s    ""r   c                     	 ddl mc mc m} |                                 |                                 dS # t          $ r(}t                              d|            Y d}~dS d}~ww xY w)z Check if AMD GPUs are available.r   NTzAMD GPU not available: F)	 ray._private.thirdparty.pyamdsmirY   rZ   pyamdsmismi_initializesmi_shutdownr^   r_   r`   r5   r   rb   s      r   r<   zAmdGpuProvider.is_available  s    	????????????##%%%!!###4 	 	 	LL616677755555	rc   c                     | j         rdS 	 ddlmc mc m} || _        | j                                         d| _         dS # t          $ r(}t          	                    d|            Y d}~dS d}~ww xY w)z Initialize the AMD GPU provider.Tr   Nz'Failed to initialize AMD GPU provider: F)
r3   r   rY   rZ   r   r   r   r^   r_   r`   r   s      r   r>   zAmdGpuProvider._initialize  s     	4		????????????%DNN))+++ $D4 	 	 	LLF1FFGGG55555	re   c                     | j         rk| j        rf	 | j                                         n4# t          $ r'}t                              d|            Y d}~nd}~ww xY wd| _         dS # d| _         w xY wdS dS )zShutdown the AMD GPU provider.z&Error shutting down AMD GPU provider: NF)r3   r   r   r^   r_   r`   rg   s     r   r@   zAmdGpuProvider._shutdown  s     	* 	**++---- K K KIaIIJJJJJJJJK %*!!!E!))))	* 	* 	* 	*rh   c                 z   | j         s|                                 sg S g }	 | j                                        }| j                                        }t          |          D ]t}| j                            |          }|dk    rd}i }| j                            ||          D ]\}|j        rSt          t          |j                  t          |j                  t          z  d          |t          |j                  <   ]t          ||                     | j                            |                    t!          | j                            |                    |t          | j                            |                    t          z  t          | j                            |                    t          z  |          }|                    |           vn4# t*          $ r'}	t,                              d|	            Y d}	~	nd}	~	ww xY w|                                  n# |                                  w xY w|S )z1Get GPU utilization information for all AMD GPUs.r   Nr   r   z#Error getting AMD GPU utilization: )r3   r>   r   smi_get_device_countsmi_get_device_compute_processrm   smi_get_device_utilization&smi_get_compute_process_info_by_device
vram_usager   r   
process_idr   r   rH   smi_get_device_namehexsmi_get_device_unique_idsmi_get_device_memory_usedsmi_get_device_memory_totalru   r^   r_   rv   r@   )
r5   rw   rx   	processesry   r   r&   processinforb   s
             r   rB   z"AmdGpuProvider.get_gpu_utilization  sS     	##%% 	$	~::<<HEEGGI8__ . ."nGGJJ"$$"$K "$#~TTy     G ) BP #G$6 7 7-01C-D-D-J,0C C Cs7+='>'>? *dn&H&H&K&KLLT^DDQGGHH$/ #DN$M$Ma$P$P Q QUW W!$T^%O%OPQ%R%R!S!S"#1	 	 	 !''----7.:  	F 	F 	FNNDDDEEEEEEEE	F NNDNNs0   F7G H" 
H
#H H" H

H" "H8)r   r   r   r   r6   r	   r9   rK   r<   r>   r@   r   r   rB   r   r   s   @r   r   r   y  s        **    #? # # # #
d 
 
 
 
T     * * *. T*<%= .  .  .  .  .  .  .  . r   r   c                       e Zd ZdZd ZdefdZdee         fdZ	de
defdZdee         fdZdee         fd	Zdefd
ZdS )GpuMetricProviderz*Provider class for GPU metrics collection.c                 r    d | _         d| _        t                      t                      g| _        d| _        d S )NTF)	_provider_enable_metric_reportrN   r   
_providersr3   r4   s    r   r6   zGpuMetricProvider.__init__  s7    04%)",..0@0@A!r   r7   c                    | j         rdS |                                 | _        | j        W	 t                      }|                                 ng# t
          $ r&}|                     |          rd| _        Y d}~n<d}~ww xY wt          	                    dt          | j                  j                    d| _         | j        duS )zHInitialize the GPU metric provider by detecting available GPU providers.TNFzUsing GPU Provider: )r3   _detect_gpu_providerr   rN   r>   r^   _should_disable_gpu_checkr   r_   r   typer   )r5   nvidia_providerrb   s      r   
initializezGpuMetricProvider.initialize  s     	42244>!7"3"5"5++---- 7 7 711!44 716D.7 KKNtDN/C/C/LNNOOO ~T))s   "A 
A>A99A>c                 J    | j         D ]}|                                r|c S dS )z3Detect and return the first available GPU provider.N)r   r<   )r5   providers     r   r   z&GpuMetricProvider._detect_gpu_provider  s:     	  	 H$$&&   tr   nvidia_errorc                     t          |          j        dk    rdS 	 t          j        ddt          j                  }t          t          |                    dk    S # t          $ r Y dS w xY w)a  
        Check if we should disable GPU usage check based on the error.

        On machines without GPUs, pynvml.nvmlInit() can run subprocesses that
        spew to stderr. Then with log_to_driver=True, we get log spew from every
        single raylet. To avoid this, disable the GPU usage check on certain errors.

        See: https://github.com/ray-project/ray/issues/14305
        NVMLError_DriverNotLoadedFz+cat /sys/module/amdgpu/initstate |grep liveT)shellstderrr   )r   r   
subprocesscheck_outputDEVNULLlenr'   r^   )r5   r   results      r   r   z+GpuMetricProvider._should_disable_gpu_check   s     &*EEE5
	,=!)  F s6{{##q(( 	 	 	55	s   A A 
A+*A+c                 >   | j         sg S | j        s|                                  | j        g S 	 | j                                        }|S # t
          $ rH}t                              d| j                                        j	         d|            g cY d}~S d}~ww xY w)z6Get GPU usage information from the available provider.NzError getting GPU usage from z: )
r   r3   r   r   rB   r^   r_   r`   r9   value)r5   gpu_info_listrb   s      r   get_gpu_usagezGpuMetricProvider.get_gpu_usage  s    ) 	I  	OO>!I	 N>>@@M   	 	 	LL_0P0P0R0R0X__\]__   IIIIII		s   A
 

B=BBBc                 P    | j         r| j                                         j        ndS )z)Get the name of the current GPU provider.N)r   r9   r   r4   s    r   r9   z#GpuMetricProvider.get_provider_name-  s%    ;?>St~//1177tSr   c                     | j         S )z)Check if GPU metric reporting is enabled.)r   r4   s    r   is_metric_report_enabledz*GpuMetricProvider.is_metric_report_enabled1  s    ))r   N)r   r   r   r   r6   rK   r   r   r1   r   r^   r   r   r   r   r'   r9   r   r   r   r   r   r     s        44" " "*D * * * *,h{&;    i D    2t$67    (T8C= T T T T*$ * * * * * *r   r   )r   rI   enumloggingr   r   typingr   r   r   r   r   	getLoggerr   r_   r   r   r   r   r/   Enumr	   r   r   r)   ABCr1   rN   r   r   r   r   r   <module>r      s    


        9 9 9 9 9 9 9 9 9 9 9 9 9 9		8	$	$  
	    di   * * * * *Y * * *	8 	8 	8 	8 	8 	8 	8 	8	 	 	 	 	 	 	 	$ $ $ $ $#' $ $ $NN N N N N N N Nb^  ^  ^  ^  ^ [ ^  ^  ^ BY* Y* Y* Y* Y* Y* Y* Y* Y* Y*r   