
    &`iA                     (   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ  e j        e          Z ed	          e G d
 de                                  Zd Zd Zd Zd Zd Z G d de          ZdS )    N)	dataclass)BaseWorkerGroup)get_address_and_port)Backend)TorchConfig)	PublicAPIalpha)	stabilityc                   8    e Zd ZU dZdZeed<   ed             ZdS )TorchXLAConfigz
    Configuration for torch XLA setup.
    See https://pytorch.org/xla/release/1.13/index.html for more info.
    Currently, only "neuron_cores" accelerator (AwsNeuronXLABackend)
    is supported with xrt runtime.
    Fneuron_parallel_compilec                     t           S )N)_TorchAwsNeuronXLABackend)selfs    n/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/train/torch/xla/config.pybackend_clszTorchXLAConfig.backend_cls   s    ((    N)	__name__
__module____qualname____doc__r   bool__annotations__propertyr    r   r   r   r      sL           %*T)))) ) X) ) )r   r   c                  <    dd l } |                     g d           d S )Nr   )pkillz-fxrt_run_server)
subprocesscall)r   s    r   _kill_xrt_serverr!   #   s-    OO55566666r   c                  >   t           j                                        } t          |                                           t
          j        d<   t          |                                           t
          j        d<   t          |                                           t
          j        d<   t          | 	                                          t
          j        d<   t          | 
                                          t
          j        d<   t          | 	                                |                                 z            t
          j        d<   t          |                                           t
          j        d<   t          |                                           t
          j        d<   t          | 	                                          t
          j        d	<   d
t
          j        d<   dt
          j        d<   dt
          j        d<   dt
          j        d<   dt
          j        d<   d S )N
LOCAL_RANKRANKLOCAL_WORLD_SIZE
WORLD_SIZE
GROUP_RANKGROUP_WORLD_SIZE	ROLE_RANKROLE_WORLD_RANKROLE_WORLD_SIZEefaFI_PROVIDER1FI_EFA_USE_DEVICE_RDMAFI_EFA_FORK_SAFEXLA_TRANSFER_SEED_ASYNCNCCL_ASYNC_ERROR_HANDLING)raytrainget_contextstrget_local_rankosenvironget_world_rankget_local_world_sizeget_world_sizeget_node_rank)contexts    r   _set_xla_env_varsr?   )   s   i##%%G"7#9#9#;#;<<BJ|W335566BJv%()E)E)G)G%H%HBJ!""7#9#9#;#;<<BJ|"7#8#8#:#:;;BJ|%(  7#?#?#A#AA& &BJ!" "'"8"8":":;;BJ{$'(>(>(@(@$A$ABJ !$'(>(>(@(@$A$ABJ !
 !&BJ}+.BJ'(%(BJ!",/BJ().1BJ*+++r   c                      	 dd l m}  dd lmc m} dd l}|                     d           d S # t          $ r t          d          w xY w)Nr   xlaz5torch_xla must be installed to use torch_xla backend.)torch.distributeddistributedtorch_xla.core.xla_modelcore	xla_model!torch_xla.distributed.xla_backendinit_process_groupImportError)distxm	torch_xlas      r   _setup_xla_torch_process_grouprM   C   s    S((((((---------0000&&&&& S S SQRRRSs	   (, Ac                  `    dt           j        d<   dt           j        d<   dt           j        d<   d S )Nr.   NEURON_PARALLEL_COMPILENEURON_EXTRACT_GRAPHS_ONLYNEURON_FALL_BACK_TO_NULL_NEFF)r8   r9   r   r   r   %_set_neuron_parallel_compile_env_varsrR   P   s/    ,/BJ()/2BJ+,25BJ.///r   c                     	 ddl m}  ddlm} n# t          $ r t	          d          w xY wt
          j                            d          dk    rt          	                    d           dt
          j                            d	d
           d}t
          j
                            |          rt          j        |           t          j        |d           d }t
          j                            d          x}r,t          j        d|          x}r|                    d          } |||                     |                     d S d S )Nr   )CacheUrl)parallel_compilezBlibneuronxla must be installed to use Neuron parallel compilation.r#   0z0Compiling extracted graphs on local rank0 workerz/tmp/USERzno-userz/parallel_compile_workdir/T)exist_okNEURON_CC_FLAGSz--cache_dir[= ](\S+)   )libneuronxla.neuron_cc_cacherT   $libneuronxla.neuron_parallel_compilerU   rI   r8   r9   getloggerinfopathexistsshutilrmtreemakedirsresearchgroupget_cache_url)rT   rU   parallel_compile_workdirexplicit_cache_dirneuron_cc_flagsss         r    _neuron_compile_extracted_graphsrm   W   s|   
999999IIIIIII 
 
 
P
 
 	

 
z~~l##s**FGGG QBJNN6)44PPP 	! 7>>233 	4M2333
,t<<<< " jnn->???? 	0I5GGGq 0%&WWQZZ"$""#566	
 	
 	
 	
 	
# +*s    )c                   z    e Zd ZU  e ej                              Zeed<   dede	fdZ
dede	fdZdede	fdZdS )r   unique_run_idworker_groupbackend_configc                 ,    |                     t                     |                    dt                    \  }} fd}|                     |||           |j        r6t
                              d           |                     t                     dS dS )z+Logic ran right before training is started.r   c                     | t           j        d<   t          |          t           j        d<   j        t           j        d<   d S )NMASTER_ADDRMASTER_PORTTORCHELASTIC_RUN_ID)r8   r9   r6   ro   )addrportr   s     r   set_env_varsz8_TorchAwsNeuronXLABackend.on_start.<locals>.set_env_vars   s7    (,BJ}%(+D		BJ}%040BBJ,---r   )rw   rx   z1Extracting graphs for Neuron parallel compilationN)executer!   execute_singler   r   r^   r_   rR   )r   rp   rq   master_addrmaster_portry   s   `     r   on_startz"_TorchAwsNeuronXLABackend.on_start{   s    
 	-... $0#>#>qBV#W#W [	C 	C 	C 	C 	C 	\+NNN 1 	HKKKLLL  !FGGGGG	H 	Hr   c                 n    |                     t                     |                     t                     dS )z
        Configure the environment variables for the worker group.
        And initialize the xla distributed process group.
        TODO: Current setup only supports homogenous cluster with
         neuron_cores accelerator and xrt runtime.
        N)rz   r?   rM   r   rp   rq   s      r   on_training_startz+_TorchAwsNeuronXLABackend.on_training_start   s4     	.///;<<<<<r   c                     |                     t                     |j        r|                     t                     dS dS )z
        Logic ran right after training is finished.
        This is a sanity cleanup to kill xrt server, and to optionally
        run neuron parallel graph compilation
        N)rz   r!   r   rm   r   s      r   on_shutdownz%_TorchAwsNeuronXLABackend.on_shutdown   sN     	-... 1 	C  !ABBBBB	C 	Cr   N)r   r   r   r6   uuiduuid4ro   r   r   r   r~   r   r   r   r   r   r   r   x   s         ZTZ\\**M3***H_ Hn H H H H0
=+
==K
= 
= 
= 
=C+C=KC C C C C Cr   r   )loggingr8   re   rb   r   dataclassesr   r3   %ray.train._internal.base_worker_groupr   ray.train._internal.utilsr   ray.train.backendr   ray.train.torchr   ray.utilr   	getLoggerr   r^   r   r!   r?   rM   rR   rm   r   r   r   r   <module>r      s    				 				   ! ! ! ! ! ! 



 A A A A A A : : : : : : % % % % % % ' ' ' ' ' '      		8	$	$ W
) ) ) ) )[ ) )  )7 7 72 2 24S S S6 6 6
 
 
B3C 3C 3C 3C 3C 3C 3C 3C 3C 3Cr   