
    &`i!                        d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlZd dl	m
Z d dlmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ d dlmZm Z  d dl!m"Z"  e j#        e$          Z% G d d          Z& e"d          e G d de                                  Z'	 d!de(de)de)de(de)f
dZ*d"dZ+d Z, G d d e          Z-dS )#    N)	dataclass)	timedelta)Optional)Version)build_address)ray_constants)"register_custom_torch_dist_backend)GetTimeoutError)BaseWorkerGroup)get_address_and_port)BackendBackendConfig).DEFAULT_TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S&TORCH_PROCESS_GROUP_SHUTDOWN_TIMEOUT_S)	PublicAPIc                       e Zd Zd Zd ZdS )TorchConfigContextManagerc                     t           j                                        rOt          j        j                                         }|j        dk    r#t           j                            |           d S d S d S )Ncuda)torchr   is_availableraytrain
get_devicetype
set_device)selfdevices     j/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/ray/train/torch/config.py	__enter__z#TorchConfigContextManager.__enter__   sf    :""$$ 	.Y_//11F{f$$
%%f-----	. 	.$$    c                     dS )NF )r   r   value	tracebacks       r   __exit__z"TorchConfigContextManager.__exit__$   s    ur!   N)__name__
__module____qualname__r    r&   r#   r!   r   r   r      s2        . . .    r!   r   stable)	stabilityc                   v    e Zd ZU dZdZee         ed<   dZeed<   dZ	e
ed<   ed             Zed	             ZdS )
TorchConfiga  Configuration for torch process group setup.

    See https://pytorch.org/docs/stable/distributed.html for more info.

    Args:
        backend: The backend to use for training.
            See ``torch.distributed.init_process_group`` for more info and
            valid values.
            If set to None, nccl will be used if GPUs are requested, else gloo
            will be used.
        init_method: The initialization method to use. Either "env"
            for environment variable initialization or "tcp" for TCP
            initialization. Defaults to "env".
        timeout_s: Seconds for process group operations to timeout.
    Nbackendenvinit_method  	timeout_sc                     t           S N)_TorchBackendr   s    r   backend_clszTorchConfig.backend_cls@   s    r!   c                     t           S r4   )r   r6   s    r   train_func_contextzTorchConfig.train_func_contextD   s    ((r!   )r'   r(   r)   __doc__r.   r   str__annotations__r0   r2   intpropertyr7   r9   r#   r!   r   r-   r-   )   s            "GXc]!!!KIs  X ) ) X) ) )r!   r-   r1   r.   
world_rank
world_sizer0   r2   c           	      b   |dk    r%t                               d| d| d| d           n$t                               d| d| d| d           t                               d|             | dk    rt          t          j                  t          d          k     rd	}d
}nd}d}|t          j        vr>|t          j        vr0t                               d| d| d           dt          j        |<   n| dk    rt          |            t          j
        | |||t          |                     dS )a{  Connects the distributed PyTorch backend.

    Args:
        backend: The backend (nccl, gloo, etc.) to use for training.
        world_rank: Rank of the current worker.
        world_size: Number of workers participating in the job.
        init_method: URL specifying how to initialize the process group.
        timeout_s: Seconds for process group operations to timeout.
    r   zSetting up process group for: z [rank=z, world_size=]zusing ncclz2.2.0NCCL_ASYNC_ERROR_HANDLINGNCCL_BLOCKING_WAITTORCH_NCCL_ASYNC_ERROR_HANDLINGTORCH_NCCL_BLOCKING_WAITzSetting zn=1 to fail if NCCL collective communication operations are timing out. To override this behavior, you can set z=0.1hccl)seconds)r.   r0   rankr@   timeoutN)loggerinfodebugr   r   __version__osenvironr	   distinit_process_groupr   )r.   r?   r@   r0   r2   'TORCH_NCCL_ASYNC_ERROR_HANDLING_ENV_VAR TORCH_NCCL_BLOCKING_WAIT_ENV_VARs          r   _setup_torch_process_grouprW   I   s     Q([ ( ( ( ($( ( (	
 	
 	
 	

 	([ ( ( ( ($( ( (	
 	
 	
 LL#'##$$$& 5$%%(8(8886Q3/C,,6W3/I,32:EE0
BBLLgB g g:ag g g   CFBJ>?	F		*7333),,,     r!   Fc                 D   ddl m}  |            }| rt          j                     t          j                                        rZ|D ]Y}t          j                            |          5  t          j                                         d d d            n# 1 swxY w Y   Xd S d S )Nr   )get_devices)	ray.air._internal.torch_utilsrY   rS   destroy_process_groupr   r   r   r   empty_cache)r[   rY   devicesr   s       r   _shutdown_torchr^      s    999999kmmG %"$$$z   ) 	) 	)F""6** ) )
&&((() ) ) ) ) ) ) ) ) ) ) ) ) ) )) )	) 	)s   'BB	B	c                  f   ddl m}  t          j                                        }t          |                                          t          j        d<   t          |	                                          t          j        d<   t          |
                                          t          j        d<   t          |                                          t          j        d<   t          |                                          t          j        d<    |             }t          |          t          j        d<   d S )	Nr   )r   
LOCAL_RANKRANKLOCAL_WORLD_SIZE
WORLD_SIZE	NODE_RANKACCELERATE_TORCH_DEVICE)ray.train.torchr   r   r   get_contextr;   get_local_rankrQ   rR   get_world_rankget_local_world_sizeget_world_sizeget_node_rank)r   contextr   s      r   _set_torch_distributed_env_varsrn      s     +*****i##%%G"7#9#9#;#;<<BJ|W335566BJv%()E)E)G)G%H%HBJ!""7#9#9#;#;<<BJ|!'"7"7"9"9::BJ{ Z\\F,/KKBJ()))r!   c                   J    e Zd ZU dZeed<   dedefdZdefdZ	dede
fdZdS )	r5   Tshare_cuda_visible_devicesworker_groupbackend_configc                    t          j                    r?|j        6|                                }|                    dd          }|dk    rd}n
d}n|j        }|                    dt                    \  }}|j        dk    rd }|                    |||           d}	n7|j        d	k    rd
t          ||           }	nt          d|j         d          g }
t          t          |                    D ]H}|
                    |                    |t          ||t          |          |	|j                             It#          j        |
           d S t%          d          )NGPUr   rC   gloor/   c                 \    | t           j        d<   t          |          t           j        d<   d S )NMASTER_ADDRMASTER_PORT)rQ   rR   r;   addrports     r   set_env_varsz,_TorchBackend.on_start.<locals>.set_env_vars   s&    04BJ}-03D		BJ}---r!   ry   zenv://tcpztcp://zThe provided init_method (z2) is not supported. Must be either 'env' or 'tcp'.)r.   r?   r@   r0   r2   z#Distributed torch is not available.)rS   r   r.   get_resources_per_workergetexecute_singler   r0   executer   
ValueErrorrangelenappendexecute_single_asyncrW   r2   r   RuntimeError)r   rq   rr   	resourcesnum_gpus_per_workerr.   master_addrmaster_portr|   urlsetup_futuresis               r   on_startz_TorchBackend.on_start   s    0	F%-(AACC	&/mmE1&=&=#&**$GG$GG(0'3'B'B'( ($K )U22: : : $$\+$VVV+u44H}[+FFHH 1%11 1 1   M3|,,--  $$ 552 '#$#&|#4#4$'"0": 6  
 
 
 
 GM"""""DEEEr!   c                 *   |                     t          t          |          dk              }t          j        t
          t                    }	 t          j        ||           d S # t          $ r" t                              d| d           Y d S w xY w)N   )r[   )rL   z-Torch process group shutdown timed out after z seconds)execute_asyncr^   r   r   env_integerr   r   r   r   r
   rM   warning)r   rq   rr   futuresr2   s        r   on_shutdownz_TorchBackend.on_shutdown   s    ,,"%l"3"3a"7 - 
 
 "-2:
 
		GGY////// 	 	 	NNS	SSS     	s   A& &(BBc                 :    |                     t                     d S r4   )r   rn   )r   rq   rr   s      r   on_training_startz_TorchBackend.on_training_start   s     	<=====r!   N)r'   r(   r)   rp   boolr<   r   r-   r   r   r   r   r#   r!   r   r5   r5      s         '++++1F_ 1Fk 1F 1F 1F 1Ff     >+>=J> > > > > >r!   r5   )r1   )F).loggingrQ   dataclassesr   datetimer   typingr   r   torch.distributeddistributedrS   packaging.versionr   r   ray._common.network_utilsr   ray._privater    ray.air._internal.device_managerr	   ray.exceptionsr
   %ray.train._internal.base_worker_groupr   ray.train._internal.utilsr   ray.train.backendr   r   ray.train.constantsr   r   ray.utilr   	getLoggerr'   rM   r   r-   r;   r=   rW   r^   rn   r5   r#   r!   r   <module>r      sz    				 ! ! ! ! ! !                          % % % % % % 



 3 3 3 3 3 3 & & & & & & O O O O O O * * * * * * A A A A A A : : : : : : 4 4 4 4 4 4 4 4             		8	$	$
 
 
 
 
 
 
 
 X
) ) ) ) )- ) )  )F 7 777 7 	7
 7 7 7 7t	) 	) 	) 	)8 8 8"I> I> I> I> I>G I> I> I> I> I>r!   