
    `i                    |   U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZm Z m!Z!m"Z" d d	l#m$Z$ d dl%Z%d dl&Z%d dl'Z%d dl(m)Z* d dl+m,Z, d d
l-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? d dl@mAZAmBZBmCZC  ejD        eE          ZFeFG                    ejH                   g dZIddgZJe:pe;pe>ZK G d de           ZLi d eLdd          d eLdd          d eLdd          d eLdd          d  eLd!d"          d# eLd$d%          d& eLd'd(          d) eLd*d+          d, eLd-d.          d/ eLd0d1          d2 eLd3d4          d5 eLd6d7          d8 eLd9d:          d; eLd<d=          d> eLd?d@          dA eLdBdC          dD eLdEdF          dG eLdHdI          iZMe G dJ dK                      ZNdL ZOdM ZPdN ZQdO ZRdP ZSdQ ZTdR ZUdS ZVdT ZWdU ZXdV ZYdW ZZdX Z[dY Z\dZ Z]d[ Z^d\ Z_dd]Z`d^ Zad_ Zbd` Zcdae%jd        dbeedceeddeffdeZge7dfdgdh edij          dhdkdhfdl            Zhe=rdmZin ee ejj        dndo                    ZidpdqiZke<rdrekds<   ddteffduZlddeefdvZmedw             Znddxeedyeedzeefd{Zodyeed|epfd}Zqdare!e	js                 etd~<   dde!ep         dddfdZuddZvdZw G d de?          Zx G d dex          Zydezepe{e         f         dedefdZ|da}ddeffdZ~d ZdeiewfdZ G d de?          Z G d de,j                  Z G d de,j                  Ze	 dd            Z G d de%j        j        j?                  Z G d dey          Z G d de?          ZdS )    N)contextmanager)	dataclass)	timedelta)Enum)partialreducewraps)StringIO)AnyCallable
NamedTupleOptionalUnion)patch)
DeviceType)_SymmetricMemory)	trace_log)FILE_SCHEMAfind_free_portIS_SANDCASTLEretry_on_connect_failuresskip_but_pass_in_sandcastleskip_but_pass_in_sandcastle_if	TEST_CUDATEST_HPUTEST_WITH_ROCMTEST_WITH_TSANTEST_XPUTestCase)_install_threaded_pg_uninstall_threaded_pgProcessLocalGroupncclxcclhcclcudaxpuc                   $    e Zd ZU eed<   eed<   dS )TestSkip	exit_codemessageN)__name__
__module____qualname__int__annotations__str     ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/torch/testing/_internal/common_distributed.pyr*   r*   @   s"         NNNLLLLLr4   r*   backend_unavailableH   z5Skipped because distributed backend is not available.small_worldsizeI   z Skipped due to small world size.odd_worldsizeW   zSkipped due to odd world size.no_cudaJ   zCUDA is not available.zmulti-gpu-1K   zNeed at least 1 CUDA devicezmulti-gpu-2M   zNeed at least 2 CUDA deviceszmulti-gpu-3P   zNeed at least 3 CUDA deviceszmulti-gpu-4Q   zNeed at least 4 CUDA deviceszmulti-gpu-5R   zNeed at least 5 CUDA deviceszmulti-gpu-6S   zNeed at least 6 CUDA deviceszmulti-gpu-7T   zNeed at least 7 CUDA deviceszmulti-gpu-8U   zNeed at least 8 CUDA devicesr$   L   z#c10d not compiled with NCCL support
skipIfRocmN   zTest skipped for ROCmno_peer_accessO   z'Test skipped because no GPU peer accessgenericV   zHTest skipped at subprocess level, look at subprocess log for skip reasonimporterrorX   z"Test skipped due to missing importno_acceleratorY   zaccelerator is not available.c                       e Zd Zi Zh ded<    e            ed<   h ded<   h ded<   i Zh ded<   h ded	<   h ded
<   h ded<    e            ed<   erdhed<   erdhed<   dS dS )DistTestCases>   mpiuccr$   r%   allgather_coalescedr   >   rT   r$   r%   zsendrecv anysourcezcpu barrier>   rT   gloor$   gpur'   ddpsubgrouppluginr&   hpur%   r(   N)r-   r.   r/   skip_collectivesetbackend_featurer   r   r3   r4   r5   rR   rR   _   s         O-K-K-KO)* #OH,C,C,CO()%<%<%<OM" O444OE555OF444OE"9"9"9OJ #OH *"( *"(* *r4   rR   c                     | t           v S N)DDP_RANK_DEVICES)devices    r5   requires_ddp_rankrc   u   s    %%%r4   c                 <     t                      fd            }|S )zSkips if the world size exceeds the number of GPUs, ensuring that if the
    test is run, each rank has its own GPU via ``torch.cuda.device(rank)``.c                     t           s2t          s+t          s$t          j        t
          d         j                   t          t          j	        d                   }t           rIt          j                                        |k     r't          j        t
          d|          j                   t          rIt          j                                        |k     r't          j        t
          d|          j                   t          rIt          j                                        |k     r't          j        t
          d|          j                    | i |S )Nr<   
WORLD_SIZE
multi-gpu-)r   r   r   sysexit
TEST_SKIPSr+   r0   osenvirontorchr'   device_countr[   r(   )argskwargs
world_sizefuncs      r5   wrapperzskip_if_no_gpu.<locals>.wrapper}   s     	6X 	6 	6HZ	*4555L122
 	F0022Z??HZ 9Z 9 9:DEEE 	F	..00:==HZ 9Z 9 9:DEEE 	F	..00:==HZ 9Z 9 9:DEEEtT$V$$$r4   r	   rr   rs   s   ` r5   skip_if_no_gpurv   y   s5     4[[% % % % [% Nr4   c                 <     t                      fd            }|S )Nc                      t           j        d         dk    rGt          t           j        d                   dk     r$t          j        t
          d         j                    | i |S )NBACKENDrS   rf      r8   rk   rl   r0   rh   ri   rj   r+   ro   rp   rr   s     r5   rs   z(skip_if_small_worldsize.<locals>.wrapper   s]    Jy!U**BJ|4L0M0MPQ0Q0QHZ 12<===tT$V$$$r4   rt   ru   s   ` r5   skip_if_small_worldsizer}      3    
4[[% % % % [% Nr4   c                 <     t                      fd            }|S )Nc                      t           j        d         dk    rJt          t           j        d                   dz  dk    r$t          j        t
          d         j                    | i |S )Nry   rS   rf         r:   r{   r|   s     r5   rs   z&skip_if_odd_worldsize.<locals>.wrapper   sb    Jy!U**BJ|4L0M0MPQ0QUV0V0VHZ0:;;;tT$V$$$r4   rt   ru   s   ` r5   skip_if_odd_worldsizer      r~   r4   c                       fd}|S )Nc                 B     t                      fd            }|S )Nc                      dk    rKt           j                                        k     r)t          j        t
          d          j                   d S  | i |S Nr$   rg   )rm   r'   rn   rh   ri   rj   r+   )ro   rp   backendrr   ns     r5   rs   zCrequire_n_gpus_for_nccl_backend.<locals>.decorator.<locals>.wrapper   sb    &  UZ%<%<%>%>%B%B$4$4$45?@@@@@tT,V,,,r4   rt   )rr   rs   r   r   s   ` r5   	decoratorz2require_n_gpus_for_nccl_backend.<locals>.decorator   s>    	t	- 	- 	- 	- 	- 	- 
	- r4   r3   )r   r   r   s   `` r5   require_n_gpus_for_nccl_backendr      s*          r4   c                      d } | S )Nc                 <     t                      fd            }|S )Nc                      	 ddl m}m}  | i |S # t          $ r( t	          j        t          d         j                   Y d S w xY w)Nr   )AutoModelForMaskedLM
BertConfigrM   )transformersr   r   ImportErrorrh   ri   rj   r+   )ro   rp   r   r   rr   s       r5   rs   z?import_transformers_or_skip.<locals>.decorator.<locals>.wrapper   sq    >IIIIIIIItT,V,,, > > >M2<======>s    .AArt   ru   s   ` r5   r   z.import_transformers_or_skip.<locals>.decorator   s3    	t	> 	> 	> 	> 
	> r4   r3   )r   s    r5   import_transformers_or_skipr      s    
 
 
 r4   c                    t           r$t          j                                        | k    rdS t          r$t          j                                        | k    rdS t          r$t          j                                        | k    rdS dS NTF)r   rm   r'   rn   r   r[   r   r(   )xs    r5   at_least_x_gpur      sv     UZ,,..!33t EI**,,11t EI**,,11t5r4   c                       fd}|S )Nc                 @     t                      fd            }|S )Nc                     t           j                                        r*t           j                                        k    r | i |S t          r*t           j                                        k    r | i |S t          r*t           j                                        k    r | i |S t          j	        t          d          j                   d S )Nrg   )rm   r'   is_availablern   r   r[   r   r(   rh   ri   rj   r+   )ro   rp   rr   r   s     r5   rs   z4skip_if_lt_x_gpu.<locals>.decorator.<locals>.wrapper   s    z&&(( -UZ-D-D-F-F!-K-KtT,V,,, -EI224499tT,V,,, -EI224499tT,V,,,HZ 0Q 0 01;<<<<<r4   rt   )rr   rs   r   s   ` r5   r   z#skip_if_lt_x_gpu.<locals>.decorator   s9    	t	= 	= 	= 	= 	= 
	= r4   r3   )r   r   s   ` r5   skip_if_lt_x_gpur      s$         r4   c                       fd}|S )Nc                 B     t                      fd            }|S )Nc                     dk    r | i |S t           j                                        r*t           j                                        k    r | i |S t	          j        t          d          j                   d S r   )rm   r'   r   rn   rh   ri   rj   r+   )ro   rp   r   rr   r   s     r5   rs   z9nccl_skip_if_lt_x_gpu.<locals>.decorator.<locals>.wrapper   s    &  tT,V,,,z&&(( -UZ-D-D-F-F!-K-KtT,V,,,HZ 0Q 0 01;<<<<<r4   rt   )rr   rs   r   r   s   ` r5   r   z(nccl_skip_if_lt_x_gpu.<locals>.decorator   s>    	t	= 	= 	= 	= 	= 	= 
	= r4   r3   )r   r   r   s   `` r5   nccl_skip_if_lt_x_gpur      s*    	 	 	 	 	 	 r4   c                     |                                  }d|v sJ d|v sJ d|v sJ |d         }|                    d          dk    r|n|                    d          d         }||v sJ d| d|             d S )	N	iteration	has_errorerrorz
Exception raised from r   zDid not find expected z in ddp logging data error: )_get_ddp_logging_datafindsplit)	model_DDP
err_substrddp_logging_datalogging_erractuals        r5   verify_ddp_error_loggedr      s     6688**********&&&&&"7+K ??566"<< 	
899!< 
 [   RRR[RR !    r4   c                 <     t                      fd            }|S )aJ  
    Convenience decorator to set/unset TORCH_NCCL_BLOCKING_WAIT flag. Note that use of
    this decorator will override the setting of TORCH_NCCL_ASYNC_ERROR_HANDLING for
    the particular test. After the test, both TORCH_NCCL_BLOCKING_WAIT and
    TORCH_NCCL_ASYNC_ERROR_HANDLING will be restored to their original values.
    c                     	 t           j        d         }t           j        d= n# t          $ r d }Y nw xY w	 t           j        d         }n# t          $ r d }Y nw xY wdt           j        d<   n# dt           j        d<   w xY w	  | i |}|||t           j        d<   ||t           j        d<   S S # ||t           j        d<   ||t           j        d<   w xY w)NTORCH_NCCL_ASYNC_ERROR_HANDLINGTORCH_NCCL_BLOCKING_WAIT1)rk   rl   KeyError)ro   rp    cached_nccl_async_error_handlingcached_nccl_blocking_waitretrr   s        r5   rs   z(with_nccl_blocking_wait.<locals>.wrapper  sH   	4AC1B, 
<== 	4 	4 	4/3,,,	4
	9:<**;%%  	- 	- 	-(,%%%	- 69BJ122SBJ128888	S$'''C 0;4 
<= )49R
566 5 0;4 
<= )49R
56RRRRsA   # 22A	 A+ 	AA+ AA+ +A< 	B- -$Crt   ru   s   ` r5   with_nccl_blocking_waitr     s;     4[[ S  S  S  S [ SD Nr4   c                       fd}|S )zK
    Runs a test for each distributed debug level specified in levels.
    c                 @     t                      fd            }|S )Nc                      t           j                            dd           }D ]P}|t           j        d<   t          j                      | i |}t          j                     ||t           j        d<   Q|S )NTORCH_DISTRIBUTED_DEBUG)rk   rl   getc10dset_debug_level_from_envbarrier)ro   rp   	old_levellevelr   rr   levelss        r5   rs   z:with_dist_debug_levels.<locals>.decorator.<locals>.wrapper9  s    
'@$GGI F F8=
45-///dD+F++(<EBJ89 Jr4   rt   )rr   rs   r   s   ` r5   r   z)with_dist_debug_levels.<locals>.decorator8  s9    	t	 	 	 	 	 
	 r4   r3   )r   r   s   ` r5   with_dist_debug_levelsr   3  s$    
    $ r4   c                  F    t          t          j                     d          S )Nz+c10d was not compiled with the Gloo backend)r   r   is_gloo_availabler3   r4   r5   requires_gloor   M  &    )"$$$5  r4   c           	         t           rt          j                    st          d          S t	          t
          j        j                                        | k     d|  dt
          j        j                                         d|           S d }|S )N+c10d was not compiled with the NCCL backendz0Requires NCCL version greater than or equal to: z	, found: z
, reason: c                 <     t                      fd            }|S )Nc                       | i |S r`   r3   r|   s     r5   rs   z9requires_nccl_version.<locals>.decorator.<locals>.wrapperb  s    tT,V,,,r4   rt   ru   s   ` r5   r   z(requires_nccl_version.<locals>.decoratora  s3    4[[- - - - [- Nr4   )	r   r   is_nccl_availabler   r   rm   r'   r$   version)r   msgr   s      r5   requires_nccl_versionr   T  s     %'' 	.=   2
''))G37UZU_UdUlUlUnUnz}  	 	 	 r4   c                  F    t          t          j                     d          S )Nr   )r   r   r   r3   r4   r5   requires_ncclr   k  r   r4   c                  F    t          t          j                     d          S )Nz*c10d was not compiled with the UCC backend)r   r   is_ucc_availabler3   r4   r5   requires_uccr   r  &    )!###4  r4   c                  F    t          t          j                     d          S )Nz*c10d was not compiled with the MPI backend)r   r   is_mpi_availabler3   r4   r5   requires_mpir   y  r   r4   c                 n    | t           } t          d | D                       }t          | d|            S )a  
    Decorator to skip tests if no accelerator communication backend (NCCL, XCCL, HCCL) is available.

    Args:
        backends (Optional[List[str]]): Specific accelerator backends to check (e.g., ["nccl", "xccl", "hccl"]).
                                       If None, checks all supported accelerator backends (NCCL, XCCL, HCCL).

    Returns:
        callable: A decorator that skips the test if no specified accelerator backend is available.
    Nc              3      K   | ]<}	 t          j        t           j        d  d                    |d                       V  =dS )c                      t           S r`   )r   r3   r4   r5   <lambda>z=requires_accelerator_dist_backend.<locals>.<genexpr>.<lambda>  s    H r4   r#   c                      dS NFr3   r3   r4   r5   r   z=requires_accelerator_dist_backend.<locals>.<genexpr>.<lambda>  s    u r4   N)r   r   is_xccl_availabler   ).0r   s     r5   	<genexpr>z4requires_accelerator_dist_backend.<locals>.<genexpr>  sq         	&**$$	
 	
 #g}}
%
%		( 	(     r4   z5No accelerator communication backend available among )ACCELERATOR_DIST_BACKENDSanyr   )backendsbackend_availables     r5   !requires_accelerator_dist_backendr     sb     ,         *JJJ  r4   c                      t           j                                        ot          j        t
          j        d          } t          |  d          S )Nr   z"multicast support is not available)rm   r'   r   r   has_multicast_supportr   CUDAr   )r   s    r5   requires_multicast_supportr     sM    
!! 	G2:?AFF  *!!,  r4   c                 J     d _         t                      fd            }|S )zSkips a test for ROCmTc                  n    t           s | i |S t          j        t          d         j                   d S )NrG   )r   rh   ri   rj   r+   r|   s     r5   rs   z*skip_if_rocm_multiprocess.<locals>.wrapper  s>     	)4((((L)344444r4   )skip_if_rocm_multiprocessr	   ru   s   ` r5   r   r     s;    %)D"
4[[5 5 5 5 [5
 Nr4   c                  >    t          t          j        dk    d          S )Nwin32z8This unit test case is not supported on Windows platform)r   rh   platformr3   r4   r5   skip_if_win32r     s!    )B  r4   rb   majorminorreturnc                     | j         dk    rdS t          j        j        dS t          j                            |           ||fk    S )z
    Returns True if the device's compute capability is (major, minor) or higher.
    Error out if the device is not a CUDA device.
    Returns False if device is a RoCM device.
    Returns True if device is a non-CUDA device.
    r'   TNF)typerm   r   hipr'   get_device_capability)rb   r   r   s      r5   sm_is_or_higher_thanr     sE     {ft}$u:++F33u~EEr4   	localhostr   T   )minutesFc                     t                      }|rHt          |t          d          z            }t          j        j                            | ||||          S t          j        | |||||          S )zL
    Creates a TCP store. Retries if the chosen port is already in use.
    r   )milliseconds)wait_for_workers	use_libuv)r   r0   r   rm   classes	dist_c10dTCPStorer   )	addrrq   	is_mastertimeoutr   	jit_classr  porttimeout_milliseconds	            r5   create_tcp_storer    s     D 
!'I1,E,E,E"EFF}&//$
I/B
 
 	
 }-
 
 
 	
r4   i  !DISTRIBUTED_TESTS_DEFAULT_TIMEOUT300test_ddp_uneven_inputsi     test_join_kwargs	lazy_initc                     t           j        dk    s| !t          j                            d|          S t          j                            | |          S )Nr   z	127.0.0.1)hostnamer  	interfacer  )rh   r   r   ProcessGroupGloocreate_devicer  s     r5   r  r    s`    
|w)"3$22 I 3 
 
 	
 $229 3 
 
 	
r4   c                 t    t                               |                     d          d         t                    S N.r   )TIMEOUT_OVERRIDEr   r   TIMEOUT_DEFAULT)test_ids    r5   get_timeoutr    s)    c 2 22 6HHHr4   c               #   H  K   t                      t                      }} t          j        t          j        }}	 | |ct          _        t          _        t          j        t          j        fV  ||ct          _        t          _        d S # ||ct          _        t          _        w xY wr`   )r
   rh   stdoutstderr)new_outnew_errold_outold_errs       r5   captured_outputr&    s~      zz8::WGz3:WG2!('
CJj#*$$$$!('
CJJJ'
CJ1111s   3B B!rankrq   
num_inputsc           
      D    ddt           dt           dt           dt           fd}dt           fd fd	t          |d
          t          |d
          t          |d
          t          |d          t          |d          t          |d          fD             S )z
    Generate a number of basic test cases for sparse reduction.
    These cover tensors with a varying number of sparse dimensions and a varying
    number of dense dimensions. The only reduction operation we support is sum.
    r   r   r'  rq   sparse_dims
dense_dimsc           	         t          j        t          j        | dz             d| dz   f          }|gd t          |          D             z   }t          |dz
            D ]C}t          j        |t          j        d| dz             f          }|                    |           Dt          j        | dz   gd t          |          D             z             }t          j        |||          S )Nr   c                     g | ]}d S r   r3   r   _s     r5   
<listcomp>z@simple_sparse_reduce_tests.<locals>.generate.<locals>.<listcomp>  s    ===a===r4   c                     g | ]}d S r.  r3   r/  s     r5   r1  z@simple_sparse_reduce_tests.<locals>.generate.<locals>.<listcomp>"  s    )G)G)G!)G)G)Gr4   )	rm   reshapearangerangecatzerosappendonessparse_coo_tensor)r'  rq   r*  r+  indicesshaper0  valuess           r5   generatez,simple_sparse_reduce_tests.<locals>.generate  s     -TAX 6 6D1HFF==5+<+<===={Q'' 	% 	%Ai%+a*B*B CDDGLL$$$$TAXJ)G)GU:5F5F)G)G)GGHH&w>>>r4   c                 n     t          t          j         fdt                    D                       S )Nc                 (    g | ]} |          S r3   r3   )r   r'  fnrq   s     r5   r1  zCsimple_sparse_reduce_tests.<locals>.compute_sum.<locals>.<listcomp>'  s%    NNND22dJ//NNNr4   )r   operatoraddr5  )rA  rq   s   ``r5   compute_sumz/simple_sparse_reduce_tests.<locals>.compute_sum%  s=    LNNNNNE*<M<MNNN
 
 	
r4   c                     g | ]>fd t                    D             fdt                    D             f?S )c                 :    g | ]} z  |z   z            S r3   r3   )r   irA  r(  r'  rq   s     r5   r1  z9simple_sparse_reduce_tests.<locals>.<listcomp>.<listcomp>,  sD        :$q(*z*ABB  r4   c                 .    g | ]} z            S r3   r3   )r   rG  rD  rA  r(  rq   s     r5   r1  z9simple_sparse_reduce_tests.<locals>.<listcomp>.<listcomp>0  s*    QQQ![[Z*455QQQr4   )r5  )r   rA  rD  r(  r'  rq   s    @r5   r1  z.simple_sparse_reduce_tests.<locals>.<listcomp>*  s              z**   RQQQQQQuZ?P?PQQQ	
  r4   )r*  r      )r+  )r   r   )r0   r   )r'  rq   r(  r>  rD  s   ``` @r5   simple_sparse_reduce_testsrJ    s    
? 
?s 
? 
?# 
?s 
? 
? 
? 
?
C 
 
 
 

       H!,,,H!,,,H!,,,H+++H+++H+++
   r4   r   c                 J   t           j                                        }t          rt           j                                        }t
          rt           j                                        }t          |          d| |k    r|| z  fdt          |           D             }|S )zMultigpu tests are designed to simulate the multi nodes with multi
    GPUs on each node. Nccl backend requires equal #GPUs in each process.
    On a single node, all visible GPUs are evenly
    divided to subsets, each process only uses a subset.
    r   c           	      R    i | ]#}|t          |z  |d z   z                     $S r   )list)r   rG  nGPUs_per_processvisible_devicess     r5   
<dictcomp>z(init_multigpu_helper.<locals>.<dictcomp>P  sM        	
4$5 5QBS8S STUU  r4   )rm   r'   rn   r   r[   r   r(   r5  )rq   r   nGPUsrank_to_GPUrO  rP  s       @@r5   init_multigpu_helperrT  >  s     J##%%E )	&&(( )	&&((EllO E!Z/    z""  K r4   tmp_dirinit_methodc                 Z   t          j                    at          j        t          j        d<   t	          j        t          j                            t          j        d                     t	          j        t          j                            t          j        d                     t          j                            t          j        d          }t	          j        |           | | t          j        d<   d S t          t          j                            |d          z   t          j        d<   d S )NTEMP_DIRr   test_dirinit_dirINIT_METHODshared_init_file)
tempfileTemporaryDirectoryrU  namerk   rl   mkdirpathjoinr   )rV  init_dir_paths     r5   initialize_temp_directoriesrd  Z  s    )++G$\BJzHRW\\',	22333HRW\\',
33444GLLz::MH]$/
=!!!$/"',,-3
 3
 %

=!!!r4   c                  J    t           t                                            d S d S r`   )rU  cleanupr3   r4   r5   cleanup_temp_dirrg  k  s&     r4      c            	       Z    e Zd ZdZdZdefdZedefd            Zede	fd            Z
d Z	 dd	ed
eddf fdZd fdZd fdZdefdZddZddZ G d de          Zede	fd            Zede	dededdfd            ZdeddfdZddZddZddZedefd            Z xZS ) MultiProcessTestCaser   
   r   c                     dS r   r3   selfs    r5   _should_stop_test_suitez,MultiProcessTestCase._should_stop_test_suite  s    ur4   c                     dS )NTr3   rm  s    r5   destroy_pg_upon_exitz)MultiProcessTestCase.destroy_pg_upon_exit  s    tr4   c                     t           S r`   DEFAULT_WORLD_SIZErm  s    r5   rq   zMultiProcessTestCase.world_size      !!r4   c                 b    t                    fd            }t          j        ||           S )Nc                 j    | j         | j        k    r|                                d S               d S r`   )r'  MAIN_PROCESS_RANK_join_processesrn  rA  s    r5   rs   z1MultiProcessTestCase.join_or_run.<locals>.wrapper  s<    yD222$$R(((((r4   r	   types
MethodTypern  rA  rs   s    ` r5   join_or_runz MultiProcessTestCase.join_or_run  A    	r	 	 	 	 
	 ...r4   runTestmethod_name
methodNameNc                 6   |dk    r|}t                                          |           	 t          | |          }t          | ||                     |                     d S # t
          $ r,}|dk    rt          d| j         d|           |Y d }~d S d }~ww xY wNr  zno such test method in : super__init__getattrsetattrr  AttributeError
ValueError	__class__rn  r  r  rA  er  s        r5   r  zMultiProcessTestCase.__init__      
 ""$K%%%		{++BD+t'7'7';';<<<<< 	 	 	Y&& !LdnLL
LL  '&&&&&	   4A" "
B,!BBc                     t                                                       i | _        g | _        g | _        | j        | _        t          j        d          j	        | _
        i | _        d S )NFdelete)r  setUpspecial_return_code_checksskip_return_code_checks	processesrx  r'  r]  NamedTemporaryFiler_  	file_namepid_to_pipern  r  s    r5   r  zMultiProcessTestCase.setUp  s_     13' .0$*	!4EBBBGr4   c                     t                                                       | j        D ]}|                                 g | _        d S r`   )r  tearDownr  	terminate)rn  pr  s     r5   r  zMultiProcessTestCase.tearDown  sF     	 	AKKMMMM
 r4   c                 \    |                                                      d          d         S r  idr   rm  s    r5   _current_test_namez'MultiProcessTestCase._current_test_name  s!    wwyys##B''r4   c                    g | _         t          t          | j                            D ]}t          j                                        \  }} || j        j        dt          |          z   || 
                                | j        |fdt          | dd          i          }|                                 t                              d||j                   || j        |j        <   | j                             |           d S )Nprocess fake_pgF)targetr_  ro   rp   Started process %s with pid %s)r  r5  r0   rq   rm   multiprocessingPiper  _runr2   r  r  r  startloggerinfopidr  r8  )rn  procr'  parent_conn
child_connprocesss         r5   _start_processesz%MultiProcessTestCase._start_processes  s    #do..// 	+ 	+D&+&;&@&@&B&B#Kd~*#d))+D3355t~zRwtY>>	  G MMOOOKK8$LLL,7DW[)N!!'****	+ 	+r4   c                     	 t           j                            d           n# t          $ r Y nw xY wt           j                            d          j        }|                     |           d S )Nspawn)rm   r  set_start_methodRuntimeErrorget_contextProcessr  )rn  r  s     r5   _spawn_processesz%MultiProcessTestCase._spawn_processes  sv    	!227;;;; 	 	 	D	 $0099Ad#####s   " 
//c                       e Zd ZdZdS )MultiProcessTestCase.Eventr   N)r-   r.   r/   GET_TRACEBACKr3   r4   r5   Eventr    s        r4   r  r'  c                    t                               d|           	 t          j                            | |g          }| |v r| j        rt                               d|           d S |                                 }t                               d||           |t          j	        j
        k    rt          j        d          5 }t          j        |           |                                 |                    d           |                     |                                           t                               d|           d d d            n# 1 swxY w Y   ||v rd S C)	Nz*Starting event listener thread for rank %sTz:Pipe closed for process %s, stopping event listener threadzReceived event %s on process %szr+)moder   zProcess %s sent traceback)r  debugr  
connectionwaitclosedrecvr  rj  r  r  r]  r  faulthandlerdump_tracebackflushseeksendread)parent_pipesignal_piper'  ready_pipeseventtmp_files         r5   _event_listenerz$MultiProcessTestCase._event_listener  s   A4HHH	)499;:TUUKk))% LLT   F#((**=udKKK06DDD!4$??? G8$3H=== ((( a(((#((999$?FFFG G G G G G G G G G G G G G G k))5	s   B EEE	test_namer  c                 d     | |          }||_         ||_        |                    ||           d S r`   )r'  r  run_testclsr'  r  r  r  rp   rn  s          r5   r  zMultiProcessTestCase._run  s9     s9~~	"i-----r4   c           	         t           j                            d          \  }}t          j        t
          j        ||| j        fd          }|                                 t          j
        dk    r/t          j
        dk    rt           j                            d           dt          j        d<   	  t          | |                       n# t           j        $ r]}t$                              d	| j        |t)          |                     t          j        t,          d
         j                   Y d }~nd }~wt0          $ r t$                              dt5          j                    | j        t
          j                   |                    t5          j                               t          j        t
          j                   Y nw xY w||                    d            |J |                                 |                                 nH# ||                    d            |J |                                 |                                 w xY w| j         r.	 tC          j"                     d S # tF          tH          f$ r Y d S w xY wd S )NF)duplexT)r  ro   daemonr   darwinr   TORCH_SHOW_CPP_STACKTRACESz4Process %s skipping test %s for following reason: %srK   z;Caught exception: 
%s exiting process %s with exit code: %s)%rm   r  r  	threadingThreadrj  r  r'  r  rh   r   _C'_set_print_stack_traces_on_fatal_signalrk   rl   r  unittestSkipTestr  r  r2   ri   rj   r+   	Exceptionr   	traceback
format_excTEST_ERROR_EXIT_CODEr  rb  closerq  r   destroy_process_groupAssertionErrorr  )rn  r  r  signal_recv_pipesignal_send_pipeevent_listener_threadses          r5   r  zMultiProcessTestCase.run_test  s   -2-B-G-Gu-G-U-U** ) 0'7/;!
 !
 !

 	##%%%<7""s|x'?'? H<<TBBB36
/0	 $GD)$$&&&&  	6 	6 	6KKF	B	   HZ	*455555555 		@ 		@ 		@LLN$&&	$9	   Y133444H)>?????		@  + %%d+++(444!&&(((  + %%d+++(444!&&((($ 	 *,,,,,"J/   	 	sQ   0C	 H 	GAD0+H 0BG>H  GH AII- -JJc                    g }t          | j                  D ]\  }}|j        | j        |j                 }	 |                    t          j        j                   |	                    ||f           [# t          $ r&}t                              d||           Y d }~d }~ww xY w|D ]\  }}	 |                    d          rT|j        rt                              d|           >|                                }t                              d||           nt                              d|           # t          $ r&}t                              d||           Y d }~d }~ww xY wd S )NzBEncountered error while trying to get traceback for process %s: %sr   z5Pipe closed for process %s, cannot retrieve tracebackz)Process %s timed out with traceback: 

%sz6Could not retrieve traceback for timed out process: %s)	enumerater  exitcoder  r  r  rj  r  r  r8  ConnectionErrorr  r   pollr  r  r  )rn  pipesrG  r  piper  r'  r  s           r5   _get_timedout_process_tracebackz4MultiProcessTestCase._get_timedout_process_tracebackT  s   #DN33 	 	JAw''4II28FGGGLL!T++++&   LL\        (   	 	JD$99Q<< { !S    ! $		ILLEtY    LLPRV   #   X       '	 	s6   ;A22
B"<BB".7D3&AD33
E#=EE#c                    t          |                                           }t          j                    }d}	 	 t          | j                  D ]p\  }}|j        t          j        k    rVt          d| d|j         d           t          j
                                        }|D ]}|                                 d} nq|rnt          d | j        D                       rnxt          j                    |z
  }	|	|k    rF|                                  t          d| d           | j        D ]}|                                 nt          j        d	           t          j                    |z
  }
|                     ||
           | j                                        D ]}|                                 d S # | j                                        D ]}|                                 w xY w)
NFTProcess z terminated with exit code z", terminating remaining processes.c              3   (   K   | ]}|j         d uV  d S r`   )r  )r   r  s     r5   r   z7MultiProcessTestCase._join_processes.<locals>.<genexpr>  s)      FF!qz-FFFFFFr4   zTiming out after z" seconds and killing subprocesses.g?)r  r  timer  r  r  rj  r  printrm   r  active_childrenr  allr  sleep_check_return_codesr  r=  r  )rn  rA  r  
start_timesubprocess_errorrG  r  r  acelapsedelapsed_timer  s               r5   ry  z$MultiProcessTestCase._join_processes~  s   dggii((Y[[
 &	 %dn55  DAq z%9%NNNsqssQZsss   +0*?*O*O*Q*Q"1 + +BLLNNNN+/( O $ FFt~FFFFF )++
2W$$88:::WGWWW   "^ & &
3= @  9;;3L$$R666 (//11  

 (//11  

s   EF6 62G(c           
      j   | j         st                              d           dS | j         d         }d t          | j                   D             }|rVd}|D ]B\  }}| j        |j                                                 }|d| dt          j         d| d	z  }Ct          |          t          | j                   D ]"\  }}	|	j
        t          d| d
| d          #|| j        v rdS t                                          D ]h}
|j
        |
j        k    rVt          r6t                              d|                                 |
j                    dS t'          j        |
j                  id}|| j        v r| j        |         }|                     |j
        |d| d|j
         d|j                    dS )z
        Checks that the return codes of all spawned processes match, and skips
        tests if they returned a return code indicating a skipping condition.
        z<Note: no subprocesses were spawned, test was likely skipped.Nr   c                 F    g | ]\  }}|j         t          j        k    ||fS r3   )r  rj  r  )r   rG  r  s      r5   r1  z<MultiProcessTestCase._check_return_codes.<locals>.<listcomp>  s;     
 
 
1z1FFF FFFFr4    r  z exited with error code z and exception:

 terminated or timed out after  seconds6Skipping %s on sandcastle for the following reason: %szExpected exit code z	 but got z
 for pid: )r   )r  r  warningr  r  r  r  rj  r  r  r  r  rj   r=  r+   r   r  r  r,   r  r  r  assertEqual)rn  rA  r  first_processerrored_processesr   rG  r  error_messager  skipexpected_return_codes               r5   r   z(MultiProcessTestCase._check_return_codes  sP    ~ 	NNN   Fq)
 
!$.11
 
 

  
	&E/  
7 $ 0 = B B D D9q 9 9:N:c 9 9'49 9 9
 u%%% dn-- 	 	DAqz!"WqWWWWW   " ---F%%'' 	: 	:D%77  :
 KKP		  
 FF"+DL999 8   ! 000#'#B2#F " z&:zz]E[zzgtgxzz 	 	
 	
 	
 	
 	
r4   c                     | j         dk    S )Nr   r'  rm  s    r5   r  zMultiProcessTestCase.is_master  s    yA~r4   r  r  r   N)r-   r.   r/   rx  r  boolro  propertyrq  r0   rq   r  r2   r  r  r  r  r  r  r   r  staticmethodr  classmethodr  r  r  ry  r   r  __classcell__r  s   @r5   rj  rj    s       
      d    X "C " " " X"/ / / ?H 8;	     &     "     (C ( ( ( (+ + + +"$ $ $ $            \< ..#&.36.	. . . [.5# 5t 5 5 5 5n( ( ( (T* * * *XJ
 J
 J
 J
X 4    X    r4   rj  c                   >     e Zd Z fdZd ZdefdZddZd Z xZ	S )	DistributedTestBasec                     t                                                       t          | j                  t          j        d<   |                                  d S )Nrf   )r  r  r2   rq   rk   rl   r  r  s    r5   r  zDistributedTestBase.setUp  sB    #&t#7#7
< r4   c                     	 t           j                                         n# t          $ r Y nw xY w	 t	          j        | j                   d S # t          $ r Y d S w xY wr`   )rm   distributedr  r  rk   remover  OSErrorrm  s    r5   r  zDistributedTestBase.tearDown  s~    	335555 	 	 	D		Idn%%%%% 	 	 	DD	s   ! 
..A 
AAr   c                 *    d|v rdS d|v rdS d|v rdS dS )Nr'   r$   r[   r&   r(   r%   rV   r3   )rn  rb   s     r5   r   zDistributedTestBase.backend  s3    V6f__6f__66r4   Nc                 
   || j         }t          j        |                                          }t          j                            | j        |          }t          j                            |                     |          || j	        |           d|                     |          v sd|                     |          v r$t          j
                            | j	                   t          j        j                                        S )Nr   rq   r'  storer$   r%   )rq   rm   get_device_modulern   r"  	FileStorer  init_process_groupr   r'  acceleratorset_device_indexdistributed_c10d_get_default_group)rn  rb   rq   num_visible_devicesr(  s        r5   	create_pgzDistributedTestBase.create_pg  s    J#5f==JJLL!++DN<OPP,,LL((!	 	- 	
 	
 	
 T\\&))))Vt||F7K7K-K-K..ty999 1DDFFFr4   c                     t          j        |                                          fdt          | j                  D             S )Nc                     i | ]	}||z  g
S r3   r3   )r   rG  r0  s     r5   rQ  z6DistributedTestBase.rank_to_device.<locals>.<dictcomp>+  s$    MMMA++,MMMr4   )rm   r)  rn   r5  rq   )rn  rb   r0  s     @r5   rank_to_devicez"DistributedTestBase.rank_to_device)  sD    #5f==JJLLMMMMeDO6L6LMMMMr4   r`   )
r-   r.   r/   r  r  r2   r   r1  r4  r  r  s   @r5   r  r     s                 
      G G G GN N N N N N Nr4   r  subtest_configtest_fntest_kwargsc                    t          |                                          }d |D             }d |D             }t          j        | D ]}t	          t          ||                    }	 | j        di |	5  t          j        	                                  ||i ||	 t          j        	                                 ddd           n# 1 swxY w Y   t          j                     dS )a\  
    Runs a test function given by ``test_fn`` as a subtest according to the
    configurations specified by ``subtest_config``. This amortizes the
    costly setup overhead (including process spawn and initializing the
    process group) over the subtests.

    Args:
        subtest_config (Dict[str, List[Any]]): A mapping from subtest
            keyword argument name to a list of its possible values.
        test_fn (Callable): A callable that runs the actual test.
        test_args: Positional arguments to pass to ``test_fn``.
        test_kwargs: Keyword arguments to pass to ``test_fn``.
    c                     g | ]
}|d          S )r   r3   r   items     r5   r1  z run_subtests.<locals>.<listcomp>D  s    %O%O%O$d1g%O%O%Or4   c                     g | ]
}|d          S rM  r3   r:  s     r5   r1  z run_subtests.<locals>.<listcomp>E  s    -W-W-W$d1g-W-W-Wr4   Nr3   )rN  items	itertoolsproductdictzipsubTestrm   _dynamoresetr   r   )
cls_instr5  r6  	test_argsr7  subtest_config_itemssubtest_config_keyssubtest_config_valuesr=  subtest_kwargss
             r5   run_subtestsrK  .  sA   * 9=^=Q=Q=S=S8T8T%O%O:N%O%O%O-W-WBV-W-W-W#%:;  c"5v>>??X//// 	" 	"M!!!GY@+@@@@M!!!	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	 s   4ACC	C	c                      t           t           S 	 t          j        g dd          j        dk    a n# t          $ r da Y nw xY wt           S )a   
    If shell command `fi_info -p efa -t FI_EP_RDM` returns exit code 0 then we assume that the machine has
    Libfabric EFA interfaces and EFA software components installed,
    see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html.
    N)fi_infoz-pefaz-t	FI_EP_RDMF)checkr   )EFA_PROBE_RESULT
subprocessrun
returncodeFileNotFoundErrorr3   r4   r5   has_efarV  T  sv     #!N;;;5   	  ! ! ! !s   !2 A Ac                  *    t                      rddgndS )a  
    If the machine has Libfabric EFA interfaces and EFA software components installed it may cause
    'RuntimeError: In operator() at tensorpipe/common/ibv.h:172 "": Operation not supported' if tensorpipe
    uses InfiniBand transport, so we exclude it from tensorpipe transports,
    see https://github.com/pytorch/pytorch/issues/73885 and https://github.com/pytorch/pytorch/issues/65022
    shmuvN)rV  r3   r4   r5   tp_transportsrZ  j  s     $II/E4==4/r4   c                 |      t          t          |          S d t                      fd            }|S )z+
    Wrapper to use with a test method
    N)r  rq   c                     t                      t          j                    }fd fd}g }t                     D ]D}t	          j        |||f          }|                                 |                    |           E|S )Nc                  .     t           j        j        k    S r`   r   r.  _worldworlds   r5   world_is_validzaspawn_threads_and_init_comms.<locals>._run_test_method_with_multi_threads.<locals>.world_is_valid      D1888r4   c                    t          j        d| |           	               n]# t          $ rP}t          j                            | t          j                    f           t          j	        |           Y d }~nd }~ww xY w             rt          j
                     d S d S #              rt          j
                     w w xY w)Nthreadedr   r'  rq   r(  )r   r+  BaseExceptionMultiThreadedTestCaseexception_queueputrh   exc_infor"   exception_handler  )r'  world_pgr(  excallbackrb  rq   s       r5   workerzYspawn_threads_and_init_comms.<locals>._run_test_method_with_multi_threads.<locals>.worker  s   #"*E   
1



    %5994:PQQQ!2        ">## 1.000001 1>>## 1.00001s,   
& B$ 
B AA;6B$ ;B  B$ $ Cr  ro   )r    r   	HashStorer5  r  r  r  r8  )	rq   ro  global_storerp  threadsr'  tra  rb  s	   ``     @@r5   #_run_test_method_with_multi_threadszIspawn_threads_and_init_comms.<locals>._run_test_method_with_multi_threads  s    $&&~''	9 	9 	9 	9 	9	1 	1 	1 	1 	1 	1 	1  *%% 	 	D dE<5PQQQAGGIIINN1r4   c                 L    t           j        j                            d           	   fd          }t                              |           t           j        j                            d           d S # t           j        j                            d           w xY w)NTc                       g R i S r`   r3   )ro   rr   rp   rn  s   r5   r   z?spawn_threads_and_init_comms.<locals>.wrapper.<locals>.<lambda>  s!    DD$?$?$?$?$?$? r4   F)rm   r  _distributed_c10d_set_thread_isolation_moderh  _join_threads)rn  ro   rp   rt  rv  rr   rq   s   ``` r5   rs   z-spawn_threads_and_init_comms.<locals>.wrapper  s     	"==dCCC	I99??????? G "//>>>H&AA%HHHHHEH&AA%HHHHs   -A= =&B#)r   spawn_threads_and_init_commsr	   )rr   r  rq   rs   rv  s   ` ` @r5   r|  r|  t  s|     |('j
 
 
 	
  > 4[[
I 
I 
I 
I 
I 
I [
I Nr4   c                        e Zd ZdZ ej                    ZdZd Z	 dde	de	ddf fd	Z
d
 Zd Zd fdZ fdZd Zed             Zd Zed             Zed             Zedefd            Zede	fd            ZddddZddddZ xZS )rh  a5  
    Test runner that runs all tests with the in-proc process group using
    multiple threads with the threaded process group.

    Each test spawns world_size threads and run the test method in each thread.

    Difference from regular MultiProcess test runner:
    Must explicitly defines SetUp and call self._spawn_threads() to run the tests.
    Cannot use setUp / tearDown (must use perThreadSetup / perThreadShutdown)
        to set up / tear down each thread when running each test.
    No global state possible
        How bad of a limitation is this?
    r   c                 b    t                    fd            }t          j        ||           S )Nc                 v    | j         | j        k    r|                     | j                   d S               d S r`   )r'  MAIN_THREAD_RANKr{  rt  rz  s    r5   rs   z2MultiThreadedTestCase.join_or_run.<locals>.wrapper  s@    yD111""4<44444r4   r{  r~  s    ` r5   r  z!MultiThreadedTestCase.join_or_run  r  r4   r  r  r  r   Nc                 6   |dk    r|}t                                          |           	 t          | |          }t          | ||                     |                     d S # t
          $ r,}|dk    rt          d| j         d|           |Y d }~d S d }~ww xY wr  r  r  s        r5   r  zMultiThreadedTestCase.__init__  r  r  c                     d S r`   r3   rm  s    r5   perThreadSetUpz$MultiThreadedTestCase.perThreadSetUp  s    r4   c                     d S r`   r3   rm  s    r5   perThreadTearDownz'MultiThreadedTestCase.perThreadTearDown  s    r4   c                     t                                                       | j        | _        g | _        dt
          j        d<   dS )z
        setUp only set up things in the main thread, if you want to configure things
        in the spawned threads, use perThreadSetUp
        r   r  N)r  r  r  r'  rt  rk   rl   r  s    r5   r  zMultiThreadedTestCase.setUp  s:    
 	)	36
/000r4   c                 V    t                                                       g | _        dS )z
        tearDown only set up things in the main thread, if you want to configure things
        in the spawned threads, use perThreadTearDown
        N)r  r  rt  r  s    r5   r  zMultiThreadedTestCase.tearDown  s&    
 	r4   c                    t           j        j                            d           | j        }t                      t          j                    | j        _	        fd} |            st          d          t          | j                  D ]X}t          j        | j        j        ||| j        f          }|                                 | j                            |           YdS )zk
        class method to spawn threads and run test, use this method in the SetUp of your TestCase
        Tc                  .     t           j        j        k    S r`   r^  r`  s   r5   rb  z<MultiThreadedTestCase._spawn_threads.<locals>.world_is_valid  rc  r4   zInvalid worldrq  N)rm   r  ry  rz  r  r    r   rr  r  rs  r  r5  rq   r  r  r  r  rt  r8  )rn  r  rb  r'  ru  ra  s        @r5   _spawn_threadsz$MultiThreadedTestCase._spawn_threads  s     	"==dCCC+	$&&&*n&6&6#	9 	9 	9 	9 	9 ~ 	0///$/** 	# 	#D ~*)T4?1S  A GGIIIL""""	# 	#r4   c                      | |          }||_         t          |d          rDt          j                    |_        t
          j        |j        _        t
          j        |j        _	        |
                    |||           d S )N_tls)r'  hasattrr  localr  r   
_precision	precision_rel_tolrel_tolrun_test_with_threaded_pg)r  r  r'  rq   rp   rn  s         r5   r  zMultiThreadedTestCase._run  sq    s9~~	 4   	2!))DI"*"5DI ( 1DI&&y$
CCCCCr4   c                    t          j        d||| j        j                   |                                  	  t          | |                       nX# t          $ rK}| j                            |t          j
                    f           t          j        |           Y d}~nd}~ww xY wt          j                     |                                  dS # t          j                     |                                  w xY w)zd
        Run the current test associated with `test_name` using the threaded process group.
        re  rf  N)r   r+  r  rs  r  r  rg  ri  rj  rh   rk  r"   rl  r  r  )rn  r  r'  rq   rn  s        r5   r  z/MultiThreadedTestCase.run_test_with_threaded_pg  s(    	!.-		
 	
 	
 	
 			%$GD)$$&&&& 	 	 	 $$dCLNN%;<<<.       	 &(((""$$$$$ &(((""$$$$s0   A C 
B&AB!C !B&&C )C;c           
         t           }	 t          |          D ]v\  }}|                    t          d|                     |                                r:t
          j                            |t          t          d| d          d ff           wt          j
                     g }| j                                        sG| j                                        }|                    |           | j                                        Gt                       t          j        j                            d           n7# t                       t          j        j                            d           w xY w|                     |||           d S )Nr   zRank failed to join in under r  F)r  r  rb  maxis_aliverh  ri  rj  TimeoutErrorr"   rD  emptyr   r8  r!   rm   r  ry  rz  r   )r  rt  rA  r  idxthreadfailed_ranksfailures           r5   r{  z#MultiThreadedTestCase._join_threads6  s   !	I(11  VC7OO,,,??$$ )9== , ,$UG$U$U$U!" !" !%	   #%%%L)//11 --1133##G,,, )//11 - #$$$H&AA%HHHH #$$$H&AA%HHHHgr:::::s   C;D7 74E+c           	         d}d}|D ]Q\  }}|d         }t          |t          j                  rCt                              d||t          |                     |dk     rt          d         j        }kt          |t                    r2d| d| d	}	t          	                    |	           t          |	          t          |t                    rKd                    t          j        |           }	t          	                    d
|	|           |d| d|	 dz  }t          |t                    r*t!          |j                  t$          k    r|dk     r|j        }St'          |          dk    rt          |          |dk    rkt                                          D ]S}
||
j        k    rDt*          r$t                              d||
j                    d S t          j        |
j                  Rd S d S )Nr  r   r   z3Thread %s skipping test %s for following reason: %sr   rK   zThread r
  z	 seconds
z'Caught exception: 
%s exiting thread %sz exited with exception:
r	  r  )
isinstancer  r  r  r  r2   rj   r+   r  r   r  r  rb  r  format_exception
SystemExitr   coder0   lenr=  r   r,   )r  r  r  rA  	error_msg	skip_coder'  rk  excr   r  s              r5   r   z)MultiThreadedTestCase._check_return_codesT  s    		* 	) 	)ND(1+C#x011 )IHH	   q== *9 5 ?IC.. 
)XXXWXXXS!!!"3'''C++ )ggi8(CDDGdSSSMtMMcMMMM		C,, )>>S((Y]] #I y>>Ay)))q=="))++ > >..$ 	>T L  
 &/=== / => >r4   c                     t           S r`   rs  rm  s    r5   rq   z MultiThreadedTestCase.world_size  ru  r4   c                 \    |                                                      d          d         S r  r  rm  s    r5   r  z(MultiThreadedTestCase._current_test_name  s#     wwyys##B''r4   r   r  c                N    | j         |k    r|                     |||           dS dS )z
        The reason why we have this util function instead of
        self.assertEqual is all threads are sharing one CPU RNG
        so the assertion result is only reliable on rank 0
        N)r'  r  rn  r   yr   r'  s        r5   assertEqualOnRankz'MultiThreadedTestCase.assertEqualOnRank  s7     9Q3''''' r4   c                L    | j         |k    r|                     ||           d S d S r`   )r'  assertNotEqualr  s        r5   assertNotEqualOnRankz*MultiThreadedTestCase.assertNotEqualOnRank  s3    91%%%%% r4   r  r  r`   )r-   r.   r/   __doc__queueQueueri  r  r  r2   r  r  r  r  r  r  r  r  r  r{  r   r  r0   rq   r  r  r  r  r  s   @r5   rh  rh    s         "ekmmO/ / / ?H 8;	     &    	7 	7 	7 	7 	7 	7    # # #. D D [D% % %. ; ; [;: /> /> [/>b "C " " " X" (C ( ( ( X(( ( ( ( ( (&1 & & & & & & & & &r4   rh  c                   t     e Zd Zdeej        ej        f         deddf fdZ	dej        dej        fdZ
 xZS )SaveForwardInputsModuleforward_inputscast_forward_inputsr   Nc                     t                                                       t          j        dd          | _        || _        || _        d S )Nd   )r  r  nnLinearlr  r  rn  r  r  r  s      r5   r  z SaveForwardInputsModule.__init__  sD    
 	3$$,#6   r4   r   c                     || j         | <   |                     | j        r$|                    | j        j        j                  n|          S r`   )r  r  r  toweightdtypern  r   s     r5   forwardzSaveForwardInputsModule.forward  sB    $%D!vv43KRadd46=.///QRSSSr4   r-   r.   r/   r@  r  Modulerm   Tensorr  r  r  r  r  s   @r5   r  r    s        7RY457 "7 
	7 7 7 7 7 7T T%, T T T T T T T Tr4   r  c                   t     e Zd Zdeej        ej        f         deddf fdZ	dej        dej        fdZ
 xZS )SaveForwardInputsModelr  r  r   Nc                     t                                                       t          ||          | _        t          ||          | _        || _        d S r`   )r  r  r  c1c2r  r  s      r5   r  zSaveForwardInputsModel.__init__  sM    
 	).:MNN).:MNN,r4   r   c                 f    || j         | <   |                     |                     |                    S r`   )r  r  r  r  s     r5   r  zSaveForwardInputsModel.forward  s+    $%D!wwtwwqzz"""r4   r  r  s   @r5   r  r    s        -RY45- "- 
	- - - - - -# #%, # # # # # # # #r4   r  c              #     K   |st           j                            |            t           j                                        x}r|j        nd}|t          j        |          }dt          j        d<   dt          j        d<   |r_|rFt           j	        j
        j        j                                        }t          j        d|| |           nt          j        || |           t           j                                         t           j        j        j                                         	 d V  t           j                                         t           j        j        j                                         |rt          j                     d S d S # t           j                                         t           j        j        j                                         |rt          j                     w w xY w)	Ncpur   MASTER_ADDR6789MASTER_PORTfaker'  )r   r'  rq   )rm   r,  r-  current_acceleratorr   r   get_default_backend_for_devicerk   rl   testing	_internalr"  r  	FakeStorer+  rC  rD  utilscountersclearr  )r'  rq   r   init_pgr  accdevice_typer(  s           r5   _dynamo_dist_per_rank_initr    s       1**4000 "-AACCCSO%  5kBB +BJ} &BJ} 
W 		WM+7?IIKKE#%	     #G$:VVVV	M	M &&((()$**,,, 	)&(((((	) 	) 	$**,,, 	)&((((	)s   )F AG*c                   L     e Zd ZdZe fd            Ze fd            Z xZS )#DynamoDistributedSingleProcTestCasez
    Test harness for single-process dynamo distributed tests,
    initializes dist process group.

    Prefer this for simple tests, as it's easier to debug.
    c                    t                                                       | j                            t	          j        t          j        ddd                     d| _        t          j
                                        j        }| d| j         | _        || j        v rd n| j        g| _        t          j        t          j        |          | j        d           d S )Nr   12355)r  r  r   :r   )r'  rq   )r  
setUpClass_exit_stackenter_contextr   r@  rk   rl   r'  rm   r,  r  r   rb   
device_idsr   r+  r  )r  rb   r  s     r5   r  z.DynamoDistributedSingleProcTestCase.setUpClass  s    %%J
#.#*  	
 	
 	
 "6688=++++
!'3:!5!5CH:/77chST	
 	
 	
 	
 	
 	
r4   c                 n    t          j                     t                                                       d S r`   )r   r  r  tearDownClass)r  r  s    r5   r  z1DynamoDistributedSingleProcTestCase.tearDownClass  s.    "$$$r4   )r-   r.   r/   r  r  r  r  r  r  s   @r5   r  r    ss          
 
 
 
 [
(         [         r4   r  c            	       V    e Zd ZdZedefd            Zededededdfd            Z	dS )	"DynamoDistributedMultiProcTestCasea   
    Use this for tests that actually run on multiple GPUs.

    Decorate tests with @skip_if_lt_x_gpu(ngpu)

    Note: MultiProcTestCase spawns processes per test and is slow.
    Prefer MultiThreadedTestCase for most tests. Perhaps use this one
    sparingly for integration tests.
    r   c                 >    t           j                                        S r`   )rm   r,  rn   rm  s    r5   rq   z-DynamoDistributedMultiProcTestCase.world_size  s     --///r4   r'  r  r  Nc                     t          j        t          j                                | |          }||_        ||_        |                    ||           d S r`   )r   
addHandlerloggingNullHandlerr'  r  r  r  s          r5   r  z'DynamoDistributedMultiProcTestCase._run  sU     	W022333 s9~~	"i-----r4   )
r-   r.   r/   r  r  r0   rq   r  r2   r  r3   r4   r5   r  r    s          0C 0 0 0 X0 	.	.#&	.36	.		. 	. 	. [	. 	. 	.r4   r  c                       e Zd ZU dZdZeed<   dZeed<   dZe	e
         ed<    ed          Zeed	<   d
Zeed<   ede	e
         fd            Zede
fd            Zedd            Zed             Zede
ddfd            Zed             Zedd            Ze fd            Ze fd            Zd fdZd Z	 dde
de
ddf fdZ xZS ) MultiProcContinuousTestr   rq   r'  N	rdvz_filex   )secondsr  Fpoison_pillr   c                     dS )z
        ProcessGroup backend str.
        To be customized by sub test classes, e.g. "nccl".
        Otherwise we return None -- lazily decided by tensor.
        Nr3   )r  s    r5   backend_strz#MultiProcContinuousTest.backend_str1  	     tr4   c                 T    t           j                                        }|dS |j        S )Nr  )rm   r,  r  r   )r  curr_devices     r5   r  z#MultiProcContinuousTest.device_type;  s*    ';;==5r4   c                     dS )z
        ProcessGroup init options.
        To be customized by sub test classes, e.g. ProcessGroupNCCLOpTest
        Here we return None.
        Nr3   )r  high_priority_streams     r5   optszMultiProcContinuousTest.optsB  r  r4   c                 <   |J t          |          t          j        d<   t          j        ||          }t          j        |                                 ||||                                 | j                   t          j	        
                                | _        d S )N
LOCAL_RANK)r   rq   r'  r(  
pg_optionsr  )r2   rk   rl   r   r*  r+  r  r  r  r.  r/  pg)r  r'  rq   r  r(  s        r5   _init_pgz MultiProcContinuousTest._init_pgK  s    $$$ $'t99
< y*55OO%%!xxzzK	
 	
 	
 	
 &99;;r4   r  c                     |                     dd          d         } | |          }| j        |_        | j        |_        t          ||          } |di | d S )Nr  r   )maxsplitr   r3   )rsplitr'  rq   r  )r  r  rp   r  rn  r6  s         r5   _run_test_given_idz*MultiProcContinuousTest._run_test_given_id^  sh     NN3N33B7	s9~~H	.$	**&r4   c                    d}d|cxk    r|k     sn J || _         || _        |                     |||           t                              d           	 |                                }t                              d|            |n	 |                     |           |                    |           n{# t          $ rn}d}t          j
                    }	d                    t          j        |	           }
t          d|
           }||_        |                    |           Y d }~nd }~ww xY wt                              d           |st!          j                     d S d S )	NFr   zSetup completeTz	Got test r  zException in worker process:
zTerminating ...)r'  rq   r  r  r  r   r  rj  rg  rh   rk  rb  r  r  r  	__cause__r   r  )r  r'  rq   r  
task_queuecompletion_queueraised_exceptionr  rn  rk  tb_strenhanced_exs               r5   _worker_loopz$MultiProcContinuousTest._worker_loopj  s    D%%%%:%%%%%%# 	T:y111 	%&&&	2 nn&&GLL.W..///2&&w/// $$W----  2 2 2#' <>>!;X!FGG*+TF+T+TUU(*% $$[111111112	2, 	&'''
   	)&(((((	) 	)s   	*B4 4
D,>A$D''D,c                    g | _         g | _        g | _        t          j        d          j        | _        	 t          j        	                    d           n# t          $ r Y nw xY wt          t          |                    D ]}t          j                                        }t          j                                        }t          j                            | j        dt!          |          z   d||| j        ||f          }|                                 | j                             |           | j                            |           | j                            |           t&                              d||j                   d S )NFr  r  r  T)r  r_  r  ro   r  )r  task_queuescompletion_queuesr]  r  r_  r  rm   r  r  r  r5  r0   r  r  r  r2   r  r8  r  r  r  )r  rq   r'  r
  r  r  s         r5   r  z(MultiProcContinuousTest._spawn_processes  sy    " 35AAAF	!227;;;; 	 	 	D	 #j//** 	N 	ND.4466J$4::<<+33'#d))+JzCST	 4  G MMOOOM  )))O"":...!(()9:::LL94MMMM	N 	Ns   A 
A#"A#c                    t                                                       |                                 }| j        dk    rNt	          j        |                                          | _        | j        dk    rt          j        d| d          t          
                    d| j         d| j         d|            |                     | j                   dS )	z
        Class-scope test fixture. Run once for entire test class, before any test starts.
        Set up the process group.
        r  r   zNo z devices availablezTesting class z on  N)r  r  r  rq   rm   r)  rn   r  r  r  r  r-   r  )r  r  r  s     r5   r  z"MultiProcContinuousTest.setUpClass  s     	 oo''>R"4[AANNPPCN~""'(Mk(M(M(MNNNMS\MMs~MMMM	
 	
 	
 	S^,,,,,r4   c                    t                               d| j         d           | j        D ]}|                    d           | j        D ]}|                                 	 t          j        | j	                   n# t          $ r Y nw xY wt                               d| j         d           t                                                       dS )z
        Class-scope test fixture. Run once for entire test class, after all tests finish.
        Tear down the process group.
        zJoining z workersNzClass z	 finished)r  r  rq   r  rj  r  rb  rk   r#  r  r$  r  r-   r  r  )r  r
  r  r  s      r5   r  z%MultiProcContinuousTest.tearDownClass  s     	8888999/ 	! 	!JNN4     } 	 	GLLNNNN	Icm$$$$ 	 	 	D	 	4S\444555s   #A= =
B
	B
c                    t                                                       | j        | _        | j        j        r)t          j        d|                                            t          | j
                  D ]^\  }}t                              d| d|                                             |                    |                                            _dS )z5
        Test fixture. Run before each test.
        zPrevious test failed, skipping zSending Rank r  N)r  r  rx  r'  r  r  r  r  r  r  r  r  r  rj  )rn  rG  r
  r  s      r5   r  zMultiProcContinuousTest.setUp  s     	 *	 >% 	S#$Qdggii$Q$QRRR 't'788 	& 	&MAzLL999dggii99:::NN47799%%%%	& 	&r4   c                 b    t                    fd            }t          j        ||           S )Nc           	      X   | j         | j        k    rt                              d|                                             t          | j                  D ]\  }}|                                }t          |t                    rMt          
                    d| d|                                  d| j        j                    d| j        _        |||                                 k    sJ t                              d| d|                                             d S               d S )NzWaiting for workers to finish zDetected failure from Rank z in: z(, skipping rest of tests in Test class: TzMain proc detected rank z
 finished )r'  rx  r  r  r  r  r  r   r  rg  r  r  r-   r  )rn  rG  r  rvrA  s       r5   rs   z>MultiProcContinuousTest._worker_run_main_wait.<locals>.wrapper  sF   yD222IdggiiIIJJJ+4T5K+L+L  'A')--//B!"m44 	!_! _ _$'')) _ _EI^E\_ _   6:2  ????LLK1KK		KK    ( r4   r{  r~  s    ` r5   _worker_run_main_waitz-MultiProcContinuousTest._worker_run_main_wait  sA    	r	 	 	 	 
	4 ...r4   r  r  r  c                 6   |dk    r|}t                                          |           	 t          | |          }t          | ||                     |                     d S # t
          $ r,}|dk    rt          d| j         d|           |Y d }~d S d }~ww xY wr  )r  r  r  r  r  r  r  r  r  s        r5   r  z MultiProcContinuousTest.__init__  s    
 ""$K%%%		{++BD+t'A'A"'E'EFFFFF 	 	 	Y&& !LdnLL
LL  '&&&&&	r  )Fr  r  )r-   r.   r/   rx  rq   r0   r1   r'  r  r   r2   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  s   @r5   r  r  #  s7        JD#NNN#Ix}###"3///GY///KHSM    [  C       [     [ < < [<$ 	 	4 	 	 	 [	 +) +) [+)Z N N N [N< - - - - [-*         [ .& & & & & &$/ / /F ?H 8;	         r4   r  r`   r   rM  r  r   )r  r>  r  r  rB  rk   r  rR  rh   r]  r  r  r  r|  r  
contextlibr   dataclassesr   datetimer   enumr   	functoolsr   r   r	   ior
   typingr   r   r   r   r   unittest.mockr   rm   torch._dynamo.test_casetorch.cuda.nccltorch.distributedr"  r   torch.nnr  torch._C._autogradr   torch._C._distributed_c10dr   torch._logging._internalr   $torch.testing._internal.common_utilsr   r   r   r   r   r   r   r   r   r   r   r   5torch.testing._internal.distributed.multi_threaded_pgr    r!   r"   	getLoggerr-   r  setLevelINFOr   ra   HAS_ACCELERATORr*   rj   rR   rc   rv   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rb   r0   r  r   r  r  getenvr  r  r  r&  rJ  r2   rT  rU  r^  r1   rd  rg  rt  rj  r  r@  rN  rK  rQ  rV  rZ  r|  rh  r  r  r  r  rC  	test_caser  r  r  r3   r4   r5   <module>r3     sv
                  				      



             % % % % % % ! ! ! ! ! !             , , , , , , , , , ,       = = = = = = = = = = = = = =                                  ) ) ) ) ) ) 7 7 7 7 7 7 . . . . . .                                     
	8	$	$    444 E? 3x38    z   
88
C  xx$FGG	
 XXb"BCC xx455 88B =>> 88B >?? 88B >?? 88B >?? 88B >?? 88B >?? 88B >?? 88B >?? HHR>??  ((2677!" hhr#LMM#$ xx
V %* 88B DEE+, hhr#BCC- 
4 * * * * * * * **& & &  6             $    $+ + +\  4    .         :  
 
 
  F Fc F# F$ F F F F" 	Ia   
 
 
 
:  QOOc)")$GOOPPO,c2   /+.'(
 
T 
 
 
 
IC I I I I 2 2 2( (S (c (s ( ( ( (XS 3    2 26(-	. 5 5 5
 
Xc] 
d 
 
 
 
"     y y y y y8 y y yB+N +N +N +N +N. +N +N +N\d3i( 
    F      ,0 0 0 
3E7 7 7 7tl& l& l& l& l&H l& l& l&^T T T T Tbi T T T # # # # #RY # # #  :?#) #) #) #)L              %-*A*J         F. . . . .)< . . .8G G G G Gh G G G G Gr4   