
    `i                        d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 	 d dlmZ dZn# e$ r dZY nw xY wej        rmej        ej        ej        ej        ej        ej        ej        ej        ej        ej        ej        ej        ej        d	Zej        ej        ej        ej        d
Zni Zi ZddZ G d de          Z  G d d          Z!d Z"d Z# G d d          Z$dS )    N)nccl)_store)_Backend)sparse)MPITF)bBiIlLqQefdFD)sumprodmaxminc                     | j         j        }|t          vrt          d| j          d          t          |         }|| j        }|dv r|d|z  fS ||fS )NUnknown dtype 	 for NCCLFD   )dtypechar_nccl_dtypes	TypeErrorsize)arraycountr   
nccl_dtypes       p/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cupyx/distributed/_nccl_comm.py_get_nccl_dtype_and_countr'   .   sl    KEL  ????@@@e$J}
}}1u9$$u    c                        e Zd ZdZej        ej        df fd	Zd Zd Z	d Z
d Zd Zd	 ZddZddZddZ	 ddZddZddZddZddZddZddZddZd Z xZS )NCCLBackenda  Interface that uses NVIDIA's NCCL to perform communications.

    Args:
        n_devices (int): Total number of devices that will be used in the
            distributed execution.
        rank (int): Unique id of the GPU that the communicator is associated to
            its value needs to be `0 <= rank < n_devices`.
        host (str, optional): host address for the process rendezvous on
            initialization. Defaults to `"127.0.0.1"`.
        port (int, optional): port used for the process rendezvous on
            initialization. Defaults to `13333`.
        use_mpi(bool, optional): switch between MPI and use the included TCP
            server for initialization & synchronization. Defaults to `False`.
    Fc                     t                                          ||||           t          o|| _        | j        r|                     ||           d S |                     ||||           d S N)super__init___mpi_available_use_mpi_init_with_mpi_init_with_tcp_store)self	n_devicesrankhostportuse_mpi	__class__s         r&   r.   zNCCLBackend.__init__J   sv     	D$555&27= 	C	400000%%itTBBBBBr(   c                 D   t           j        | _        | j                                        | _        | j                                         d }| j        dk    rt          j                    }| j                            |d          }t          j	        |||          | _
        d S )Nr   root)r   
COMM_WORLD	_mpi_commGet_rank	_mpi_rankBarrierr   get_unique_idbcastNcclCommunicator_comm)r3   r4   r5   nccl_ids       r&   r1   zNCCLBackend._init_with_mpiT   s     0022   >Q(**G.&&wQ&77*9gtDD


r(   c                    d }|dk    rk| j                             ||           t          j                    }t	          d |D                       }|| j        d<   | j                                         n?| j                                         | j        d         }t          d |D                       }t          j        |||          | _	        d S )Nr   c                     g | ]}|d z   S     .0r   s     r&   
<listcomp>z4NCCLBackend._init_with_tcp_store.<locals>.<listcomp>i   s    $>$>$>QW$>$>$>r(   rF   c                 2    g | ]}t          |          d z
  S rI   )intrL   s     r&   rN   z4NCCLBackend._init_with_tcp_store.<locals>.<listcomp>o   s"    ;;;aSVVc\;;;r(   )
r   runr   rB   bytes_store_proxybarriertuplerD   rE   )r3   r4   r5   r6   r7   rF   shifted_nccl_ids          r&   r2   z NCCLBackend._init_with_tcp_storea   s    199KOOD$'''(**G $$>$>g$>$>$>??O+:Di(%%''''%%''''	2G;;7;;;<<G*9gtDD


r(   c                 X    |j         j        s|j         j        st          d          d S d S )Nz4NCCL requires arrays to be either c- or f-contiguous)flagsc_contiguousf_contiguousRuntimeError)r3   r#   s     r&   _check_contiguouszNCCLBackend._check_contiguousr   sM    {' 	H0H 	HFH H H	H 	H 	H 	Hr(   c                 Z    |#t           j        j                                        }|j        S r,   )cupycudastreamget_current_streamptr)r3   r`   s     r&   _get_streamzNCCLBackend._get_streamw   s%    >Y%88::Fzr(   c                     |t           vrt          d| d          |dv r|dk    rt          d          t           |         S )NzUnknown op r   r   r   z-Only nccl.SUM is supported for complex arrays)	_nccl_opsr[   
ValueError)r3   opr   s      r&   _get_opzNCCLBackend._get_op|   sZ    Y:R:::;;;D==R5[[?A A A}r(   c                    t           }t          |d         t          t          f          r t	          j        |d         d                   st	          j        |d                   rt          } t          ||          | g|R   d S Nr   )_DenseNCCLCommunicator
isinstancelistrU   r   issparse_SparseNCCLCommunicatorgetattr)r3   functionargs
comm_classs       r&   _dispatch_arg_typezNCCLBackend._dispatch_arg_type   s    +
Q$//	1a,,	1 tAw''	1
 1J%
H%%d2T222222r(   r   Nc                 :    |                      d||||f           dS )a  Performs an all reduce operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array where the result with be stored.
            op (str): reduction operation, can be one of
                ('sum', 'prod', 'min' 'max'), arrays of complex type only
                support `'sum'`. Defaults to `'sum'`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        
all_reduceNrt   )r3   in_array	out_arrayrg   r`   s        r&   rv   zNCCLBackend.all_reduce   s4     	8YF;	= 	= 	= 	= 	=r(   r   c                 <    |                      d|||||f           dS )a  Performs a reduce operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array where the result with be stored.
                will only be modified by the `root` process.
            root (int, optional): rank of the process that will perform the
                reduction. Defaults to `0`.
            op (str): reduction operation, can be one of
                ('sum', 'prod', 'min' 'max'), arrays of complex type only
                support `'sum'`. Defaults to `'sum'`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        reduceNrw   )r3   rx   ry   r<   rg   r`   s         r&   r{   zNCCLBackend.reduce   s6     	xD"f=	? 	? 	? 	? 	?r(   c                 8    |                      d|||f           dS )a  Performs a broadcast operation.

        Args:
            in_out_array (cupy.ndarray): array to be sent for `root` rank.
                Other ranks will receive the broadcast data here.
            root (int, optional): rank of the process that will send the
                broadcast. Defaults to `0`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        	broadcastNrw   )r3   in_out_arrayr<   r`   s       r&   r}   zNCCLBackend.broadcast   s2     	,f5	7 	7 	7 	7 	7r(   c                 <    |                      d|||||f           dS )a/  Performs a reduce scatter operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array where the result with be stored.
            count (int): Number of elements to send to each rank.
            op (str): reduction operation, can be one of
                ('sum', 'prod', 'min' 'max'), arrays of complex type only
                support `'sum'`. Defaults to `'sum'`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        reduce_scatterNrw   )r3   rx   ry   r$   rg   r`   s         r&   r   zNCCLBackend.reduce_scatter   s;     	xE2vF	H 	H 	H 	H 	Hr(   c                 :    |                      d||||f           dS )as  Performs an all gather operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array where the result with be stored.
            count (int): Number of elements to send to each rank.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        
all_gatherNrw   )r3   rx   ry   r$   r`   s        r&   r   zNCCLBackend.all_gather   s9     	8Yv>	@ 	@ 	@ 	@ 	@r(   c                 8    |                      d|||f           dS )a  Performs a send operation.

        Args:
            array (cupy.ndarray): array to be sent.
            peer (int): rank of the process `array` will be sent to.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        sendNrw   )r3   r#   peerr`   s       r&   r   zNCCLBackend.send   s'     	f(=>>>>>r(   c                 8    |                      d|||f           dS )a2  Performs a receive operation.

        Args:
            array (cupy.ndarray): array used to receive data.
            peer (int): rank of the process `array` will be received from.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        recvNrw   )r3   ry   r   r`   s       r&   r   zNCCLBackend.recv   s'     	D&(ABBBBBr(   c                 :    |                      d||||f           dS )a  Performs a send and receive operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array used to receive data.
            peer (int): rank of the process to send `in_array` and receive
                `out_array`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        	send_recvNrw   )r3   rx   ry   r   r`   s        r&   r   zNCCLBackend.send_recv   s4     	(ItV<	> 	> 	> 	> 	>r(   c                 :    |                      d||||f           dS )a  Performs a scatter operation.

        Args:
            in_array (cupy.ndarray): array to be sent. Its shape must be
                `(total_ranks, ...)`.
            out_array (cupy.ndarray): array where the result with be stored.
            root (int): rank that will send the `in_array` to other ranks.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        scatterNrw   r3   rx   ry   r<   r`   s        r&   r   zNCCLBackend.scatter   s4     	)T6:	< 	< 	< 	< 	<r(   c                 :    |                      d||||f           dS )a  Performs a gather operation.

        Args:
            in_array (cupy.ndarray): array to be sent.
            out_array (cupy.ndarray): array where the result with be stored.
                Its shape must be `(total_ranks, ...)`.
            root (int): rank that will receive `in_array` from other ranks.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        gatherNrw   r   s        r&   r   zNCCLBackend.gather  s4     	xD&9	; 	; 	; 	; 	;r(   c                 8    |                      d|||f           dS )a  Performs an all to all operation.

        Args:
            in_array (cupy.ndarray): array to be sent. Its shape must be
                `(total_ranks, ...)`.
            out_array (cupy.ndarray): array where the result with be stored.
                Its shape must be `(total_ranks, ...)`.
            stream (cupy.cuda.Stream, optional): if supported, stream to
                perform the communication.
        
all_to_allNrw   )r3   rx   ry   r`   s       r&   r   zNCCLBackend.all_to_all  s2     	8Y7	9 	9 	9 	9 	9r(   c                 |    | j         r| j                                         dS | j                                         dS )zPerforms a barrier operation.

        The barrier is done in the cpu and is a explicit synchronization
        mechanism that halts the thread progression.
        N)r0   r>   rA   rS   rT   )r3   s    r&   rT   zNCCLBackend.barrier*  sC     = 	(N""$$$$$%%'''''r(   r   Nr   r   Nr   Nr,   )__name__
__module____qualname____doc__r   _DEFAULT_HOST_DEFAULT_PORTr.   r1   r2   r\   rc   rh   rt   rv   r{   r}   r   r   r   r   r   r   r   r   rT   __classcell__)r9   s   @r&   r*   r*   :   s          *1EC C C C C CE E EE E E"H H H
  
  3 3 3= = = =? ? ? ?$7 7 7 7  @DH H H H"@ @ @ @	? 	? 	? 	?	C 	C 	C 	C> > > >< < < <; ; ; ;9 9 9 9( ( ( ( ( ( (r(   r*   c                   H   e Zd Zedd            Zedd            Zedd            Ze	 dd            Zedd            Zedd	            Z	edd
            Z
edd            Zedd            Zedd            Zedd            Zedd            Zedd            ZdS )rk   r   Nc                 N   |                     |           |                     |           |                    |          }t          |          \  }}|                    ||j        j                  }|j                            |j        j	        |j        j	        ||||           d S r,   )
r\   rc   r'   rh   r   r   rE   	allReducedatarb   )clscommrx   ry   rg   r`   r   r$   s           r&   rv   z!_DenseNCCLCommunicator.all_reduce:  s    x(((y)))!!&))0::u\\"hn122
My~15%V	M 	M 	M 	M 	Mr(   r   c           	      f   |                     |           |j        |k    r|                     |           |                    |          }t          |          \  }}|                    ||j        j                  }|j                            |j	        j
        |j	        j
        |||||           d S r,   )r\   r5   rc   r'   rh   r   r   rE   r{   r   rb   )	r   r   rx   ry   r<   rg   r`   r   r$   s	            r&   r{   z_DenseNCCLCommunicator.reduceD  s    x(((9""9---!!&))0::u\\"hn122
My~15"dF	, 	, 	, 	, 	,r(   c                     |                     |           |                    |          }t          |          \  }}|j                            |j        j        |j        j        ||||           d S r,   )r\   rc   r'   rE   r}   r   rb   )r   r   r~   r<   r`   r   r$   s          r&   r}   z _DenseNCCLCommunicator.broadcastP  sx    |,,,!!&))0>>u
!<#4#85$	( 	( 	( 	( 	(r(   c                 P   |                     |           |                     |           |                    |          }t          ||          \  }}|                    ||j        j                  }|j                            |j        j	        |j        j	        ||||           d S r,   )
r\   rc   r'   rh   r   r   rE   reduceScatterr   rb   )r   r   rx   ry   r$   rg   r`   r   s           r&   r   z%_DenseNCCLCommunicator.reduce_scatterY  s     	x(((y)))!!&))05AAu\\"hn122
  My~15%V	M 	M 	M 	M 	Mr(   c                    |                     |           |                     |           |                    |          }t          ||          \  }}|j                            |j        j        |j        j        |||           d S r,   )r\   rc   r'   rE   	allGatherr   rb   )r   r   rx   ry   r$   r`   r   s          r&   r   z!_DenseNCCLCommunicator.all_gatherd  s    x(((y)))!!&))05AAu
My~15%	I 	I 	I 	I 	Ir(   c                     |                     |           |                    |          }t          |          \  }}|                     ||||||           d S r,   )r\   rc   r'   _send)r   r   r#   r   r`   r   r$   s          r&   r   z_DenseNCCLCommunicator.sendm  s\    u%%%!!&))077u		$tUE6:::::r(   c                 V    |j                             |j        j        ||||           d S r,   )rE   r   r   rb   r   r   r#   r   r   r$   r`   s          r&   r   z_DenseNCCLCommunicator._sendt  s(    

udFCCCCCr(   c                     |                     |           |                    |          }t          |          \  }}|                     ||||||           d S r,   )r\   rc   r'   _recv)r   r   ry   r   r`   r   r$   s          r&   r   z_DenseNCCLCommunicator.recvx  s\    y)))!!&))0;;u		$	4v>>>>>r(   c                 V    |j                             |j        j        ||||           d S r,   )rE   r   r   rb   r   r   ry   r   r   r$   r`   s          r&   r   z_DenseNCCLCommunicator._recv  s)    
	*E5$GGGGGr(   c                    |                     |           |                     |           |                    |          }t          |          \  }}t          |          \  }}	t          j                     |                     ||||||           |                     |||||	|           t          j                     d S r,   )r\   rc   r'   r   
groupStartr   r   groupEnd)
r   r   rx   ry   r   r`   idtypeicountodtypeocounts
             r&   r   z _DenseNCCLCommunicator.send_recv  s    x(((y)))!!&))28<<29==		$$???		$	4@@@r(   c           	      >   |j         d         |j        k    rt          d|j         d|j                    |                    |           |                    |           |                    |          }t          j                     ||j        k    rKt          |j                  D ]6}||         }t          |          \  }}	| 
                    |||||	|           7t          |          \  }
}|                     ||||
||           t          j                     d S )Nr   z"scatter requires in_array to have 'elements in its first dimension, found )shape
_n_devicesr[   r\   rc   r   r   r5   ranger'   r   r   r   )r   r   rx   ry   r<   r`   r
   r#   r   r   r   r$   s               r&   r   z_DenseNCCLCommunicator.scatter  s:   >!//KT_ K K:B.K KL L L 	x(((y)))!!&))494?++ B B !:5!A!A		$q&&&AAAA0;;u		$	4v>>>r(   c           	      >   |j         d         |j        k    rt          d|j         d|j                    |                    |           |                    |           |                    |          }t          j                     ||j        k    rKt          |j                  D ]6}||         }t          |          \  }}	| 
                    |||||	|           7t          |          \  }
}|                     ||||
||           t          j                     d S )Nr   z"gather requires out_array to have r   )r   r   r[   r\   rc   r   r   r5   r   r'   r   r   r   )r   r   rx   ry   r<   r`   r
   r#   r   r   r   r$   s               r&   r   z_DenseNCCLCommunicator.gather  s<    ?100LT_ L L:C/L LM M M 	x(((y)))!!&))494?++ B B!!!:5!A!A		$q&&&AAAA0::u		$$uf===r(   c           	         |j         d         |j        k    rt          d|j         d|j                    |j         d         |j        k    rt          d|j         d|j                    |                    |           |                    |           |                    |          }t          |d                   \  }}t          |d                   \  }}t          j                     t          |j                  D ]B}	| 	                    |||	         |	|||           | 
                    |||	         |	|||           Ct          j                     d S )Nr   %all_to_all requires in_array to have r   z&all_to_all requires out_array to have )r   r   r[   r\   rc   r'   r   r   r   r   r   r   )
r   r   rx   ry   r`   r   r   r   r   r
   s
             r&   r   z!_DenseNCCLCommunicator.all_to_all  s    ?100K K K:B.K KL L L ?100L L L:C/L LM M M 	x(((y)))!!&))28A;??29Q<@@t'' 	E 	EAIIdHQKFFFCCCIIdIaL!VVVDDDDr(   r   r   r   r,   )r   r   r   classmethodrv   r{   r}   r   r   r   r   r   r   r   r   r   r   rK   r(   r&   rk   rk   8  s       M M M [M 	, 	, 	, [	, ( ( ( [( DHM M M [M I I I [I ; ; ; [; D D D [D ? ? ? [? H H H [H 	 	 	 [	    [$    [&    [  r(   rk   c                 Z   t          j        d|           }t          j        dd          }t          j        dd          }|dk    rt          j        |||fd          S |dk    rt          j        |||fd          S |dk    rt          j        |||ffd          S t          d          )	N   r
   csr)r   r   )r   csccoo4NCCL is not supported for this type of sparse matrix)r^   emptyr   
csr_matrix
csc_matrix
coo_matrixr!   )r   sparse_typer   ar   s        r&   _make_sparse_emptyr     s    :aD
1cA
1cAe $1V<<<<			 $1V<<<<			 $Av>>>>BD D 	Dr(   c                     t          j        |           rdS t          j        |           rdS t          j        |           rdS t	          d          )Nr   r   r   r   )r   isspmatrix_cooisspmatrix_csrisspmatrix_cscr!   )matrixs    r&   _get_sparse_typer     se    V$$ Du		v	&	& Du		v	&	& DuBD D 	Dr(   c                      e Zd Zed             Zed             Zed             Zd Zedd            Zedd	            Z	edd
            Z
e	 dd            Zedd            Zedd            Zedd            Zedd            Zedd            Zedd            Zedd            Zedd            Zedd            ZdS )ro   c                    t          j        |          r(|                                 |j        |j        |j        fS t          j        |          st          j        |          r|j        |j        |j	        fS t          d          )Nr   )r   r   sum_duplicatesr   rowcolr   r   indptrindicesr!   )r   r#   s     r&   _get_internal_arraysz,_SparseNCCLCommunicator._get_internal_arrays  s     '' 	=  """J	5955"5)) 	=V-B5-I-I 	=Jem<<NOOOr(   c                 >    |t          d |D                       z   }|S )Nc              3   $   K   | ]}|j         V  d S r,   )r"   )rM   r   s     r&   	<genexpr>z?_SparseNCCLCommunicator._get_shape_and_sizes.<locals>.<genexpr>  s$      $<$<QV$<$<$<$<$<$<r(   )rU   )r   arraysr   sizes_shapes       r&   _get_shape_and_sizesz,_SparseNCCLCommunicator._get_shape_and_sizes  s+    
 e$<$<V$<$<$<===r(   c                    |j         r|dk    r5t          j        |d          }|j                            ||d           d S |dk    r5t          j        dd          }|j                            ||d           |S |d	k    rV|j        |k    rt          j        |d          }nt          j        dd          }|j                            ||
           |S |dk    rQt          j        |d          }t          j        |j	        dgd          }|j        
                    |||           |S |dk    rPt          j        |d          }t          j        |j	        dgd          }|j                            ||           |S t          d          t          j        d           |dk    r7t          j        |d          }|                     ||||j        d|           d S |dk    rIt          j        dd          }|                     ||||j        d|           t          j        |          S |d	k    rj|j        |k    rt          j        |d          }nt          j        dd          }t(                              ||||           t          j        |          S |dk    rft          j        |d          }t          j        |j	        dfd          }t(                              |||||           t          j        |          S |dk    ret          j        |d          }t          j        |j	        dfd          }t(                              ||||           t          j        |          S t          d          )Nr   r   r   r   )desttagr      )sourcer   rC   r;   r   alltoallzUnsupported methodzUsing NCCL for transferring sparse arrays metadata. This will cause device synchronization and a huge performance degradation. Please install MPI and `mpi4py` in order to avoid this issue.)r<   r`   )r`   )r0   numpyr#   r>   Sendr   Recvr5   Bcastr   GatherAlltoallr[   warningswarnr^   r   r   r   asnumpyrk   r}   r   r   )r   r   r   r   methodr`   recv_bufs          r&   _exchange_shape_and_sizesz1_SparseNCCLCommunicator._exchange_shape_and_sizes  s    = G	9#k+SAAA##Kd#BBBt6!! $k!3777##K!#DDD""7""9$$"'+k"E"E"EKK"'+as";";";K$$[t$<<<""8###k+SAAA ;';3GGG%%k8TBBB:%%#k+SAAA ;';3GGG''X>>>"#7888M%   "jC@@@		+t[->6K K Kt6!! #j#666		+t[->6K K K|K0007""9$$"&*["D"D"DKK"&*Qc":":":K&00+D 1 A A A|K0008##"jC@@@:t&:#FFF&--+xd6 . K K K|H---:%%"jC@@@:t&:#FFF&11+x 2 @ @ @|H---"#7888r(   c                    t          j        |           r=|d         | _        |d         | _        |d         | _        t          |          | _        d S t          j        |           st          j        |           r=|d         | _        |d         | _	        |d         | _
        t          |          | _        d S t          d          )Nr   r   r   r   )r   r   r   r   r   rU   _shaper   r   r   r   r!   )r   r   r   s      r&   _assign_arraysz&_SparseNCCLCommunicator._assign_arraysH  s     (( 	H )FKFJFJ!%LLFMMM"6** 	Hf.CF.K.K 	H )FK"1IFM#AYFN!%LLFMMMFH H Hr(   r   Nc                 n    d}|                      ||||||           |                     ||||           d S rj   )r{   r}   )r   r   rx   ry   rg   r`   r<   s          r&   rv   z"_SparseNCCLCommunicator.all_reduceW  sC     

49dB???dItV44444r(   r   c           
         |                      |          }|                     ||j                  }|                     |||d|          }|j        |k    rgt          |          t          |          k    rt          d          |}	t          |j        t          |                    }
t          |          D ]\  }}t          |dd                   }|dd          }d t          ||          D             }||k    rt          j                     |D ]&}|                     ||||j        |j        |           't          j                     |                     |
||           |dk    r|	|
z   }	|dk    r|	|
z  }	t          d          |                     ||                      |	          |	j                   d S t          j                     |D ]&}|                     ||||j        |j        |           't          j                     d S )	Nr   z.in_array and out_array must be the same formatr   r   c                 J    g | ] \  }}t          j        ||j                   !S r   r^   r   r   rM   sr   s      r&   rN   z2_SparseNCCLCommunicator.reduce.<locals>.<listcomp>q  s:       59QDJq000  r(   r   r   z.Sparse matrix only supports sum/prod reduction)r   r   r   r   r5   r   rf   r   r   	enumeraterU   zipr   r   r   r"   r   r   r   )r   r   rx   ry   r<   rg   r`   r   shape_and_sizesresultpartialr   ssr   sizesr   s                   r&   r{   z_SparseNCCLCommunicator.reduce_  si   ))(332268>JJ77$6; ;9))-=i-H-HHH DF F FF( 0 : :< <G &o66 N Nbb1g122 =@=O=O   4<<O%%%# J J		$4!&&IIIIMOOO&&w>>>U{{!''!1v!''!1(LN N N    333F;;V\K K K K K O < <		!T17AFF< < < <MOOOOOr(   c                    |                      |          }|j        |k    r|                     ||j                  }nd}|                     |||d|          }t          |dd                   }|dd          }|j        |k    rd t          ||          D             }t          j                     |D ]}	t          
                    ||	||            t          j                     |                     |||           d S )NrK   rC   r   r   c                 J    g | ] \  }}t          j        ||j                   !S r   r   r   s      r&   rN   z5_SparseNCCLCommunicator.broadcast.<locals>.<listcomp>  s@     M M M15A
1AG,,,M M Mr(   )r   r5   r   r   r   rU   r   r   r   rk   r}   r   r   )
r   r   r~   r<   r`   r   r   r   r  r   s
             r&   r}   z!_SparseNCCLCommunicator.broadcast  s-   )),779!66*, ,OO !O77$&: :oac*++#9M M9<UF9K9KM M MF 	 	D 	DA",,T1dFCCCC<77777r(   c           	      B   d}g }t          |t          t          f          st          d          |D ]S}	t	          |	j        t          |	                    }
|                     ||	|
|||           |                    |
           T| 	                    |||||           d S )Nr   z5in_array must be a list or a tuple of sparse matrices)
rl   rm   rU   rf   r   r   r   r{   appendr   )r   r   rx   ry   r$   rg   r`   r<   reduce_out_arrayss_mpartial_out_arrays              r&   r   z&_SparseNCCLCommunicator.reduce_scatter  s    
 (T5M22 	IGI I I 	8 	8C 2	+C00!2 !2JJtS"3T2vFFF$$%67777D+YfEEEEEr(   c                     d}g }|                      ||||           |j        |k    r fdt          |j                  D             }|D ]/}|                     ||||           |                    |           0d S )Nr   c                 T    g | ]$}t          j        t                              %S rK   )r   r   r   )rM   _rx   s     r&   rN   z6_SparseNCCLCommunicator.all_gather.<locals>.<listcomp>  s>     ! ! ! #8>3CH3M3MNN! ! !r(   )r   r5   r   r   r}   r  )	r   r   rx   ry   r$   r`   r<   gather_out_arraysarrs	     `      r&   r   z"_SparseNCCLCommunicator.all_gather  s     

4#4dFCCC9! ! ! !t//! ! ! % 	" 	"CMM$T6222S!!!!	" 	"r(   c           	      6   |                      |          }|                     ||j                  }|                     |||d|           t	          j                     |D ]&}|                     ||||j        |j        |           't	          j	                     d S )Nr   )
r   r   r   r   r   r   r   r   r"   r   )r   r   r#   r   r`   r   r   r   s           r&   r   z_SparseNCCLCommunicator.send  s    ))%002265;GG%%$	9 	9 	9 	 	> 	>AIIdAtQWaff====r(   c                     |j         j        }|t          vrt          d|j          d          t	          |          \  }}|                    |          }|j                            |j        j	        ||||           d S Nr   r   )
r   r   r    r!   r'   rc   rE   r   r   rb   r   s          r&   r   z_SparseNCCLCommunicator._send  s}     $$CU[CCCDDD077u!!&))

udFCCCCCr(   c           	         |                      ||dd|          }|                     |          }t          |dd                   }|dd          }d t          ||          D             }	t	          j                     |	D ]&}
|                     ||
||
j        |
j        |           't	          j	                     | 
                    ||	|           d S )NrK   r   r   r   c                 J    g | ] \  }}t          j        ||j                   !S r   r   r   s      r&   rN   z0_SparseNCCLCommunicator.recv.<locals>.<listcomp>  s-    LLLA
1AG,,,LLLr(   )r   r   rU   r   r   r   r   r   r"   r   r   )r   r   ry   r   r`   r   r   r   r  arrsr   s              r&   r   z_SparseNCCLCommunicator.recv  s    77$FF, , )))44oac*++#LLUF9K9KLLL 	> 	>AIIdAtQWaff====9dE22222r(   c                     |j         }|t          vrt          d|j         d          t	          |          \  }}|                    |          }|j                            |j        j	        ||||           d S r  )
r   r    r!   r   r'   rc   rE   r   r   rb   r   s          r&   r   z_SparseNCCLCommunicator._recv  s{    
$$GY_GGGHHH0;;u!!&))
	*E5$GGGGGr(   c                     t          j                     |                     ||||           |                     ||||           t          j                     d S r,   )r   r   r   r   r   )r   r   rx   ry   r   r`   s         r&   r   z!_SparseNCCLCommunicator.send_recv  sO    xv...y$///r(   c                 x   |j         |k    rt          j                     t          |          D ]#\  }}||k    r|                     ||||           $t          j                     |                     ||                     ||                   ||         j                   d S | 	                    ||||           d S r,   )
r5   r   r   r   r   r   r   r   r   r   )r   r   rx   ry   r<   r`   r   s_as           r&   r   z_SparseNCCLCommunicator.scatter  s     9O&x00 6 6	c4<<HHT3f555MOOO(($88$& & & & &
 HHT9dF33333r(   c                    |j         |k    rt          |j                  D ]}t          |j        t          |                    }||k    r|                     ||||           n/|                     ||                     |          |j	                   |
                    |           d S |                     ||||           d S r,   )r5   r   r   r   r   r   r   r   r   r   r  r   )r   r   rx   ry   r<   r`   r   ress           r&   r   z_SparseNCCLCommunicator.gather  s     9do.. 
& 
&(N$4X$>$>@ @4<<HHT3f5555&&00:: ( ( (   %%%%
& 
& HHT8T622222r(   c           
         t          |          |j        k    r't          d|j         dt          |                     g }g }t          |          D ]H\  }}|                     |          }	|                    |                     |	|j                             I|                     |||d|          }t          |j                  D ];}t          ||         dd                   }
||         dd          }|                     ||                   }d t          ||          D             }t          j                     |D ]&}|                     ||||j        |j        |           '|D ]&}|                     ||||j        |j        |           't          j                     |                    t'          ||         j        t)          ||                                        |                     ||         ||
           =d S )Nr   zelements, found r   r   r   c                 J    g | ] \  }}t          j        ||j                   !S r   r   r   s      r&   rN   z6_SparseNCCLCommunicator.all_to_all.<locals>.<listcomp>>  s@     O O O15A
1AG,,,O O Or(   )lenr   r[   r   r   r  r   r   r   r   rU   r   r   r   r   r   r"   r   r   r   r   r   )r   r   rx   ry   r`   r   recv_shape_and_sizesr
   r   r   r   r  s_arraysr_arrayss                 r&   r   z"_SparseNCCLCommunicator.all_to_all&  sJ    x==DO++3 3 3#&x==3 34 4 4
 !h'' 	N 	NDAq--a00F""3#;#;FAG#L#LMMMM"<<!_j& :  : t'' 	> 	>A.q1!A#677E(+ABB/E//<<HO O9<UH9M9MO O OHO ? ?		$1agqvv>>>> ? ?		$1agqvv>>>>MOOO/! !--/ / 0 0 0 y|Xu====!	> 	>r(   r   r   r   r,   )r   r   r   r   r   r   r   r   rv   r{   r}   r   r   r   r   r   r   r   r   r   r   rK   r(   r&   ro   ro     s,       P P [P   [ I9 I9 [I9VH H H 5 5 5 [5 + + + [+Z 8 8 8 [80 DHF F F [F  " " " ["& 	 	 	 [	 D D D [D 3 3 3 [3& H H H [H    [ 4 4 4 [4 3 3 3 [3" "> "> "> ["> "> ">r(   ro   r,   )%r   r   r^   	cupy.cudar   cupyx.distributedr   cupyx.distributed._commr   cupyx.scipyr   mpi4pyr   r/   ImportError	available	NCCL_INT8
NCCL_UINT8
NCCL_INT32NCCL_UINT32
NCCL_INT64NCCL_UINT64NCCL_FLOAT16NCCL_FLOAT32NCCL_FLOAT64r    NCCL_SUM	NCCL_PRODNCCL_MAXNCCL_MINre   r'   r*   rk   r   r   ro   rK   r(   r&   <module>r7     s            $ $ $ $ $ $ , , , , , ,      NN   NNN >  )))*****, ,L ' 'II
 LI	 	 	 	{( {( {( {( {(( {( {( {(|S S S S S S S SlD D D	D 	D 	D`> `> `> `> `> `> `> `> `> `>s   / 99