
    )`i`              
       $   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d dlZd dlZd dlmZmZmZmZmZ d dlZd dlZ	 d dlmZ n,# e$ r$ 	 d dlmZ n# e$ rZ ed          edZ[ww xY wY nw xY wdd	lmZ d
dlmZmZ d
dl m!Z! dZ"dZ#dZ$de%de%de%fdZ&de%de'dej(        de%dej)        f
dZ*de%de%de%de+fdZ,dee%         de%fdZ- G d de          Z.erd dl/m0Z0 d  Z1 G d! d"          Z2 G d# d$e.          Z3 G d% d&e.          Z4e G d' d(                      Z5 G d) d*          Z6 G d+ d,          Z7 G d- d.e          Z8 G d/ d0e8          Z9 G d1 d2e8          Z:d3e%de+fd4Z; G d5 d6          Z< G d7 d8          Z=dS )9    N)ABCabstractmethod)	dataclass)AnyDictListOptionalTYPE_CHECKING)driver)cudaz^Could not import the 'cuda' module. Please install cuda-python that matches your CUDA version.   )checkCudaErrors   )create_dlpack_capsulepack_strided_memory)Mapping	   i   Fvalgranreturnc                      | |z   dz
  |dz
   z  S )z6Efficient implementation assuming gran is a power of 2r    )r   r   s     i/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/comm/mnnvl.pyround_upr   =   s    $JNqk))    ptrshapedtype	device_idc                    d}|D ]}||z  }t          j        g |                                          }t          | |||||          }t           j        j                            |j                  }||_        |	                    |          S )a0  
    Create a PyTorch tensor from a CUDA memory pointer using DLPack.

    Args:
        ptr: CUDA memory pointer address as integer
        shape: Desired tensor shape
        dtype: PyTorch data type
        device_id: CUDA device ID

    Returns:
        PyTorch tensor that wraps the CUDA memory
    r   )r   )
torchtensorelement_sizer   utilsdlpackfrom_dlpackcapsule_capsule_wrapperview)	r   r   r   r   numeldimr#   capsule_wrapperr"   s	            r   create_tensor_from_cuda_memoryr-   B   s      E   <%000==??L ,\<y O
 [++O,CDDF-F ;;ur   sizec                 T   	 t          d|          }t          |          }t          t          j        || |                     t          t          j        | ||                     t          d| d           dS # t          $ r!}t          d| dd|            Y d}~dS d}~ww xY w)	a  
    Test if CUDA memory at ptr is accessible by trying to read/write a small amount.

    Args:
        ptr: CUDA memory pointer
        size: Size of memory region
        device_id: CUDA device ID

    Returns:
        True if memory is accessible, False otherwise
       z+DEBUG: Memory access test PASSED for ptr=0xxTz+DEBUG: Memory access test FAILED for ptr=0x: NF)min	bytearrayr   r   cuMemcpyDtoHcuMemcpyHtoDprint	Exception)r   r.   r   	test_size	host_dataes         r   test_cuda_memory_accessr<   f   s    4LL	i((	 	))S)DDEEE 	)#y)DDEEECCCCCDDDt   HCHHHQHHIIIuuuuus   A8A< <
B'B""B'host_ptr_arrayc                 >   | sdS t           j        t          |           z  } ||  }t          j        |          }t	          t          j        |                    }t	          t          j        |t          j        |          |                     t          |          S )zj
    A helper function that allocates memory on cuda and copies the data from the host to the device.
    N)
ctypesc_uint64lensizeofr   r   
cuMemAllocr6   	addressofint)r=   	ArrayTypec_arraysize_in_bytes
device_ptrs        r   alloc_and_copy_to_cudarJ      s      t#n"5"55Ii(GM'**M#24?=3Q3Q#R#RJ*f&6w&?&?OO  
 z??r   c                       e Zd ZdZedefd            Zedefd            Zededee         fd            Z	ede
dede
fd            Zedd
            Zedededd fd            Zd	S )CommBackendz(Abstract communication backend interfacer   c                     d S Nr   selfs    r   Get_rankzCommBackend.Get_rank       "sr   c                     d S rN   r   rO   s    r   Get_sizezCommBackend.Get_size   rR   r   datac                     d S rN   r   rP   rU   s     r   	allgatherzCommBackend.allgather   s    14r   rootc                     d S rN   r   rP   rU   rY   s      r   bcastzCommBackend.bcast   s    25#r   Nc                     d S rN   r   rO   s    r   barrierzCommBackend.barrier   rR   r   colorkeyc                     d S rN   r   rP   r_   r`   s      r   SplitzCommBackend.Split   s    <?Cr   r   N)__name__
__module____qualname____doc__r   rE   rQ   rT   r   rX   r   r\   r^   rc   r   r   r   rL   rL      s        22"#""" ^""#""" ^"4c4d3i444 ^45#5S5S555 ^5""" ^"?3?S?]??? ^???r   rL   MPIc                  X    	 ddl m}  | S # t          $ r}t          d          |d}~ww xY w)zLazy import for mpi4pyr   ri   zmpi4py is not installedN)mpi4pyrj   ImportError)rj   errs     r   lazy_import_mpiro      sN    >
 > > >344#=>s   
 
)$)c                   d    e Zd ZU dZeed<   dZeed<   ed             Zedefd            Z	d Z
dS )MpiCommN_comm_MPIc                 f    | j         $t                      | _         | j         j        | _        | j         S rN   )rs   ro   
COMM_WORLDrr   )clss    r   _get_mpizMpiComm._get_mpi   s*    8&((CH+CIxr   new_commc                 <    |                                   || _        d S rN   )rw   rr   )rv   rx   s     r   set_mpi_commzMpiComm.set_mpi_comm   s    			r   c                 b    | j         |                                  t          | j         |          S rN   )rr   rw   getattr)rP   names     r   __getattr__zMpiComm.__getattr__   s)    :MMOOOtz4(((r   )re   rf   rg   rr   r   __annotations__rs   classmethodrw   rz   r~   r   r   r   rq   rq      s         E3D#  [ C    [
) ) ) ) )r   rq   c                   v    e Zd Zd ZdefdZdefdZdedee         fdZde	dede	fdZ
d	 Zd
ededefdZdS )
MPIBackendc                 ,    t                      | _        d S rN   )rq   _mpicommrO   s    r   __init__zMPIBackend.__init__   s    		r   r   c                 4    | j                                         S rN   )r   rQ   rO   s    r   rQ   zMPIBackend.Get_rank       }%%'''r   c                 4    | j                                         S rN   )r   rT   rO   s    r   rT   zMPIBackend.Get_size   r   r   rU   c                 6    | j                             |          S rN   )r   rX   rW   s     r   rX   zMPIBackend.allgather   s    }&&t,,,r   rY   c                 8    | j                             ||          S rN   )r   r\   r[   s      r   r\   zMPIBackend.bcast   s    }""4...r   c                 8    | j                                          d S rN   )r   BarrierrO   s    r   r^   zMPIBackend.barrier   s    r   r_   r`   c                 ^    | j                             ||          | _         t                      S rN   )r   rc   r   rb   s      r   rc   zMPIBackend.Split   s%    ++E377||r   N)re   rf   rg   r   rE   rQ   rT   r   rX   r   r\   r^   rL   rc   r   r   r   r   r      s        " " "(# ( ( ( ((# ( ( ( (-c -d3i - - - -/# /S /S / / / /     3 S [      r   r   c                       e Zd ZdZddee         fdZdefdZdefdZ	dede
e         fd	Zded
edefdZddZdededd fdZdS )TorchDistBackendz-Communication backend using torch.distributedNgroupc                 t    ddl m} |                                st          d          || _        || _        dS )z
        Initialize TorchDistBackend.

        Args:
            group: Optional process group. If None, uses the default process group.
        r   Nz_torch.distributed is not initialized. Please call torch.distributed.init_process_group() first.)torch.distributeddistributedis_initializedRuntimeError_group_dist)rP   r   dists      r   r   zTorchDistBackend.__init__   sV     	)(((((""$$ 	L   


r   r   c                 @    | j                             | j                  S rN   )r   get_rankr   rO   s    r   rQ   zTorchDistBackend.Get_rank   s    z""4;///r   c                 @    | j                             | j                  S rN   )r   get_world_sizer   rO   s    r   rT   zTorchDistBackend.Get_size  s    z((555r   rU   c                 z    dg|                                  z  }| j                            ||| j                   |S )z5All-gather arbitrary Python objects across all ranks.Nr   )rT   r   all_gather_objectr   )rP   rU   output_lists      r   rX   zTorchDistBackend.allgather  s;    ft}}.
$$[$dk$JJJr   rY   c                 \    |g}| j                             ||| j                   |d         S )z1Broadcast a Python object from root to all ranks.)srcr   r   )r   broadcast_object_listr   )rP   rU   rY   object_lists       r   r\   zTorchDistBackend.bcast  s2    f
(($dk(RRR1~r   c                 F    | j                             | j                   d S )Nr   )r   r^   r   rO   s    r   r^   zTorchDistBackend.barrier  s#    
-----r   r_   r`   c                 z   |                                  }|                     |||f          }i }|D ],\  }}}||vrg ||<   ||                             ||f           -|D ]}||                             d             d ||         D             }	| j                            |	          }
t          |
          S )a  
        Split the communicator into sub-groups based on color.

        All processes with the same color will be in the same new group.
        The key determines the rank ordering within the new group.

        Args:
            color: Processes with the same color are placed in the same group
            key: Determines rank ordering within the new group (lower key = lower rank)

        Returns:
            New TorchDistBackend with the split process group
        c                     | d         S )Nr   r   )r1   s    r   <lambda>z(TorchDistBackend.Split.<locals>.<lambda>0  s
    qt r   )r`   c                     g | ]\  }}|S r   r   ).0_rs      r   
<listcomp>z*TorchDistBackend.Split.<locals>.<listcomp>3  s    <<<1!<<<r   )ranksr   )rQ   rX   appendsortr   	new_groupr   )rP   r_   r`   global_rankall_infocolor_groupsckr   my_group_ranksr   s              r   rc   zTorchDistBackend.Split  s     mmoo>>5#{";<< 02 	+ 	+GAq!$$"$QO""Aq6****  	5 	5AO  ^^ 4444 =<U(;<<< J((~(>>	i0000r   rN   rd   )re   rf   rg   rh   r	   r   r   rE   rQ   rT   r   rX   r\   r^   rc   r   r   r   r   r      s       77 hsm    "0# 0 0 0 06# 6 6 6 6c d3i    # S S    . . . .$13 $1S $1-? $1 $1 $1 $1 $1 $1r   r   c                   J    e Zd ZU dZdZee         ed<   dZe	ed<   dZ
e	ed<   dS )MnnvlConfigz)Configuration for MNNVL memory managementNcomm_backendr   allocation_granularity    fabric_page_size)re   rf   rg   rh   r   r	   rL   r   r   rE   r   r   r   r   r   r   ;  sO         33*.L(;'..."#C####c#####r   r   c                       e Zd ZU dZeed<   dZeed<   dZeed<   dZ	eed<   dZ
eed<   dZeed	<   d
Zee         ed<   d
Zeed<   i Zeeef         ed<   i Zeeef         ed<   d
Zee         ed<   dedefdZd Zd Zed             Zed#dedefd            Zedefd            Zedefd            Zedefd            Zededefd            Z ededefd            Z!edefd            Z"ed$defd             Z#ed!efd"            Z$d
S )%MnnvlMemoryFinitializedr   current_mem_offsetcurrent_rank_stridecurrent_start_addressr   r   r   Ncommdev_idallocated_mapaddress_refcntconfigmappingr.   c                 |    || _         || _        t                              | j         |          \  | _        | _        d S rN   )r   segment_sizer   open_mnnvl_memoryr   rank_stride)rP   r   r.   s      r   r   zMnnvlMemory.__init__[  s9     %0%B%B4<QU%V%V"$"""r   c                 n    t          j                    s!t                              | j                   d S d S rN   )sysis_finalizingr   close_mnnvl_memoryr   rO   s    r   __del__zMnnvlMemory.__del__`  s9     "" 	5**4844444	5 	5r   c                     t           j                                        }t          | j        | j        | j        ||t           j                  S rN   )r   r   rT   r   r   r   r   r   )rP   r   num_segmentss      r   as_torch_strided_tensorz#MnnvlMemory.as_torch_strided_tensord  sE    "'0022"H
 
 	
r   c                      t           j        sat          j        dd          } 	 t	          j                     n(# t          j        $ r t	          j                     Y nw xY wdt           _        d S d S )Nr   r   )deviceT)r   r   r!   emptypynvmlnvmlDeviceGetCountNVMLError_UninitializednvmlInit)r   s    r   
initializezMnnvlMemory.initializeo  s    & 	+Af---A")++++1 " " "!!!!!"&*K###	+ 	+s   8 "AAc                     |pt          t                                t          _        |j                            | j        | j        z  | j        z   | j	                  }|t          _
        d S )N)r   )r   r   r   r   r   rc   pp_rankcp_sizecp_ranktp_rankr   )r   r   r   s      r   set_comm_from_configz MnnvlMemory.set_comm_from_config{  s]    #M{
'M'M'M"((Ogo-?
 
  r   c                     t           j        t           j        S t                                          | j        | j        z  | j        z   | j                  }|t           _        |S rN   )r   r   rq   rc   r   r   r   r   )r   r   s     r   get_commzMnnvlMemory.get_comm  sQ    '##yyOgo-?
 
  r   c                 v   t          j                    }t           j        j        |_        | |_        t          j                    }t           j        j        |_        t          j
                                                    }d|v }|rt           j        j        |_        nt           j        j        |_        ||_        |S )Naarch64)r   CUmemLocationCUmemLocationTypeCU_MEM_LOCATION_TYPE_DEVICEtypeidCUmemAllocationPropCUmemAllocationTypeCU_MEM_ALLOCATION_TYPE_PINNEDplatformmachinelowerCUmemAllocationHandleTypeCU_MEM_HANDLE_TYPE_FABRICrequestedHandleTypes(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTORlocation)r   r   allocation_proparchis_on_aarch64s        r   get_allocation_propzMnnvlMemory.get_allocation_prop  s    %''.J244#7U !!''))!T) 	.H 00
 .W 0 $, r   c                 *   t           j        dk    rt           j        S t                               |           }t          j        t          j        j                  }t          t          j        ||                    }|t           _        t           j        S )Nr   )propoption)r   r   r   r    CUmemAllocationGranularity_flags$CU_MEM_ALLOC_GRANULARITY_RECOMMENDEDr   cuMemGetAllocationGranularity)r   r   r   granularitys       r   get_allocation_granularityz&MnnvlMemory.get_allocation_granularity  s~    -2255%99&AA61V
 
 &.OFSSS
 
 .9*11r   c                    |t           j        z   dz
  t           j        z  }|t           j        z  }t          j        d|            t                               |           }|                                }||z  }t          t          j        |t           j        dd                    }t          |          t           _
        |t           _        dt           _        d S )Nr   z+[MnnvlMemory] creating address with stride=r   )r   r   logginginfor   rT   r   r   cuMemAddressReserverE   r   r   r   )r   r.   
page_countr   r   	comm_sizeaddress_sizer   s           r   new_mnnvl_memory_addressz$MnnvlMemory.new_mnnvl_memory_address  s     ;//!3)*
 );+GGO:MOO	
 	
 	
 ##G,,MMOO	*Y6$\;3OQRTUVV
 
 -0HH)*=')*&&&r   c                 
   t          t          j                              }t          |          }t          j        |t          _        |t          j        k    sJ d| dt          j                     t                              |           }|                                }|                                }|	                              }t          |          |k    sJ t          fd|D                       s
J d            t                              |          }|z   dz
  |z  |z  }	t          j        |	z   t          j        k    rt                              | |	           t          j        |	z   t          j        k    sJ t                              |          }
t          t          j        |	|
d                    }t          t          j        ||
j        d                    }|
j        t          j        j        k    r|	                    |j                  }na|	                    |          }|	                    t/          j                              }t3          j        d d	          }|j        }d
}d}g }|D ]g} |||d          }|dk     r=t3          j                    }t;          d| d| dt/          j        |                     |                    |           hg }tA          ||d          D ]\  }} ||||d          }|dk     rTt3          j                    }d| d| d| dt/          j        |           d	}|dk    r|dz  }n|dz  }t;          |          |                    |           |}t          j!                    }|
j"        |_"        t          j#        j$        |_%        d g|z  }tM          |          D ]\  }}t          j'        t          j        |z  z   t          j        z   }||k    r+|||<   t          t          j(        ||	d|d                     nQt          t          j)        ||
j                            } | ||<   t          t          j(        ||	d| d                     t          t          j*        ||	|gd                     t          j'        t          j        z   }!t          j        }"| |	|t          j'        t          j        t          j        ft          j+        |!<   t          j,        -                    t          j'        d          dz   t          j,        t          j'        <   t          xj        |	z  c_        |!|"fS )NzDifferent dev_id found dev_id=z but MnnvlMemory.dev_id=c              3   $   K   | ]
}|k    V  d S rN   r   )r   r1   r.   s     r   	<genexpr>z0MnnvlMemory.open_mnnvl_memory.<locals>.<genexpr>  s'      >>19>>>>>>r   z"Not all rank allocating same size.r   r   )flagsT)	use_errnoi  i  zpidfd_open(z) failed with errno r2   )strictzpidfd_getfd(pidfd=z, fd=.zj Permission denied. If running in a container, try adding --cap-add=SYS_PTRACE to your docker run command.z9 This may be due to kernel version (requires Linux 5.6+).).r   r   cuCtxGetDevicerE   r   r   r   rQ   rT   rX   rA   allr  r   r   r  r   cuMemCreatecuMemExportToShareableHandler   r   r   rU   osgetpidr?   CDLLsyscall	get_errnor   strerrorr   zipCUmemAccessDescr   CUmemAccess_flags"CU_MEM_ACCESS_FLAGS_PROT_READWRITEr  	enumerater   cuMemMapcuMemImportFromShareableHandlecuMemSetAccessr   r   get)#r   r.   devr   r   	comm_rankr	  all_rank_allocate_sizesr  aligned_sizer   allocated_mem_handleexported_fabric_handleall_handles_dataall_pidslibcr  SYS_pidfd_openSYS_pidfd_getfdpidfdspidpidfdrn   
remote_fdsfd	remote_fd	error_msgmadescmem_handlesiremote_handle_datarank_ptrimported_mem_handler   strides#    `                                 r   r   zMnnvlMemory.open_mnnvl_memory  s   d13344S%!'K++++aVaa[M_aa ,++ ##G,,MMOO	MMOO	"&.."6"6*++y8888>>>>&=>>>>> 	
 	
0	
 	
> "<<VDD{*Q.;>L *\9-. . 00,GGG *\9./ / / / &99&AA.\?!DDD 
  
 "1-$o&JA "
 "
 0-GH H  $~~.D.IJJ#~~.DEE~~bikk22H;tt444DlG N!OF % %Q77199 *,,C&XcXXsXXbkRUFVFVXX   e$$$$J )9$GGG - -	r#GOUBBB	q== *,,C sU s s s sY\ s s`b`klo`p`p s s sIaxx!:		
 "W	 'y111!!),,,,) %'')2-Pfy(%./?%@%@ 	V 	V!A!11A5601 
 I~~!5AM(L!=QSTUU   
 '67*O,P ' '#
 "5AM(L!=PRSTT   D/,RSTTUUUU/+2PP0-+**
!#& &**;+LaPPSTT 	";#DE 	&&,6&&F{r   r   c                    t           j                            |           \  }}}}}}t                               |          }|                                }t          |          D ]V}	||	|z  z   |z   }
t          t          j        |
|                     t          t          j	        ||	                              Wt           j
        |xx         dz  cc<   t           j
        |         dk    rt           j
                            |           t          j        |          }t          t          j        |||z                       |t           j        k    r(dt           _        dt           _        dt           _        d S d S d S )Nr   r   )r   r   popr   rT   ranger   r   
cuMemUnmapcuMemReleaser   CUdeviceptrcuMemAddressFreer   r   r   )r   r   r)  r9  start_addressr   address_offsetr   r	  r:  r<  rI   s               r   r   zMnnvlMemory.close_mnnvl_memoryI  si    %))#..	
##G,,MMOO	y!! 	? 	?A$q;6GHDOHlCCDDDD-k!n==>>>>"=111Q6111%m499&**=999)-88JD1*i+>UVVWWW AAA45123/12... :9 BAr   Tneed_all_upc                    t           j                                        }t          j        |          }t          j        }d}d}t          |          D ]Y}	 t          j        ||t          j                  r!|dz  }t          j	        ||          }|r|dz  }E# t          j
        $ r Y Vw xY w| r||k    o|dk    n|dk    S )Nr   r   )r!   r   current_devicer   nvmlDeviceGetHandleByIndexNVML_NVLINK_MAX_LINKSrA  nvmlDeviceGetNvLinkCapabilityNVML_NVLINK_CAP_P2P_SUPPORTEDnvmlDeviceGetNvLinkStateNVMLError_NotSupported)rH  r   handle
link_countactive_linksavailable_linkslink_idx	is_actives           r   support_nvlinkzMnnvlMemory.support_nvlinkd  s    **,,26::1
j)) 
	 
	H	7Hf&J  * $q(O & ? Q QI  *$)0    %LO+C!0C 1$	
s   ABB)(B)r   c                  :    t                               d          } | S )NT)r   rW  )support_nvlink_and_all_ups    r   supports_mnnvlzMnnvlMemory.supports_mnnvl|  s     %0$>$>t$D$D!((r   rN   T)%re   rf   rg   r   boolr   r   rE   r   r   r   r   r   r	   rL   r   r   r   r   r   r   r   r   r   r   r   staticmethodr   r   r   r   r  r  r   r   rW  rZ  r   r   r   r   r   D  s        K    !"3""" #$C### $c### #'D(;
&&&FC$&M4S>&&&%'NDcN'''$(FH[!(((W Ws W W W W
5 5 5	
 	
 	
 	+ 	+ \	+    g  {       \  '    \ C    \* 23 2 2 2 \2 +' + + + + \+$ B7 B# B B B \BH 3 3 3 3 \34 
 
D 
 
 
 \
. )D ) ) ) \) ) )r   r   c                   R    e Zd ZdZddedefdZddeded	ee         fd
Zd Zd Z	dS )	IpcSocketz2Unix Domain Socket for IPC file descriptor passingTrankop_idc                    || _         || _        || _        t          j        t          j        t          j                  | _        d| d|d}|rd|z   | _        nL|| _        t          j	        t                    5  t          j        |           ddd           n# 1 swxY w Y   | j                            | j                   dS )z
        Initialize IPC socket

        Args:
            rank: Process rank
            op_id: Unique operation ID (hash)
            use_abstract: Use Linux abstract socket namespace
        /tmp/mcastmem-socket--r1    N)r`  ra  use_abstractsocketAF_UNIX
SOCK_DGRAMsocksocket_path
contextlibsuppressFileNotFoundErrorr  unlinkbind)rP   r`  ra  rf  socket_names        r   r   zIpcSocket.__init__  s    	
( M&.&2CDD	 ?d>>U>>> 	'#k1D*D$%677 ' '	+&&&' ' ' ' ' ' ' ' ' ' ' ' ' ' ' 		t'(((((s   :BB"BNr5  	dest_rank
dest_op_idc                    |p| j         }d| d|d}| j        rd|z   }n|}d}t          j        d|g          }t          j        t          j        |                                fg}| j                            |g|d|           dS )	z
        Send a file descriptor to another process

        Args:
            fd: File descriptor to send
            dest_rank: Destination process rank
            dest_op_id: Destination operation ID
        rc  rd  r1   re      r:  r   N)	ra  rf  arrayrg  
SOL_SOCKET
SCM_RIGHTStobytesrj  sendmsg)	rP   r5  rr  rs  dest_socket_name	dest_path
dummy_datafds	ancillarys	            r   send_fdzIpcSocket.send_fd  s      -4:
M9MMzMMM 	)//II(I 
 k#t$$'):CKKMMJK	 		:,	1i@@@@@r   c           	         t          j         d          }| j                            dt          j        |j                            \  }}}}|D ]\  }}}|t          j        k    rp|t          j        k    r`t          j         d          }|                    |dt          |          t          |          |j        z  z
                      |d         c S t          d          )z|
        Receive a file descriptor from another process

        Returns:
            int: Received file descriptor
        r:  r   Nr   zNo file descriptor received)rv  rj  recvmsgrg  
CMSG_SPACEitemsizerw  rx  	frombytesrA   r   )	rP   r~  msgancdatar  addr
cmsg_level	cmsg_type	cmsg_datas	            r   recv_fdzIpcSocket.recv_fd  s     k#$(I$5$5 %
 %
!WeT 18 	 	,J	9V...9@Q3Q3Qk#&&PI#i..3<2O PPQ   1v8999r   c                     | j                                          | j        sT| j        rOt	          j        t                    5  t          j        | j                   ddd           dS # 1 swxY w Y   dS dS dS )zClose the socketN)	rj  closerf  rk  rl  rm  rn  r  ro  rO   s    r   r  zIpcSocket.close  s    	  	,T%5 	,$%677 , ,	$*+++, , , , , , , , , , , , , , , , , ,	, 	, 	, 	,s   A((A,/A,r[  rN   )
re   rf   rg   rh   rE   r   r	   r  r  r  r   r   r   r_  r_    s        <<) )S ) ) ) ) )>A A# A# A8C= A A A A:: : :8, , , , ,r   r_  c                       e Zd ZdZdddedefdZeedej	        fd                        Z
edefd	            Zed
efd            Zedd            Zedd            ZdS )HandleExchangerzFAbstract interface for exchanging CUDA shareable handles across ranks.r   rL   
group_rank
group_sizec                 0    || _         || _        || _        d S rN   )r   r`  r.   )rP   r   r  r  s       r   r   zHandleExchanger.__init__  s     					r   r   c                     dS )z/The CUDA handle type this exchanger works with.Nr   rO   s    r   handle_typezHandleExchanger.handle_type  s	     	r   c                     dS )z,All-gather shareable handles from all ranks.Nr   rP   local_handles     r   rX   zHandleExchanger.allgather  	     	r   rY   c                     dS )z*Broadcast a handle from root to all ranks.Nr   rP   rQ  rY   s      r   	broadcastzHandleExchanger.broadcast  r  r   Nc                     d S rN   r   rP   rQ  s     r   cleanupzHandleExchanger.cleanup  s    '*sr   c                     d S rN   r   rO   s    r   r  zHandleExchanger.close  s     Sr   rd   )re   rf   rg   rh   rE   r   propertyr   r   r   r  r   rX   r  r  r  r   r   r   r  r    s        PP]  QT    
 T;    ^ X     ^ c    ^ *** ^*    ^   r   r  c                   `    e Zd ZdZedej        fd            ZdefdZ	de
fdZd
dZd
d	ZdS )FabricHandleExchangerzEHandle exchange using CUDA Fabric handles via MPI/collective backend.r   c                 $    t           j        j        S rN   )r   r   r   rO   s    r   r  z!FabricHandleExchanger.handle_type  s    -GGr   c                 @    | j                             |j                  S rN   )r   rX   rU   r  s     r   rX   zFabricHandleExchanger.allgather  s    y""<#4555r   rY   c                 L    | j                             |r|j        nd |          S )NrY   )r   r\   rU   r  s      r   r  zFabricHandleExchanger.broadcast  s%    yf>v{{$TJJJr   Nc                     d S rN   r   r  s     r   r  zFabricHandleExchanger.cleanup      r   c                     d S rN   r   rO   s    r   r  zFabricHandleExchanger.close  r  r   rd   )re   rf   rg   rh   r  r   r   r  r   rX   rE   r  r  r  r   r   r   r  r  	  s        OOHT; H H H XH6 6 6 6 6Kc K K K K        r   r  c                        e Zd ZdZdddedef fdZdefdZede	j
        fd	            Zdefd
ZdefdZddZddZ xZS )PosixFDHandleExchangerz=Handle exchange using POSIX file descriptors via IPC sockets.r   rL   r  r  c                     t                                          |||           |                                 | _        d S rN   )superr   _init_ipc_socket_socket)rP   r   r  r  	__class__s       r   r   zPosixFDHandleExchanger.__init__   s6    z:>>>,,..r   r   c                     | j         dk    rt          j        dd          }nd }| j                            |d          }t          | j         |          S )Nr   l    r  )r`  randomrandintr   r\   r_  )rP   opIds     r   r  z'PosixFDHandleExchanger._init_ipc_socket$  sN    9>>>!Y//DDDyt!,,D)))r   c                 $    t           j        j        S rN   )r   r   r   rO   s    r   r  z"PosixFDHandleExchanger.handle_type,  s    -VVr   c                 >   d g| j         z  }t          | j                   D ]|}| j                                         | j                            || j        |z   | j         z             | j        | j         z   |z
  | j         z  }| j                                        ||<   }|S rN   )r.   rA  r   r^   r  r  r`  r  )rP   r  resultr:  r   s        r   rX   z PosixFDHandleExchanger.allgather0  s    $)#ty!! 	1 	1AIL  	A/JKKK9ty(1,	9C,..00F3KKr   rY   c                    | j         |k    rNt          d| j                  D ]6}| j                                         | j                            ||           7|S t          | j                   D ]}| j                                         | j                                        }t          | j        | j         z
  dz
            D ]}| j                                         |S )Nr   )r`  rA  r.   r   r^   r  r  r  )rP   rQ  rY   pr   r  s         r   r  z PosixFDHandleExchanger.broadcast9  s    91di(( 0 0	!!###$$VQ////M 49%% $ $	!!####\))++F49ty01455 $ $	!!####Mr   Nc                 .    t          j        |           d S rN   )r  r  r  s     r   r  zPosixFDHandleExchanger.cleanupH  s    
r   c                 8    | j                                          d S rN   )r  r  rO   s    r   r  zPosixFDHandleExchanger.closeK  s    r   rd   )re   rf   rg   rh   rE   r   r_  r  r  r   r   r  r   rX   r  r  r  __classcell__)r  s   @r   r  r    s       GG/] / /QT / / / / / /*) * * * * WT; W W W XW    c              r   r  
device_idxc                    t          t          j        t          j        j        |                     }|dk    rdS t          j                     	 t          j        |           }t          j                    }t          j	        |t          j        |                     |j        t
          j        k    r'|j        d         dk    r	 t          j                     dS 	 t          j                     dS # t          j                     w xY w)Nr   FT)r   r   cuDeviceGetAttributeCUdevice_attribute0CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTEDr   r   rK  c_nvmlGpuFabricInfoV_tnvmlDeviceGetGpuFabricInfoVr?   byrefstateNVML_GPU_FABRIC_STATE_COMPLETEDclusterUuidnvmlShutdown)r  fabric_handle_supportedrQ  fabric_infos       r   is_mnnvl_fabric_supportedr  O  s    -!#T	
 	
  !##u
O2:>>355*66<3L3LMMM!GGG'*a// 	 s   A4C. .Dc                   2   e Zd ZdZ	 	 	 d!dededededee         d	ed
efdZd Z	de
e         fdZde
e         fdZdefdZdefdZdedefdZdefdZdefdZdefdZdefdZdefdZded	efdZd ZdefdZd Zd Zd Zdedej        fd ZdS )"SymmDeviceMemoryz1Python port of SymmDeviceMemory from TensorRT-LLMNTbuf_sizer  r  r   comm_backend_for_handle_transferenable_multicastallocate_signal_padsc           
         t          t          j        |                    }t          t          j        |                    }	t          t          j        |	                     ddlm}
  |
            rdd lm} n	dd l	m
c m} t          |                    |                     || _        || _        || _        || _        d| _        d| _        |pt'                      | _        d| _        g | _        g | _        d| _        d| _        d| _        g | _        d| _        t:          | _        t          t          j        t          j        j         |                    }|dk    rtC          d          tE          || j                  | _        tG          j$        d| d| d| d| j                    tK          |          r&tM          | j        | j        | j                  | _'        n%tQ          | j        | j        | j                  | _'        | )                    ||           |rdg| j        z  | _        tU          | j                  D ]]}| j        |         | j        z   | j        |<   || j        k    r3t          t          j+        | j        |         d| j                             ^tY          | j                  | _        tY          | j                  | _        d S )	Nr   )has_cuda_cudart   z8[SymmDeviceMemory] Device does not support multicasting.z[SymmDeviceMemory] Rank: z, Group size: z, device_idx: z, Signal pad offset: )-r   r   cuDeviceGetcuDevicePrimaryCtxRetaincuCtxSetCurrentflashinfer.utilsr  cuda.cudartcudartcuda.bindings.runtimebindingsruntimecudaSetDevicer  r  r  r  signal_pad_offsetallocation_sizer   r   mc_ptruc_ptrssignal_padssignal_pads_devuc_ptrs_dev	mc_handle
uc_handlesSIGNAL_PAD_ALIGNMENTSIGNAL_PAD_SIZEr  r  'CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTEDr   r   r  r  r  r  
_exchangerr  _alloc_mn_mcast_memrA  
cuMemsetD8rJ   )rP   r  r  r  r  r  r  r  	cu_deviceprimary_ctxr  r  multicast_supportedr:  s                 r   r   zSymmDeviceMemory.__init__l  s"    $D$4Z$@$@AA	%d&CI&N&NOO,[99::: 	544444? 	3((((((( 322222222,,Z88999$$$ !" <L
 "$&(   	
 %'!. .%'O 
 
 !##J  
 "*(D4M!N!N;
 ; ;* ; ;%; ;"&"8; ;	
 	
 	
 %Z00 	/D!4?DO0 0DOO 5!4?DO DO 	  +;<<< 
	L !sT_4D4?++  &*l1o8N&N #''#(8(;Q@TUU   $:$:J#K#KD 1$,??r   c                    t          | d          r| j                                         t          j                    rdS 	 t          j                     n*# t          $ r}t          d|            Y d}~dS d}~ww xY w| j	        r&t          t          j        | j	                             | j        r&t          t          j        | j                             t          | d          r$| j        rt          | j                  D ]}| j        |         dk    r	 t          t          j        | j        |                              |t#          | j                  k     r?| j        |         r2t          t          j        | j        |         | j                             # t          $ r}t          d| d|            Y d}~d}~ww xY wt          | d          r3| j        r,t          t          j        | j        | j                             t          | d	          r| j        r| j        dk    r	 t          t          j        | j        | j                             t          t          j        | j        | j                             t          t          j        | j                             dS # t          $ r}t          d
|            Y d}~dS d}~ww xY wdS dS dS )z%Destructor - cleanup allocated memoryr  Nz4Destructor: CUDA context invalid, skipping cleanup: r  r   z1Destructor: Failed to release UC handle for rank r2   uc_base_ptrr  z)Destructor: Failed to release MC handle: )hasattrr  r  r   r   r   cuCtxGetCurrentr8   r7   r  r   	cuMemFreer  r  rA  r  rC  rA   r  rB  r  r  rE  total_uc_sizer  r  )rP   r;   r`  s      r   r   zSymmDeviceMemory.__del__  sZ    4&& 	$O!!###  	F	 """" 	 	 	LLLMMMFFFFF	
  	BDN4+?@@AAA 	>DN4+;<<=== 4&& 	4? 	do..  ?4(A--'(9$/$:O(P(PQQQ#dl"3"333T8J3+ $$(L$68L!" !"  
 %   [PT[[XY[[        ." t]++ 0@ )$*:D<NOO  
 4%% 	G$. 	GT^q=P=PGT=Q R RSSS)$+t7KLL     1$. A ABBBBB G G GE!EEFFFFFFFFFG	G 	G 	G 	G=P=PsJ    A 
A;A66A;BF
G'GG0A>J0 0
K:KKr   c                     | j         S zFGet the raw array of signal pad pointers to all ranks (including self))r  rO   s    r   get_signal_pad_ptrs_hostz)SymmDeviceMemory.get_signal_pad_ptrs_host      r   c                     | j         S zCGet the raw array of unicast pointers to all ranks (including self))r  rO   s    r   get_buffer_ptrs_hostz%SymmDeviceMemory.get_buffer_ptrs_host  s
    |r   c                     | j         S r  )r  rO   s    r   get_signal_pad_ptrs_devz(SymmDeviceMemory.get_signal_pad_ptrs_dev      ##r   c                     | j         S r  )r  rO   s    r   get_buffer_ptrs_devz$SymmDeviceMemory.get_buffer_ptrs_dev  r  r   r`  c                     |t          | j                  k    r+t          d| dt          | j                  dz
   d          | j        |         }|S )+Get the raw unicast pointer to a given rankzRank z out of range (0-r   ))rA   r  
ValueError)rP   r`  data_ptrs      r   get_unicast_ptrz SymmDeviceMemory.get_unicast_ptr  s\    3t|$$$$TTTTC<M<MPQ<QTTTUUU<% r   c                 *    t          | j                  S zGet the raw multicast pointer)rE   r  rO   s    r   get_multicast_ptrz"SymmDeviceMemory.get_multicast_ptr"  s     4;r   c                     | j         S )z(Get the rank of this device in the group)r  rO   s    r   r   zSymmDeviceMemory.get_rank(  
    r   c                     | j         S )z,Get the total number of devices in the group)r  rO   s    r   r   zSymmDeviceMemory.get_world_size,  r  r   c                     | j         S )z4Get the total allocation size (including signal pad))r  rO   s    r   get_allocation_sizez$SymmDeviceMemory.get_allocation_size0  r  r   c                      | j         | j        z
  S )z1Get the usable buffer size (excluding signal pad))r  r  rO   s    r   get_usable_buffer_sizez'SymmDeviceMemory.get_usable_buffer_size4  s    #d&:::r   c                     |                                   |                     |          \  }}|                     |           |r|                     |           dS dS )z0Allocate multi-node multicast memory using MNNVLN)_verify_cuda_context_get_allocation_prop_allocate_unicast_buffers_setup_multicast)rP   r  r  r   mc_props        r   r  z$SymmDeviceMemory._alloc_mn_mcast_mem8  sr    !!### $(#<#<X#F#F  	&&777  	+!!'*****	+ 	+r   c                    	 t          t          j                              }t          |          | j        k    rt          d| d| j                    dS dS # t          $ r}t          d|            Y d}~dS d}~ww xY w)z1Verify CUDA context is set to the correct device.z'CUDA context device mismatch! Current: z, Expected: zError checking CUDA context: N)r   r   r  rE   r  r7   r8   )rP   rJ  r;   s      r   r  z%SymmDeviceMemory._verify_cuda_contextF  s    	7,T-@-B-BCCN>""do55knkkZ^Zikk     65  	7 	7 	75!55666666666	7s   AA 
A?"A::A?c                    t          j                    }| j        j        |_        t           j        j        |_        t          j                    |_	        t           j
        j        |j	        _        | j        |j	        _        d|j        _        t!          t          j        |t           j        j                            }t)          || j        z   |          | _        t          j                    }| j        |_        | j        |_        | j        j        |_        t!          t          j        |t           j        j                            | _        t)          | j        | j                  | _        ||fS )zCCompute allocation size and return allocation/multicast properties.r   ) r   r   r  r  r   r   r   r   r   r   r   r   r  r   
allocFlagsgpuDirectRDMACapabler   r  r   r   r   r  r  CUmulticastObjectPropr  
numDevicesr.   handleTypescuMulticastGetGranularityCUmulticastGranularity_flags$CU_MULTICAST_GRANULARITY_RECOMMENDED_mc_granularity)rP   r  r   alloc_granularityr  s        r   r  z%SymmDeviceMemory._get_allocation_propQ  s@   244/3/J,#7U#'#5#7#7 "> 	 % '+o #:;"7 ,.5Z 
 
  (t++-> 
  

 ,..!_+"o9  /*1V  
  
  ((<d>RSS''r   c           
      p   dg| j         z  | _        t          t          j        | j        |d                    | j        | j        <   t          t          j        | j        | j                 | j        j	        d                    }| j        
                    |          }t          j                     t          | j                   D ]g}|| j        k    rZt          t          j        ||         | j        j	                            | j        |<   | j                            ||                    hdg| j         z  | _        | j        | j         z  }|| _        t          t          j        || j        dd                    }|| _        t          | j                   D ]f}| j        |z  }t)          |          |z   | j        |<   t          t          j        | j        |         | j        d| j        |         d                     g|                                 }	t          t          j        |||	gd                     dS )zFAllocate local UC memory, exchange handles with peers, and map memory.r   r   N)r  r  r   r   r  r  r  r  r  r  rX   cuCtxSynchronizerA  r#  r  r  r  r  r$  r  rE   r"  _get_mem_access_descr$  )
rP   r   local_shareable_uc_handleall_shareable_uc_handlesr  r  r  r:  offsetaccess_descs
             r   r  z*SymmDeviceMemory._allocate_unicast_buffersz  sC    #/ ,;T1?AFF,
 ,
(
 %4-0+ %
 %
! $(?#<#<=V#W#W  t'' 	E 	EADO##%47033 & &" ''(@(CDDD sT_,,t>*%$]D4H!QOO
 
 ' t'' 	 	A)A-F!+..7DLOLOT%91doa>PRS     //11][M1MM	
 	
 	
 	
 	
r   c           
         | j         dk    rYt          t          j        |                    | _        t          t          j        | j        | j        j        d                    }nd}| j                            |d          }t          j	                     | j         dk    rKt          t          j
        || j        j                            | _        | j                            |           t          t          j        | j        | j                             t          t          j        | j        | j        dd                    | _        t          t          j        | j        | j        d| j        d                     |                                 }t          t          j        | j        | j        |gd                     t          t          j        | j        d| j        | j                  d| j        d                     dS )z?Create multicast object, exchange handle, map memory, and bind.r   Nr  r   )r  r   r   cuMulticastCreater  r  r  r  r  r'  r#  r  cuMulticastAddDevicer  r  r  r$  r  r"  r(  r$  cuMulticastBindMemr  )rP   r  shareable_mc_handler,  s       r   r  z!SymmDeviceMemory._setup_multicast  s    ?a,T-CG-L-LMMDN"11NO/ # # #' #o778KRS7TT ?a,3'O/  DN O##$7888 	1$.$/RRSSS &$T%94;OQRTUVV
 
 	M$+t';QPQRR	
 	
 	
 //11T-AK=RSTT	
 	
 	

 	#0$ 		
 		
 		
 		
 		
r   c                     t          j                    }t          j                    |_        t           j        j        |j        _        | j        |j        _        t           j	        j
        |_        |S )z0Create memory access descriptor for this device.)r   r  r   r   r   r   r   r  r   r  r   r  )rP   r,  s     r   r(  z%SymmDeviceMemory._get_mem_access_desc  sS    *,,#133$($:$V!"&/ 2Ur   r   c                 `   |t           j        k    s|t           j        k    rd}d}t          j        }n3|t           j        k    rd}d}t          j        }nt          d|           | j        | j	        z
  |z  }t           |t          | j        | j                           ||                     d S )Ni   r   l        r0   zUnsupported dtype: )r!   bfloat16float16r   cuMemsetD16float32cuMemsetD32r  r  r  r   rE   r  r  )rP   r`  r   neg_zerodsizememset_funcnum_elementss          r   lamport_initializez#SymmDeviceMemory.lamport_initialize  s    EN""eu}&<&<HE*KKem##!HE*KK:5::;;; ,t/CCMKDL9::HlSS	
 	
 	
 	
 	
r   )NTT)re   rf   rg   rh   rE   r	   rL   r\  r   r   r   r  r  r   r  r	  r  r   r   r  r  r  r  r  r  r  r(  r!   r   r=  r   r   r   r  r  i  sr       ;; CG!%%)^@ ^@^@ ^@ 	^@
 ^@ +3;*?^@ ^@ #^@ ^@ ^@ ^@@:G :G :Gx $s)        d3i    $ $ $ $ $ S        C C     3        #        $S $ $ $ $; ; ; ; ;+C +4 + + + +	7 	7 	7'(S '( '( '( '(R9
 9
 9
v6
 6
 6
p  
s 
5; 
 
 
 
 
 
r   r  c                       e Zd ZdZ	 ddedededej        dee         f
dZ	d	ed
ej
        fdZ	 dded
ej
        dedej        fdZ	 dded
ej
        dedej        fdZdefdZd	edefdZdefdZdS )McastGPUBufferz
    Wrapper class for SymmDeviceMemory to facilitate PyTorch tensor creation.
    It manages a buffer accessible via unicast or multicast for multi-node communication.

    Python port of McastGPUBuffer from TensorRT-LLM
    Nr  r  r  r   r  c                     t          ||||j        |          | _        | j                                        | _        || _        dS )a&  
        Constructor for McastGpuBuffer.

        Args:
            buf_size: The requested size of the buffer in bytes. The actual usable size may differ due to alignment requirements.
            group_size: The number of ranks in the communication group
            group_rank: The rank of the local process within the group
            device: The CUDA device for buffer allocation
            mn_nvlink: Flag indicating if multi-node NVLink is used
            comm_backend_for_handle_transfer: Communication backend for handle transfer
        N)r  indexmcast_device_memoryr  r  local_device)rP   r  r  r  r   r  s         r   r   zMcastGPUBuffer.__init__  sM    & $4L,$
 $
  0GGII"r   r`  r   c                 <    | j                             ||           d S rN   )rB  r=  )rP   r`  r   s      r   r=  z!McastGPUBuffer.lamport_initialize0  s!     33D%@@@@@r   r   sizesstorage_offsetr   c                      t          d          )a|  
        Returns a PyTorch tensor view of the multicast buffer portion.

        Args:
            sizes: The desired shape (dimensions) of the tensor
            dtype: The data type of the tensor elements
            storage_offset: The offset in elements from the start of the buffer

        Returns:
            A PyTorch tensor wrapping the multicast buffer section
        Not implemented yetNotImplementedErrorrP   rE  r   rF  s       r   get_multicast_bufferz#McastGPUBuffer.get_multicast_buffer3  s      ""7888r   c                      t          d          )zN
        Returns a PyTorch tensor view of the unicast buffer portion.
        rH  rI  rK  s       r   get_unicast_bufferz!McastGPUBuffer.get_unicast_bufferE  s     ""7888r   c                 4    | j                                         S r  )rB  r  rO   s    r   r  z McastGPUBuffer.get_multicast_ptrO  s    '99;;;r   c                 6    | j                             |          S )r  )rB  r	  )rP   r`  s     r   r	  zMcastGPUBuffer.get_unicast_ptrS  s    '77===r   c                 4    | j                                         S )z$Get the buffer pointers device array)rB  r  rO   s    r   r  z"McastGPUBuffer.get_buffer_ptrs_devW  s    ';;===r   rN   )r   )re   rf   rg   rh   rE   r!   r   r	   rL   r   r   r=  tupleTensorrL  rN  r  r	  r  r   r   r   r?  r?  
  s         CG# ## # 	#
 # +3;*?# # # #<As A5; A A A A GH9 99#(;9@C9	9 9 9 9& GH9 99#(;9@C9	9 9 9 9<3 < < < <>C >C > > > >>S > > > > > >r   r?  )>r?   r  r  rg  rv  r  rl  abcr   r   dataclassesr   r   r   typingr   r   r   r	   r
   r   r!   cuda.bindingsr   r   rm   r;   
cuda_utilsr   dlpack_utilsr   r   r   r   OMPI_COMM_TYPE_HOSTr  MNNVL_DEBUGrE   r   rR  r   rS  r-   r\  r<   rJ   rL   rl   rj   ro   rq   r   r   r   r   r_  r  r  r  r  r  r?  r   r   r   <module>r\     s      				        # # # # # # # # ! ! ! ! ! !  



 ; ; ; ; ; ; ; ; ; ; ; ; ; ;  ,,,,,,, 	 	 	 	   kI
 
 	 			 ) ( ( ( ( ( D D D D D D D D         *# *S *S * * * *
!	!!#(;!;>!
\! ! ! !H C C D    <49     (@ @ @ @ @# @ @ @,  > > >) ) ) ) ) ) ) ).       0M1 M1 M1 M1 M1{ M1 M1 M1` $ $ $ $ $ $ $ $) ) ) ) ) ) ) )F
`, `, `, `, `, `, `, `,F! ! ! ! !c ! ! !>    O   (/ / / / /_ / / /d# $    4^
 ^
 ^
 ^
 ^
 ^
 ^
 ^
BO> O> O> O> O> O> O> O> O> O>s6   
A A:AA:A4#A//A44A:9A: