
    )`i-              0       	   U d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlmZ ddlmZmZmZ ddlZddlmZ ddlmZ d	d
lmZ d	dlmZmZ ddlmZmZm Z   G d d          Z! G d d          Z" G d d          Z# G d d          Z$ G d d          Z%ej&        d             Z'dZ(dZ)dZ* ed          	 dwde+de+de+d e
e         d!e	e	e+                  f
d"            Z,	 dwd#e	e	e+                  d e
e         d!dfd$Z-d%Z.d&Z/ ed'          	 	 	 	 	 dxd)e+de+de+d*e0d e
e         d+e0d,e
e         d-e0d!eee	e	e+                  ej1        f         ee	e	e+                  ej1        e2f         ee	e	e+                  ej1        e	e         e2f         f         fd.            Z3	 dwd#e	e	e+                  d e
e         d!dfd/Z4d0 Z5d1e+d2e+d3ej6        d!dfd4Z7d5e+d6e+d7e+d2e+d3ej6        d!dfd8Z8 ed9          d:ej1        d;ej1        de+d)e+d<e+d=e#d>e!d?e"d@e0dAe+dBej1        dCej1        dDej1        dEe
ej1                 dFe
ej1                 dGe
ej1                 dHe
ej1                 dIe
e9         dJe
ej1                 dKe
ej1                 dLe
ej1                 dMe
ej1                 d!df.dN            Z:dOdPdQdRZ;e2e+e+f         e<dS<   d<e+dTe+d3ej6        dUe+d!e0f
dVZ=d<e+dTe+dUe+d3ej6        dWe2d!dfdXZ> edY          	 dwdZej1        dUe+d[e+d<e+dTe+d\ej1        d@e0d]e0d^e0d_e$d`e
e0         dae
ej1                 dbe
ej1                 dce
ej1                 dde
ej1                 dee
ej1                 dfe
ej1                 dge
ej1                 dhe
e9         die
eej1        e9f                  dje
e%         dWe
e2         d!df.dk            Z?dUe+d[e+d<e+dTe+d\ej1        d@e0dbej1        dgej1        dhe9die9dle+dmej1        dnej1        doej1        dje
e%         dpe
ej1                 dce
ej1                 dde
ej1                 dee
ej1                 dfe
ej1                 d!df*dqZ@dZej1        dbej1        drej1        dsej1        ddej1        dcej1        d\ej1        d@e0d[e+dUe+dIe9dte
ej1                 due
ej1                 d!dfdvZAdS )ya3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)c_void_pcast)SimpleNamespace)ListOptionalTupleUnion)
deprecated)CommBackendSymmDeviceMemoryTorchDistBackend)ProcessGroup   )gen_trtllm_comm_module)register_custom_opround_up   )create_shared_buffercudartfree_shared_bufferc                   *    e Zd ZdZdZdZdZdZdZdZ	dS )	AllReduceStrategyTyper   r   r               N)
__name__
__module____qualname__NCCLMIN_LATENCYUBAUTOONESHOTTWOSHOTLOWPRECISION     m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/comm/trtllm_ar.pyr   r   "   s1        DK	
BDGGLLLr(   r   c                       e Zd ZdZdZdS )AllReduceStrategyConfigr   r   N)r   r   r   
USE_MEMCPY	PUSH_MODEr'   r(   r)   r+   r+   -   s        JIIIr(   r+   c                   6    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdS )AllReduceFusionOpr   r   r   r   r   r   r         	   N)r   r   r   NONERESIDUAL_RMS_NORMLAST_PROCESS_FOR_UBRESIDUAL_RMS_PREPOST_NORMRESIDUAL_RMS_NORM_QUANT_FP8RESIDUAL_RMS_NORM_QUANT_NVFP4RESIDUAL_RMS_NORM_OUT_QUANT_FP8!RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4MOE_ALLREDUCE_RESIDUAL_RMS_NORM(MOE_FINALIZE_ALLREDUCE_RESIDUAL_RMS_NORMr'   r(   r)   r/   r/   3   sK        D !"#$%!&'#()%&'#/0,,,r(   r/   c                   &    e Zd ZdZdZdZdZdZdZdS )AllReduceFusionPatternr   r   r   r   r   r   N)	r   r   r   
kAllReducekARResidualRMSNormkARResidualRMSNormFP8QuantkARResidualRMSNormFP4QuantkARResidualRMSNormOutFP8QuantkARResidualRMSNormOutFP4Quantr'   r(   r)   r>   r>   A   s5         J!"!"$%!$%!!!r(   r>   c                       e Zd ZdZdZdZdS )QuantizationSFLayoutr   r   r   N)r   r   r   SWIZZLED_128x4SWIZZLED_8x4LINEARr'   r(   r)   rF   rF   Q   s!         NL FFFr(   rF   c            0      
   t                                                      t          ddg          dt          dt          dt          j        dd ffd            } t          d	g d
          dt          dt          dt          dt          dt          j        dd ffd            }t          d          t          dg d          dt          j        dt          j        dt          dt          dt          dt          dt          dt          dt          dt          dt          j        dt          j        dt          j        dt          t          j                 d t          t          j                 d!t          t          j                 d"t          t          j                 d#t          t                   d$t          t          j                 d%t          t          j                 d&t          t          j                 d't          t          j                 dd f.fd(                        }t          d)g d*          d+t          j        d,t          d-t          dt          d.t          d/t          j        dt          d0t          d1t          d2t          d3t          d4t          t          j                 d5t          t          j                 d6t          t          j                 d7t          t          j                 d8t          t          j                 d9t          t          j                 d:t          t          j                 d;t          t                   d<t          t          t          j        t          f                  d=t          t                    dd f,fd>            }t          d?g d@          d,t          d-t          dt          d.t          d/t          j        dt          d5t          j        d:t          j        d;t          d<t          dAt          dBt          j        dCt          j        dDt          j        d=t          t                    dEt          t          j                 d6t          t          j                 d7t          t          j                 d8t          t          j                 d9t          t          j                 dd f*fdF            }t          dGd6d7g          d+t          j        d5t          j        dHt          j        dIt          j        d7t          j        d6t          j        dt          dJt          j        d-t          d,t          d#t          dKt          t          j                 dLt          t          j                 dd ffdM            }t#          | |||||N          S )ONz%flashinfer::trtllm_lamport_initializebuffer)mutates_args
buffer_ptrsizedtypereturnc                 6                         | ||           d S N)trtllm_lamport_initialize)rM   rN   rO   modules      r)   rS   z9get_trtllm_comm_module.<locals>.trtllm_lamport_initializeg   s#     	((T5AAAAAr(   z)flashinfer::trtllm_lamport_initialize_allbuffer_0_ptrbuffer_1_ptrbuffer_2_ptrrN   rO   rV   rW   rX   c                 :                         | ||||           d S rR   )trtllm_lamport_initialize_all)rV   rW   rX   rN   rO   rT   s        r)   rZ   z=get_trtllm_comm_module.<locals>.trtllm_lamport_initialize_allo   s2     	,,,dE	
 	
 	
 	
 	
r(   trtllm_create_ipc_workspace_for_all_reduce and trtllm_custom_all_reduce are deprecated and will be removed in the next major bump, use allreduce.py instead.z$flashinfer::trtllm_custom_all_reduceinpouttp_sizetp_rank	token_numfusion_op_codestrategy_codeconfig_codelaunch_with_pdl
flag_valuepeer_comm_buffer_ptrspeer_barrier_ptrs_inpeer_barrier_ptrs_outbiasresidualweightweight_pre_residual_normepsintermediate_bufferlamport_peer_comm_buffer_ptrs_0lamport_peer_comm_buffer_ptrs_1lamport_peer_comm_buffer_ptrs_2r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   c                 \                         | |||||||||	|
|||||||||||           d S rR   )trtllm_custom_all_reduce)r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rT   s                         r)   rt   z8get_trtllm_comm_module.<locals>.trtllm_custom_all_reduce~   sj    l 	''! !$+++-	
 	
 	
 	
 	
r(   z#flashinfer::trtllm_allreduce_fusionallreduce_in
world_size
world_rankra   
hidden_dimworkspace_ptrsre   use_oneshottrigger_completion_at_endfp32_accpattern_codeallreduce_outresidual_inresidual_outnorm_out	quant_out	scale_out	rms_gammarms_epsscale_factorlayout_coderv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   c                 Z                         | |||||||||	|
||||||||||           d S rR   )trtllm_allreduce_fusion)rv   rw   rx   ra   ry   rz   re   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   rT   s                        r)   r   z7get_trtllm_comm_module.<locals>.trtllm_allreduce_fusion   sg    b 	&&%+	
 	
 	
 	
 	
r(   z'flashinfer::trtllm_moe_allreduce_fusion)r^   r_   r`   ra   ry   rz   re   r   r   r   r    moe_reduction_device_num_expertsmoe_reduction_scale_input(moe_reduction_active_experts_token_inputmoe_reduction_token_inputr   r   r   r   r   r   r   r   r   r   moe_allreduce_outc                 X                         | |||||||||	|
|||||||||           d S rR   )trtllm_moe_allreduce_fusion)rw   rx   ra   ry   rz   re   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rT   s                       r)   r   z;get_trtllm_comm_module.<locals>.trtllm_moe_allreduce_fusion  sd    ` 	**,%4%)	
 	
 	
 	
 	
r(   z0flashinfer::trtllm_moe_finalize_allreduce_fusionnorm_weightexpanded_idx_to_permuted_idx	workspaceshared_expert_outputexpert_scale_factorc                 J                         | |||||||||	|
||           d S rR   )$trtllm_moe_finalize_allreduce_fusion)rv   r   r   r   r   r   re   r   rx   rw   rn   r   r   rT   s                r)   r   zDget_trtllm_comm_module.<locals>.trtllm_moe_finalize_allreduce_fusion]  sN    & 	33( 	
 	
 	
 	
 	
r(   )rS   rZ   rt   r   r   r   )r   build_and_loadr   inttorchrO   r
   Tensorr/   r   r+   boolr   floatr>   r	   rF   r   )rS   rZ   rt   r   r   r   rT   s         @r)   get_trtllm_comm_moduler   c   s~   #%%4466F/xj  BB"B+0;B	B B B B B B
 3VVV  	
	
	
 	
 		

 {	
 
	
 	
 	
 	
 	
	 	
  	g  .
 
 
  6/
\/
\/
 /
 	/

 /
 */
 -/
 -/
 /
 /
  %|/
 $l/
  %|/
 u|$/
 5<(/
  &!/
" #+5<"8#/
$ e_%/
& &el3'/
( *2%,)?)/
* *2%,)?+/
, *2%,)?-/
. 
//
 /
 /
 /
 /
7  </
b -
 
 
  4-
l-
-
 -
 	-

 -
 -
 -
 -
 $(-
 -
 --
  --
 el+-
 u|,-
 5<(-
  EL)!-
" EL)#-
$ EL)%-
& %'-
( uU\5%89:)-
* 23+-
, 
--
 -
 -
 -
 -
5 4-
^ 1
 
 
  4+
+
+
 +
 	+

 +
 +
 \+
 <+
 +
 +
 +.+
 $)<+
 38,+
 $)<+
 23+
  $EL1!+
" u|,#+
$ 5<(%+
& EL)'+
( EL))+
* 
++
 +
 +
 +
 +
5 4+
Z :$j1  
l
\
 \
 ',l	

 ,
 l
 
 <
 
 
 
 'u|4
 &el3
 

 
 
 
 
	 
> ";&C!9 7$?-Q   r(            r[   rankr_   max_token_numgrouprP   c                    ||z  |z  dz  }t           dz   dz  }||z  dz  }|t          z  |z  |z  dz  }t                      }	|||||||fD ]5}
t          |
d          }|	                    t          ||                     6t          d|  dd |	D                         t          |	d         |          |	d         |          |	d	         |          |dz  t          j	                   t          j        |
           |	S )ax  
    Parameters:
    - rank: the rank of the current process.
    - tp_size: the size of the process group.
    - max_token_num: the maximum number of tokens in a sequence.
    - hidden_dim: the dimension of the hidden states.
    - group: the process group to use.

    Note:
    This function is used to create a workspace for all reduce.
    The workspace is a list of IPC handles.
    The workspace should be initialized before calling trtllm_custom_all_reduce.
    The workspace should be destroyed after calling trtllm_custom_all_reduce.
    The workspace can be reused for multiple all reduce calls under the same configuration.

    We would init 7 IPC buffers for trtllm_custom_all_reduce.
    They are sized as follows:
    [buffer_size, buffer_size, flag_size, flag_size, lamport_buffer_size, lamport_buffer_size, lamport_buffer_size]
    where:
    - buffer_size: tp_size * max_token_num * hidden_dim * sizeof(float) * (maxBeamWidth)
    - flag_size: (MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t) * tp_size * 2
    - lamport_buffer_size: tp_size * LamportTokenNumThreshold * tp_size * hidden_dim * sizeof(half)

    They are for:
    ipcHandles[0] - peer_comm_buffer_ptrs
    ipcHandles[2] - peer_barrier_ptrs_in
    ipcHandles[3] - peer_barrier_ptrs_out
    ipcHandles[4] - lamport_peer_comm_buffer_ptrs[0:tp_size]
    ipcHandles[5] - lamport_peer_comm_buffer_ptrs[tp_size:tp_size * 2]
    ipcHandles[6] - lamport_peer_comm_buffer_ptrs[tp_size * 2:tp_size * 3]

    We use tp_size and world_size here interchangeably (customAllReduce).

    Reference: trtllm, cpp/tests/unit_tests/kernels/allReduce/allReduceKernelTest.cu, Workspace init
    r   r   r       rank  allocated ipc_handles: c                 &    g | ]}d  |D             S )c                 ,    g | ]}t          |          S r'   hex.0handles     r)   
<listcomp>zItrtllm_create_ipc_workspace_for_all_reduce.<locals>.<listcomp>.<listcomp>  s    /R/R/RF/R/R/Rr(   r'   r   sublists     r)   r   z>trtllm_create_ipc_workspace_for_all_reduce.<locals>.<listcomp>  s(    .n.n.nW^/R/R'/R/R/R.n.n.nr(   r   r   r   )MAX_ALL_REDUCE_BLOCKSLamportTokenNumThresholdlistr   appendr   printrZ   r   float16distbarrier)r   r_   r   ry   r   buffer_size	FLAG_SIZE	flag_sizelamport_buffer_sizeipc_handlesrN   aligned_sizes               r)   *trtllm_create_ipc_workspace_for_all_reducer     sN   \ M)J6:K&*a/IG#a'I!$<<wFSVWW&&K 	 F F  g../eDDEEEE	ppp.n.nbm.n.n.npp   "AtAtAtq    	Lur(   r   c                 0    | D ]}t          ||           dS )a&  
    Note:
    This function is used to destroy a workspace for all reduce.
    The workspace is a list of IPC handles.
    The workspace should be destroyed after calling trtllm_custom_all_reduce.
    The workspace can be reused for multiple all reduce calls under the same configuration.
    Nr   r   r   
ipc_handles      r)   +trtllm_destroy_ipc_workspace_for_all_reducer     s0       . .
:u----. .r(      i  ztuse the unified API allreduce.py instead. It will internally call trtllm_create_ipc_workspace_for_all_reduce_fusion.Fr`   use_fp32_lamportcreate_metadatacomm_backenduse_symm_dev_memc	                    ||rt          |          }|r|st          d          ||z  |z  dz  }	|t          z  dz  }
|s||z  |z  dz  n
||z  |z  dz  }|t          k    r't	          j        d| dt           d           t          }|d	z  }t                      }t                      }|	|
|fD ]}t          |d
          }|s$|                    t          ||                     8t          ||| t          j        d|           j        |dd          }|                    |j                   |                    |           t          d|  dd |D                         t          |d
          }|r+t!          |d         |          |dz  t          j                   n*t!          |d         |          |dz  t          j                   t                      }|D ]/}t'          |          D ]}|                    ||                    0	 t)          j        d          }t)          j        |dd           |                    dd          }t)          j        t3          |j        dz             t7          |t2                    d           t          d|           |                    |j                   t'          t9          |                    D ]-}t          d|  d| dt;          ||                               .t          j        |t          j        t          j        d                    }|r|                                  ntC          j         |           |r| |||||	|
||d	}|r||||fS |||fS ||fS )a  
    Parameters:
    - tp_rank: the rank of the current process.
    - tp_size: the size of the process group.
    - max_token_num: the maximum number of tokens in a sequence.
    - hidden_dim: the dimension of the hidden states.
    - use_fp32_lamport: if True, we will use fp32 datatype in allreduce fusion.
    - group: the process group to use.
    - create_metadata: if True, return metadata dict as third element (default: False).
    - comm_backend: the communication backend to use.
    - use_symm_dev_mem: if True, we will use symmetric device memory for the workspace.

    Returns:
    - If create_metadata=False: (ipc_handles, workspace_tensor)
    - If create_metadata=True: and use_symm_dev_mem=False: (ipc_handles, workspace_tensor, metadata)
      where metadata contains: tp_rank, tp_size, max_token_num, hidden_dim,
      use_fp32_lamport, buffer_size, flag_size, lamport_comm_size, lamport_buffer_size
    - If create_metadata=True: and use_symm_dev_mem=True: (ipc_handles, workspace_tensor, mem_handles,metadata)
      where metadata contains: tp_rank, tp_size, max_token_num, hidden_dim,
      use_fp32_lamport, buffer_size, flag_size, lamport_comm_size, lamport_buffer_size
      and mem_handles is a list of SymmDeviceMemory objects.

    Note: The optional parameters make the API clunky at this time. This will be refactored in the future, at the cost of backward compatibility, where the default behavior will be
    create_metadata=True and use_symm_dev_mem=True.

    Note:
    We would init 3 IPC buffers for trtllm_custom_all_reduce_fusion.
    They are sized as follows:
    [buffer_size, flag_size, lamport_buffer_size * 3]
    where:
    - buffer_size: tp_size * max_token_num * hidden_dim * sizeof(half)
    - flag_size: tp_size * BarrierFlagCount * sizeof(int)
    - lamport_buffer_size: tp_size * max_token_num * tp_size * hidden_dim * sizeof(half)
      where sizeof(elem) = 2 (fp16/bf16) or 4 (fp32 when use_fp32_lamport=True)
    The workspace is passed as workspace field in AllReduceFusionParams.

    We use tp_size and world_size here interchangeably (allReduceFusion).

    Reference: trtllm, cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu, Workspace init
    Nr   z<use_symm_dev_mem is only supported when create_metadata=Truer   r   zwarning: lamport_comm_size  is greater than MAX_COMM_SIZE z, set to MAX_COMM_SIZEr   r   cudaF)enable_multicastallocate_signal_padsr   r   c                 &    g | ]}d  |D             S )c                 ,    g | ]}t          |          S r'   r   r   s     r)   r   zPtrtllm_create_ipc_workspace_for_all_reduce_fusion.<locals>.<listcomp>.<listcomp>e  s    2U2U2U63v;;2U2U2Ur(   r'   r   s     r)   r   zEtrtllm_create_ipc_workspace_for_all_reduce_fusion.<locals>.<listcomp>e  s(    1q1q1qZa2U2UW2U2U2U1q1q1qr(      r   little)	byteorder   z%set flag_ptr[3] = lamport_comm_size: zRank z workspace[z] rO   device)	r`   r_   r   ry   r   r   r   lamport_comm_sizer   )"r   
ValueErrorBarrierFlagCountMAX_COMM_SIZEloggingwarningr   r   r   r   r   r   r   indexuc_ptrsr   rS   float32r   ranger   
cudaMalloc
cudaMemsetto_bytes
cudaMemcpyr   valuer   lenr   tensorint64r   r   )r`   r_   r   ry   r   r   r   r   r   r   r   r   r   r   mem_handlesrN   r   symm_memaligned_lamport_buffer_sizer   r   r   flag_ptrlamport_comm_size_bytesiworkspace_tensormetadatas                              r)   1trtllm_create_ipc_workspace_for_all_reduce_fusionr     s   v  0'e444  Y YWXXXM)J6:K**Q.I
  	6-*,q00}$z1A5 
 =(( B*;  B  B\i  B  B  B	
 	
 	
 *+a/
 $(66K*.&&Ki)<= ) )  g.. 	)3L%HHIIII'VW--3!&%*  H x/000x((((	sss1q1qep1q1q1qss  
 #++>"H"H 
!N7#%@A%Eu}	
 	
 	
 	
 	"N7#%@A%Eu}	
 	
 	

 I! / /
'NN 	/ 	/DZ-....	/  ''H
h5)))/88h8OO
%'(($/F*Q*QST   

13DEEEX^$$$3y>>"" D DBgBB!BBs9Q</@/@BBCCCC |U\&-A-A    "5!!!! -*$ 0&"!2#6

 

  	; 0+xGG 0(:: ,,,r(   c                 0    | D ]}t          ||           dS )a  
    Parameters:
    - workspace: the workspace to destroy.
    - group: the process group to use.

    Note:
    This function is used to destroy a workspace for all reduce fusion.
    The workspace is a list of IPC handles.
    The workspace should be destroyed after calling trtllm_custom_all_reduce_fusion.
    The workspace can be reused for multiple all reduce fusion calls under the same configuration.
    Nr   r   s      r)   2trtllm_destroy_ipc_workspace_for_all_reduce_fusionr     s0       . .
:u----. .r(   c                 B    d } || d          } ||d          }||z  S )z
    Helper function to compute the padded size of the fp4 swizzled layout.

    Parameters:
    - total_row: the total number of rows.
    - total_column: the total number of columns.
    c                     | |z   dz
  |z  |z  S )Nr   r'   )xys     r)   pad_upz3compute_fp4_swizzled_layout_sf_size.<locals>.pad_up  s    Qq A%%r(   r   r   r'   )	total_rowtotal_columnr   
padded_rowpadded_columns        r)   #compute_fp4_swizzled_layout_sf_sizer    s@    & & & 	3''JF<++M%%r(   rM   rN   rO   c                 L    t                                          | ||           d S rR   )r   rS   )rM   rN   rO   s      r)   rS   rS     s&    66z4OOOOOr(   rV   rW   rX   c                 P    t                                          | ||||           dS )a7  
    Initialize 3 lamport buffers by negative zero.

    Parameters:
    - buffer_0_ptr: the pointer to the first buffer.
    - buffer_1_ptr: the pointer to the second buffer.
    - buffer_2_ptr: the pointer to the third buffer.
    - size: the size of the buffer.
    - dtype: the data type of the buffer.
    N)r   rZ   rU   s        r)   rZ   rZ     s7    $ ::lL$    r(   ztrtllm_create_ipc_workspace_for_all_reduce and trtllm_custom_all_reduce are deprecated, use trtllm_create_ipc_workspace_for_all_reduce_fusion and trtllm_allreduce_fusion insteadr]   r^   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   c                 r    t                                          | |||||||||	|
|||||||||||           dS )a  
    Parameters:
    - inp: the input tensor. [token_num, hidden_dim]
    - out: the output tensor. [token_num, hidden_dim]
    - tp_size: the size of the process group.
    - tp_rank: the rank of the current process.
    - token_num: the number of tokens in the sequence.
    - fusion_op_code: the fusion operation code.
    - strategy_code: the strategy code.
    - config_code: the config code.
    - launch_with_pdl: whether to launch with pdl.
    - flag_value: the flag value.
    - peer_comm_buffer_ptrs: the peer communication buffer pointers.
    - peer_barrier_ptrs_in: the peer barrier pointers in.
    - peer_barrier_ptrs_out: the peer barrier pointers out.
    - bias: the bias tensor. [hidden_dim]
    - residual: the residual tensor. [token_num, hidden_dim]
    - weight: the weight tensor. [hidden_dim]
    - weight_pre_residual_norm: the weight pre residual norm tensor. [hidden_dim]
    - eps: the epsilon value.
    - intermediate_buffer: the intermediate buffer tensor.
    - lamport_peer_comm_buffer_ptrs_0: the lamport peer communication buffer pointers 0.
    - lamport_peer_comm_buffer_ptrs_1: the lamport peer communication buffer pointers 1.
    - lamport_peer_comm_buffer_ptrs_2: the lamport peer communication buffer pointers 2.
    N)r   rt   r\   s                         r)   rt   rt     so    j 55 '''-    r(   i   @   *   )r   r   r1   _use_oneshot_heuristicsry   rw   c                 V    | |z  dz  |z  |j         z  dz  dz  }|t          |         k    S )Nr   i   )itemsizer  )ra   ry   rO   rw   comm_size_mbs        r)   _should_use_oneshotr  K  s?     	J"Z/%.@4G$N  2:>>>r(   r   c                    g }g d}|D ]}||vr|                     d|            |r1dd                    d |D                       z   }t          |          ||d         k    r+|                     d| d|d          d	|d          d
           | |z  |d         |d         z  k    r.|                     d|  d| d|d          d|d          d	           |d         |t          j        k    k    r+|                     d|d          d| d|d          d
           |r1dd                    d |D                       z   }t          |          d S )N)r   r_   ry   r   z,Workspace metadata is missing required key: z&Workspace metadata validation failed:

c              3       K   | ]	}d | V  
dS z  - Nr'   r   es     r)   	<genexpr>zCcheck_trtllm_allreduce_fusion_workspace_metadata.<locals>.<genexpr>a  sA       J
 J
J1JJJ
 J
 J
 J
 J
 J
r(   r_   zworld_size (z$) does not match workspace tp_size (z%). Workspace was created for tp_size=.r   ry   ztoken_num (z) * hidden_dim (z#) exceeds workspace max_token_num (z(). This may cause Illegal Memory Access.r   zuse_fp32_lamport (z%) does not match allreduce_in.dtype (z.). Workspace was created for use_fp32_lamport=zWorkspace validation failed:
c              3       K   | ]	}d | V  
dS r  r'   r  s     r)   r  zCcheck_trtllm_allreduce_fusion_workspace_metadata.<locals>.<genexpr>{  sA       A
 A
J1JJA
 A
 A
 A
 A
 A
r(   )r   joinr   r   r   )	ra   ry   rw   rO   r   errorsrequired_keyskey	error_msgs	            r)   0check_trtllm_allreduce_fusion_workspace_metadatar  T  si    FRRRM P PhMMNNNOOO $=		 J
 J
 &J
 J
 J
 A
 A
 
	 ### Xi(((H: H H8T]K^ H H19)1DH H H	
 	
 	
 : 9H\<R RRR5) 5 5Z 5 5dlm|d} 5 5  PX  Ye  Pf 5 5 5	
 	
 	
 "#(>??Z*<!= Z Zdi Z Z:BCU:VZ Z Z	
 	
 	
  $4tyy A
 A
 &A
 A
 A
 8
 8
 
	 ###	$ $r(   zZuse the unified API allreduce.py instead. It will internally call trtllm_allreduce_fusion.rv   rx   rz   r|   r}   r~   r{   r   r   r   r   r   r   r   r   r   r   c                 v   |t          |||| j        |           |
t          ||| j        |          }
|
s||k    s
J d            | j        t          j        k    r||z  dz  |z  n
||z  dz  |z  }|t
          k    r$|
r"t          j        d| dt
           d           d}
|at          |t          j	                  r |
                    t          j                  }n't          j        |gt          j        | j        	          }t                                          | |||||||
|||	||||||||||
           dS )a.  
    Parameters:
    - allreduce_in: the input tensor. [token_num, hidden_dim]
    - world_size: the size of the process group.
    - world_rank: the rank of the current process.
    - token_num: the number of tokens in the sequence.
    - hidden_dim: the dimension of the hidden states.
    - workspace_ptrs: the workspace pointers.
    - launch_with_pdl: whether to launch with pdl.
    - use_oneshot: whether to use oneshot. If None, internal heuristics will be used.
    - trigger_completion_at_end: whether to trigger completion at the end.
    - fp32_acc: whether to use fp32 accumulation.
    - pattern_code: the pattern code.
    - allreduce_out: the output tensor. [token_num, hidden_dim]
    - residual_in: the residual input tensor. [token_num, hidden_dim]
    - residual_out: the residual output tensor. [token_num, hidden_dim]
    - norm_out: the norm output tensor. [token_num, hidden_dim]
    - quant_out: the quant output tensor. [token_num, hidden_dim]
    - scale_out: the scale output tensor. Initialization referece: tests/comm/test_trtllm_allreduce_fusion.py
    - rms_gamma: the rms gamma tensor. [hidden_dim]
    - rms_eps: the rms epsilon value.
    - scale_factor: the scale factor. For cudaGraphs safety, it should be a tensor.
    - layout_code: the layout code.
    - metadata: optional workspace metadata dict from create_ipc_workspace_for_all_reduce_fusion.
                If provided, validates that token_num <= max_token_num, world_size == tp_size,
                and hidden_dim == workspace hidden_dim. Raises ValueError if validation fails.
    Nz-sequence length should be larger than tp_sizer   r   required_lamport_comm_size r   ". Cannot use oneshot in this case.Fr   ru   )r  rO   r  r   r   r   r   r   
isinstancer   tor   r   r   r   )rv   rw   rx   ra   ry   rz   re   r|   r}   r~   r{   r   r   r   r   r   r   r   r   r   r   r   required_lamport_comm_sizes                          r)   r   r     s   p 8z:|/A8	
 	
 	
 )z<#5z
 
  W:%%%'V%%% .. 	J"Z//#a'*4  "M11k1 W*D  W  Wer  W  W  W	
 	
 	
 lEL11 	'??5=99LL <emL<O  L 44!%'";!#!!+ 5     r(   r   r   r   r   r   c                     |                                 dz  | z  }|t          k    rt          d| dt           d          t                                          | |||||||||	|
|||||||||           dS )a&  
    Parameters:
    - world_size: the size of the process group.
    - world_rank: the rank of the current process.
    - token_num: the number of tokens in the sequence.
    - hidden_dim: the dimension of the hidden states.
    - workspace_ptrs: the workspace pointers.
    - launch_with_pdl: whether to launch with pdl.
    - residual_in: the residual input tensor. [token_num, hidden_dim]
    - rms_gamma: the rms gamma tensor. [hidden_dim]
    - rms_eps: the rms epsilon value.
    - scale_factor: the scale factor.
    - moe_reduction_device_num_experts: the number of experts.
    - moe_reduction_scale_input: the scale input tensor. [token_num, hidden_dim]
    - moe_reduction_active_experts_token_input: the active experts token input tensor. [token_num, hidden_dim]
    - moe_reduction_token_input: the token input tensor. [token_num, hidden_dim]
    - layout_code: the layout code.
    - moe_allreduce_out: the moe allreduce output tensor. [token_num, hidden_dim]
    - residual_out: the residual output tensor. [token_num, hidden_dim]
    - norm_out: the norm output tensor. [token_num, hidden_dim]
    - quant_out: the quant output tensor. [token_num // 4, hidden_dim], fp16/bf16 -> fp4
    - scale_out: the scale output tensor. Initialization referece: tests/comm/test_trtllm_moe_allreduce_fusion.py
    r   r  r   r  )rw   rx   ra   ry   rz   re   r   r   r   r   r   r   r   r   r   r   r   r   r   r   N)numelr   r   r   r   )rw   rx   ra   ry   rz   re   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s                        r)   r   r     s    \ ";!@!@!B!BQ!F!S "M11 W*D  W  Wer  W  W  W
 
 	
 88%'!)I";1Y";+!) 9     r(   r   r   r   r   c                     |                                  dz  |	z  }|t          k    rt          d| dt           d          t                                          | |||||||||	|
||           dS )aJ  
    Parameters:
    - allreduce_in: the input tensor. [token_num, top_k, hidden_dim]
    - residual_in: the residual input tensor. [token_num, hidden_dim]
    - norm_weight: the norm weight tensor. [hidden_dim]
    - expanded_idx_to_permuted_idx: the expanded index to permuted index tensor. [token_num, top_k]
    - norm_out: the norm output tensor. [token_num, hidden_dim]
    - residual_out: the residual output tensor. [token_num, hidden_dim]
    - workspace_ptrs: the workspace pointers.
    - launch_with_pdl: whether to launch with pdl.
    - world_rank: the rank of the current process.
    - world_size: the size of the process group.
    - eps: the epsilon value.
    - shared_expert_output: the shared expert output tensor. [token_num, hidden_dim]
    - expert_scale_factor: the expert scale factor tensor. [token_num, top_k]
    r   r  r   r  )rv   r   r   r   r   r   r   re   rx   rw   rn   r   r   N)r"  r   r   r   r   )rv   r   r   r   r   r   rz   re   rx   rw   rn   r   r   r   s                 r)   r   r   ?  s    @ ".!3!3!5!5!9J!F "M11 W*D  W  Wer  W  W  W
 
 	
 AA!%A! '1/ B     r(   rR   )FNFNF)B__doc__	functoolsr   ctypesr   r   typesr   typingr   r   r   r	   typing_extensionsr
   flashinfer.comm.mnnvlr   r   r   r   torch.distributeddistributedr   r   jit.commr   utilsr   r   cuda_ipcr   r   r   r   r+   r/   r>   rF   cacher   OneShotMaxTokenr   r   r   r   r   r   r   r   r   dictr   r   r  rO   rS   rZ   r   rt   r  __annotations__r  r  r   r   r   r'   r(   r)   <module>r4     s
           ! ! ! ! ! ! ! ! ! ! ! ! ! ! / / / / / / / / / / / / ( ( ( ( ( ( Q Q Q Q Q Q Q Q Q Q              * * * * * * - - - - - - 0 0 0 0 0 0 0 0 F F F F F F F F F F              1 1 1 1 1 1 1 1& & & & & & & &        $ c c cP	     c  %)M M
MM M
 L!M 
$s)_M M M Mb AE. .DI.'/'=.	. . . .  - z  #$(!*."u- u-u-u- u-
 u- L!u- u- ;'u- u- 	$tCy/5<
'(	$tCy/5<
-.	$tCy/5<.>)?
EFHu- u- u- u-r AE. .DI.'/'=.	. . . .(& & &"P# PS P PQU P P P P  	
 ; 
   .  x I	I	I I 	I
 I &I )I )I I I !<I  ,I !<I 5<
 I u|$I  U\"!I" 'u|4#I$ 
%%I& "%,/'I( &.el%;)I* &.el%;+I, &.el%;-I. 
/I I I I^ 		+ + c3h   ?? #?,1K?EH?	? ? ? ?*$*$*$ *$ ;	*$
 *$ 
*$ *$ *$ *$Z ` 2  $-j j,jj j 	j
 j Lj j  $j j )j $j EL)j %,'j 5<(j u|$j  %!j" %#j$ %%j& e_'j( 5u!456)j* ./+j, tn-j. 
/j j j jZKKK K 	K
 LK K K |K K K '*K  %|K /4lK  %|K ./K   -!K" 5<(#K$ u|$%K& %'K( %)K* 
+K K K K\6,66 6 #(,	6
 l6 ,6 L6 6 6 6 
6 #5<06 "%,/6 
6 6 6 6 6 6r(   