
    )`ie              $       h   d Z 	 ddlmZmZmZmZmZmZmZ ddl	m
Z
 ddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ  G d de
          Zdedededededej         de!fdZ"dedededededej         de!fdZ#de$e         dedededededej         de$e         fdZ%e	 	 	 	 	 	 	 	 	 d7ded          dededededej         d!ed"ee         d#e!de
fd$            Z&e	 	 	 	 	 	 	 	 	 	 	 	 	 d8d&ej'        d'e
d(ed)e!d*eej'                 d+eej'                 d,eej'                 d-eej'                 d.eej'                 d/eej'                 d0eej'                 d1e(d2eeej'        e(f                  d3ee         d4ee!         d5e!dej'        f"d6            Z)dS )9a3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    )UnionLiteralOptionalTupleListcastAny   )AllReduceFusionWorkspaceN)flashinfer_api)trtllm_allreduce_fusion)1trtllm_create_ipc_workspace_for_all_reduce_fusion)0check_trtllm_allreduce_fusion_workspace_metadata)Mapping)CommBackendSymmDeviceMemory)AllReduceFusionPattern)MNNVLAllReduceFusionWorkspace)MNNVLAllreduceFusionStrategy)trtllm_mnnvl_allreduce)(trtllm_mnnvl_fused_allreduce_add_rmsnormc                        e Zd ZdZej        dfdededededej        dee	         f fd	Z
ed
efd            Zd Z	 ddedededej        dee         d
efdZddZ xZS )TRTLLMAllReduceFusionWorkspacez,TensorRT-LLM workspace for AllReduce fusion.Ntp_sizetp_rankmax_token_num
hidden_dimdtypecomm_backendc           
         t                                          ||           t          |||||d|t          j        k    d          | _        t          t          t          t          t                            t          j
        t          t                   t          f         | j                  }|d         | _        |d         | _        |d         | _        |d         | _        dS )a  
        Create TensorRT-LLM AllReduce fusion workspace.

        Args:
            tp_size: Tensor parallel size (world size)
            tp_rank: Tensor parallel rank
            max_token_num: Maximum number of tokens
            hidden_dim: Hidden dimension size
            dtype: Data type
            comm_backend: Communication backend
            **kwargs: Additional arguments for workspace creation
        T)r   r   r   r   r   create_metadatause_fp32_lamportuse_symm_dev_memr   r
         N)super__init__r   torchfloat32_internal_workspacer   r   r   intTensorr   dictipc_handlesworkspace_tensormem_handlesmetadata)	selfr   r   r   r   r   r   workspace_tuple	__class__s	           m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/comm/allreduce.pyr'   z'TRTLLMAllReduceFusionWorkspace.__init__]   s    * 	'*** $U'!% "em3!	$
 	$
 	$
  $tCy/5<6F1GMN$
 
 +1- / 2*1-'*    returnc                     dS )Ntrtllm r2   s    r5   backendz&TRTLLMAllReduceFusionWorkspace.backend   s    xr6   c                     |                     d          r(t          dt          |           j         d| d          t	          | j        |          S )z=Delegate attribute access to internal workspace if not found._'z' object has no attribute ')
startswithAttributeErrortype__name__getattrr*   )r2   names     r5   __getattr__z*TRTLLMAllReduceFusionWorkspace.__getattr__   s\    ??3 	 KDJJ'KKDKKK   t/666r6   
num_tokensuse_oneshotc                     	 t          ||||| j                   dS # t          $ r}t          d|            Y d }~dS d }~ww xY w)NTz,Workspace is insufficient for problem size. F)r   r1   
ValueErrorprint)r2   r   rG   r   r   rH   es          r5   is_buffer_size_sufficientz8TRTLLMAllReduceFusionWorkspace.is_buffer_size_sufficient   sp    	<J   4 	 	 	DDDEEE55555	s    
A>Ac                 J    t          | dd          rdS | `| `| `| `d| _        dS )z%Destroy workspace and free resources.
_destroyedFNT)rD   r.   r/   r0   r1   rO   r;   s    r5   destroyz&TRTLLMAllReduceFusionWorkspace.destroy   s=    4u-- 	F!Mr6   )N)r7   N)rC   
__module____qualname____doc__r(   float16r+   r   r   r   r'   propertystrr<   rF   r	   boolrM   rP   __classcell__)r4   s   @r5   r   r   Z   s<       66 #].2,+ ,+,+ ,+ 	,+
 ,+ {,+ {+,+ ,+ ,+ ,+ ,+ ,+\     X7 7 7 &*   	
 { c] 
   "	 	 	 	 	 	 	 	r6   r   r<   
world_sizerankr   r   r   r7   c                     |dk    S )z}
    Check if trtllm backend CAN be used for workspace creation.

    Hard requirements:
    - Up to 16 ranks supported.
       r:   r<   rY   rZ   r   r   r   s         r5   _trtllm_workspace_checkr^      s     r6   c                     dS )zE
    Check if mnnvl backend CAN be used for workspace creation.

    Tr:   r]   s         r5   _mnnvl_workspace_checkr`      s	     4r6   suitable_backendsc                 T    | sg S t          |           dk    r| S d| v rdgS | d         gS )a  
    Select best backend for workspace creation based on performance.

    Called by decorator after checking which backends pass requirements.
    Uses benchmarking data to pick fastest option.

    Args:
        suitable_backends: List of backends that passed hard requirement checks
        backend: Requested backend ("auto", "trtllm", or "mnnvl")
        world_size: Number of ranks
        rank: Current rank
        max_token_num: Maximum number of tokens
        hidden_dim: Hidden dimension size
        dtype: Data type
        **kwargs: Additional arguments

    Note that at this point, the backend selection does not take "runtime parameters" into account, such as layout_code, and fusion pattern.

    Returns:
        List containing the selected backend (single element)
    r
   mnnvlr   )lenra   r<   rY   rZ   r   r   r   s          r5   _workspace_creation_heuristicrf      sO    <  	
""   ###y!!$%%r6   autoF)r9   rc   rg   gpus_per_noder   force_oneshot_supportc	           	      |   |,t          t          j                                        |          }| dk    rg }	t	          | |||||          r|	                    d           t          | |||||          r|	                    d           |	st          d          t          |	| |||||          }
|
d         }n| }|dk    rt          ||||||	          S |dk    rNt          ||||
          }d}|r"t          j        ||||t          j                  }t          ||||||          S t          d|           )u  
    Create workspace for AllReduce fusion operations.

    Backend selection uses topology-based checks and heuristics.

    **Important: Workspace Reusability**
    The workspace is allocated based on the total size (max_token_num * hidden_dim * dtype_size).
    You can reuse the same workspace with different shapes as long as the total size fits:

    - Workspace(max_token_num=2048, hidden_dim=4096) can handle:
      - (token_num=2048, hidden_dim=4096) ✓
      - (token_num=1024, hidden_dim=4096) ✓
      - (token_num=4096, hidden_dim=2048) ✓ (same total size)
      - (token_num=1024, hidden_dim=8192) ✓ (same total size)
      - (token_num=4096, hidden_dim=4096) ✗ (too large)

    Use `workspace.is_buffer_size_sufficient(token_num, hidden_dim, dtype)` to check before use.

    Args:
        backend: Backend to use ("trtllm", "mnnvl", or "auto")
                 "auto" uses heuristic to select best backend
        world_size: Number of ranks in the process group
        rank: Current rank ID
        max_token_num: Maximum number of tokens to support
        hidden_dim: Hidden dimension size
        dtype: Data type for communication tensors
        gpus_per_node: Number of GPUs per node (for multi-node topology).
        comm_backend: Communication backend to use.
        force_oneshot_support: Allocate workspace for oneshot strategy vs twoshot
                    True: Allocate workspace for oneshot strategy up to the largest problem size requested
                    False: Allocate workspace for twoshot strategy for all problem sizes, and for oneshot strategy up to the heuristic threshold.
                    Note that only the workspace for MNNVL backend needs to be initialized with the correct strategy.
                    The trtllm backend will be sufficient for both strategies.

    Returns:
        Workspace object (TRTLLMAllReduceFusionWorkspace or MNNVLAllReduceFusionWorkspace)
        The workspace type determines which backend will be used in allreduce_fusion()

    Raises:
        BackendSupportedError: If no suitable backend available for the configuration
        ValueError: If problem size not supported for the specified backend

    Examples:
        >>> # Auto-select best backend
        >>> workspace = create_allreduce_fusion_workspace(
        ...     backend="auto",
        ...     world_size=8,
        ...     rank=0,
        ...     max_token_num=2048,
        ...     hidden_dim=4096,
        ...     dtype=torch.bfloat16,
        ... )
        >>> print(workspace.backend)  # "trtllm"
        >>> print(workspace.get_workspace_capacity())  # 8388608 elements

        >>> # Check if workspace can handle different problem sizes
        >>> workspace.is_buffer_size_sufficient(1024, 4096, 8, torch.bfloat16)  # True
        >>> workspace.is_buffer_size_sufficient(4096, 2048, 8, torch.bfloat16)  # True (same total)

        >>> # Explicit backend selection
        >>> workspace = create_allreduce_fusion_workspace(
        ...     backend="mnnvl",
        ...     world_size=16,
        ...     rank=0,
        ...     max_token_num=2048,
        ...     hidden_dim=4096,
        ...     dtype=torch.bfloat16,
        ... )
        >>> print(workspace.backend)  # "mnnvl"
    Nrg   r]   r9   rc   zNo suitable backend found. re   r   )r   r   r   r   r   r   )rY   rZ   rh   r   )mappingmax_num_tokensr   r   r   buffer_size_in_byteszUnknown backend: )minr(   cudadevice_countr^   appendr`   rJ   rf   r   r   r   get_required_buffer_size_bytesr   ONESHOTRuntimeError)r<   rY   rZ   r   r   r   rh   r   ri   ra   selectedactual_backendrk   rm   s                 r5   !create_allreduce_fusion_workspacerw     s   d EJ3355zBB&"!'!
 
 
 	/ $$X...!!'!
 
 
 	. $$W---  	<:;;; 1/!'!
 
 
 "!  !!-'!%
 
 
 	
 
7	"	"!'	
 
 
  $  		-L!08  ! -(!%!5
 
 
 	
 ?~??@@@r6   ư>input	workspacepatternlaunch_with_pdloutputresidual_outnorm_out	quant_out	scale_outresidual_in	rms_gammarms_epsscale_factorlayout_coderH   fp32_accc           
      x   t          |t                    r| j        \  }}|t          j        |           }d } || d          } ||d          }|	 ||	d          nd}| ||d          nd}| ||d          nd}| ||d          nd}t          d%i d	|d
|j        d|j        d|d|d|j        d|d|d|d|d|d|d|d|d|d|d|d|
d|d|d|d|j	         ||S ||S |S t          |t                    r|t          j        k    r#|t          j        k    rt          d| d          |t          d          |t          j        k    r+|t          j        |           }t          | |||           |S |t          j        k    rj|	t          d          |
t          d           |t          j        |           }|t          j        |           }t!          | |	|
|||||!          \  }}|S t          d"|           t#          d#t%          |           d$          )&a  
    AllReduce + RMSNorm fusion operation.

    Backend is automatically determined from workspace type. If you need another backend, create the workspace for the desired backend.

    Supports multiple fusion patterns:
    - AllReduce only
    - AllReduce + Residual + RMSNorm
    - AllReduce + Residual + RMSNorm + Quantization (FP8/FP4)

    **Note on Workspace Reusability:**
    You can reuse the same workspace with different (token_num, hidden_dim) combinations
    as long as `workspace.is_buffer_size_sufficient(token_num, hidden_dim, tp_size, dtype)` returns True.

    Args:
        input: Input tensor [token_num, hidden_dim]
        workspace: Workspace object (type determines backend, see create_allreduce_fusion_workspace)
        pattern: Fusion pattern (AllReduceFusionPattern constant, 0-5)
                 - kAllReduce = 0
                 - kARResidualRMSNorm = 1
                 - kARResidualRMSNormFP8Quant = 2
                 - kARResidualRMSNormFP4Quant = 3
                 - kARResidualRMSNormOutFP8Quant = 4
                 - kARResidualRMSNormOutFP4Quant = 5
                 Note: MNNVL only supports patterns 0 and 1
        launch_with_pdl: Use Persistent Dependency Launch

        # ===== OUTPUT tensors (pre-allocated, filled by function) =====
        output: AllReduce output [token_num, hidden_dim]
        residual_out: Prenorm output (after residual add, before norm) [token_num, hidden_dim]
        norm_out: Normalized output [token_num, hidden_dim]
        quant_out: Quantized output [token_num, hidden_dim] [trtllm only]
        scale_out: Quantization scale factors [trtllm only]

        # ===== INPUT parameters =====
        residual_in: Residual tensor to ADD [token_num, hidden_dim]
        rms_gamma: RMSNorm weight [hidden_dim]
        rms_eps: RMSNorm epsilon for numerical stability
        scale_factor: Input scale factor for quantization [trtllm only]
        layout_code: Scale factor layout (QuantizationSFLayout) [trtllm only]

        # ===== Control parameters =====
        use_oneshot: Use oneshot strategy vs twoshot
                     If None, uses internal heuristics.
                     Note: when explicitly set to True, the MNNVL backend needs to be initialized with a sufficiently large workspace.
        fp32_acc: [trtllm only] Use FP32 accumulation for AllReduce

    Returns:
        Output tensor (typically norm_out for fusion cases, output otherwise)

    Examples:
        >>> # Basic AllReduce + Residual + RMSNorm
        >>> workspace = create_allreduce_fusion_workspace(
        ...     backend="auto",
        ...     world_size=8,
        ...     rank=0,
        ...     max_token_num=2048,
        ...     hidden_dim=4096,
        ...     dtype=torch.bfloat16,
        ... )
        >>>
        >>> # Pre-allocate output tensors
        >>> prenorm = torch.empty_like(hidden_states)
        >>> normed = torch.empty_like(hidden_states)
        >>>
        >>> # Call fusion - backend inferred from workspace type
        >>> output = allreduce_fusion(
        ...     input=hidden_states,
        ...     workspace=workspace,
        ...     pattern=AllReduceFusionPattern.kARResidualRMSNorm,
        ...     launch_with_pdl=True,
        ...     residual_out=prenorm,
        ...     norm_out=normed,
        ...     residual_in=residual,
        ...     rms_gamma=norm_weight
        ... )
        >>> # output == normed (final result)

        >>> # With FP8 quantization
        >>> quant = torch.empty_like(hidden_states, dtype=torch.float8_e4m3fn)
        >>> scales = torch.empty(token_num * hidden_dim // 16, dtype=torch.float16)
        >>>
        >>> output = allreduce_fusion(
        ...     input=hidden_states,
        ...     workspace=workspace,
        ...     pattern=AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
        ...     norm_out=normed,
        ...     quant_out=quant,
        ...     scale_out=scales,
        ...     residual_in=residual,
        ...     rms_gamma=norm_weight,
        ...     scale_factor=scale_tensor
        ... )
    Nc                 x    |                                  st          | d          |                     d          S )Nz must be contiguous)is_contiguousrJ   view)trE   s     r5   _flatten_checkedz*allreduce_fusion.<locals>._flatten_checked@  s;    ??$$ ? D!=!=!=>>>66"::r6   ry   r}   r   r~   r   r   allreduce_inrY   
world_rank	token_numr   workspace_ptrsr|   trigger_completion_at_endr   pattern_coderH   allreduce_outr   r   r   r   r   r1   z4MNNVL AllReduce+RMS fusion does not support pattern z(. Please try the TRTLLM backend instead.zLMNNVL AllReduce does not support quantization fusion and thus no layout_code)ry   rz   r|   r}   z/MNNVL AllReduce+RMS fusion requires residual_inz-MNNVL AllReduce+RMS fusion requires rms_gamma)ry   r   gammarz   epsilonr}   r~   r|   z'Unsupported pattern for MNNVL backend: zUnknown workspace type: zJ. Expected TRTLLMAllReduceFusionWorkspace or MNNVLAllReduceFusionWorkspacer:   )
isinstancer   shaper(   
empty_liker   rY   rZ   r/   r1   r   r   kARResidualRMSNorm
kAllReducerJ   r   r   	TypeErrorrB   )ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   rH   r   r   r   r   
input_flatoutput_flatresidual_in_flatresidual_out_flatnorm_out_flatquant_out_flatnorm_resultresidual_results                              r5   allreduce_fusionr     s   j );<< E
 !&	: >%e,,F	 	 	
 &%eW55
&&vx88 & [-888 	 ' \>::: 	 7?6JXz222PT 	 9B8MY444SW 	 	  	
 	
 	
#	
 ++	
 !~~	
  i		

 "z	
 %55	
 ,O	
 '6o	
 X	
 !	
 $	
 &+	
 )(	
 +*	
 #]	
  %n!	
"  i#	
$  i%	
& G'	
( &)	
* $+	
, ''-	
 	
4 O"M	I<	=	= =
-@@@1<<<xwxxx   "^  
 ,777~)%00"# /	    M.AAA " !RSSS  !PQQQ  +E22#$/66 ,T'#) /	, 	, 	,(K  PwPPQQQ XtI X X X
 
 	
r6   )	rg   NNNNNNNF)FNNNNNNNrx   NNNF)*rS   typingr   r   r   r   r   r   r	   workspace_baser   r(   flashinfer.api_loggingr   	trtllm_arr   r   r   rk   r   rc   r   r   r   trtllm_mnnvl_arr   r   r   r   r   rV   r+   r   rW   r^   r`   listrf   rw   r,   floatr   r:   r6   r5   <module>r      s     D D C C C C C C C C C C C C C C C C C 4 4 4 4 4 4  1 1 1 1 1 1 . . . . . . H H H H H H G G G G G G       0 0 0 0 0 0 0 0 . - - - - - : : : : : : 9 9 9 9 9 9 3 3 3 3 3 3 E E E E E E$W W W W W%= W W W~  	
  ; 
   "  	
  ; 
   *,&Cy,&,& ,& 	,&
 ,& ,& ;,& 
#Y,& ,& ,& ,&h 28*."'bA bA./bAbA bA 	bA
 bA ;bA bA ;'bA  bA bA bA bA bAT 
 "%)+/'+(,(,*.(,9=!%"&'y
 y
<y
'y
 y
 	y
 U\"y
 5<(y
 u|$y
 %y
 %y
 %,'y
 %y
 y
 5u!456y
  #!y
$ $%y
& 'y
( \)y
 y
 y
 y
 y
 y
r6   