
    )`iC                     ,   d dl mZ d dlmZ d dlZd dlmZ d dlZd dlm	Z	m
Z
mZ  e
g d          d             Zej        d             Z ei e	          e	 ddej        dej        dededededej        dej        deddfd                        ZdS )    )flashinfer_api)gen_dsv3_fused_routing_moduleN)SimpleNamespace)register_custom_opsupported_compute_capabilitybackend_requirement)Y   Z   d   g   x   y   c	                    | j         d         }	||z  |k     s||k    rt          d||z   d| d| d| d	          |dk    rV|	|z  }
|
|z  }|dk    rt          d| d	          |
d
k    rt          d|
 d          |dk    rt          d| d          n2|	dk    rt          d|	 d          |dk    rt          d| d	          dS )a  Validate configuration parameters for DSv3 fused routing kernel.

    Args:
        scores: Input routing scores tensor
        bias: Per-expert routing bias tensor
        n_group: Number of expert groups
        topk_group: Number of top groups to select
        topk: Number of top experts to select per token
        routed_scaling_factor: Scaling factor for normalized weights
        topk_values: Output tensor for normalized expert weights
        topk_indices: Output tensor for selected expert indices
        launch_with_pdl: Whether to use Persistent Device-side Launch

    Raises:
        ValueError: If configuration is invalid or exceeds kernel limits
       z-Invalid configuration: topk_group * n_group (z) must be >= topk (z) and topk_group (z) must be <= n_group ()   z-Invalid configuration for n_group > 1: topk (z) must be <= 8    z>Invalid configuration for n_group > 1: num_experts / n_group (z) must be <= 32   zKInvalid configuration for n_group > 1: num_experts / n_group * topk_group (z) must be <= 128i  z4Invalid configuration for n_group = 1: num_experts (z) must be <= 384z-Invalid configuration for n_group = 1: topk (T)shape
ValueError)scoresbiasn_group
topk_grouptopkrouted_scaling_factortopk_valuestopk_indiceslaunch_with_pdlnum_expertsexperts_per_groupmax_experts_in_selected_groupss               {/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/fused_moe/fused_routing_dsv3.py#_check_dsv3_fused_routing_supportedr$      s   : ,q/K Gd""j7&:&:LJ<P L Lei L L)L LAHL L L
 
 	
 {{''1):Z)G&!88TTTT   r!!7%7 7 7   *C//E2E E E   0 d{ddd   !88TTTT   4    c                  P   t                                                      t          dddg          	 ddt          j        dt          j        dt
          d	t
          d
t
          dt          dt          j        dt          j        dt          dd ffd            } t          |           S )Nzflashinfer::NoAuxTcr   r   )mutates_argsTr   r   r   r   r   r   r   returnc	                 B    	                     | ||||||||	  	         d S )NNoAuxTc)
r   r   r   r   r   r   r   r   r   modules
            r#   r+   z.get_dsv3_fused_routing_module.<locals>.NoAuxTcW   s@     	!
	
 
	
 
	
 
	
 
	
r%   r*   T)	r   build_and_loadr   torchTensorintfloatboolr   )r+   r,   s    @r#   get_dsv3_fused_routing_moduler4   S   s    *,,;;==F#^4   !%
 

l
 
 	

 
  %
 \
 l
 
 

 
 
 
 
	 
.    r%   )common_checkTr   r   r   r   r   r   r   r   r   r(   c	                 X    t                                          | ||||||||	  	         dS )a  Fused expert routing with top-k selection for DeepSeek-V3.

    This function performs a highly optimized fused routing operation specifically
    designed for DeepSeek-V3's Mixture of Experts (MoE) architecture with grouped
    expert routing and no auxiliary loss. It combines score computation, expert
    selection, and normalization into a single kernel operation.

    The routing algorithm consists of the following steps:
    1. Compute biased scores: sigmoid(scores) + bias for each expert
    2. Group experts and compute group scores (sum of top-2 experts per group)
    3. Select top-k groups based on group scores
    4. From selected groups, select top-k experts based on biased scores
    5. Normalize selected expert weights: sigmoid_scores / sum(sigmoid_scores) * scale

    Args:
        scores (torch.Tensor): Input routing scores of shape (num_tokens, num_experts).
            The logits produced by the router network before activation. Supports
            bfloat16, float16, or float32.
        bias (torch.Tensor): Per-expert routing bias of shape (num_experts,). Added to
            sigmoid-activated scores to produce biased scores for expert selection.
            Must match the dtype of scores.
        n_group (int): Number of expert groups. Experts are divided into groups for
            hierarchical selection. Typical value is 8 for DeepSeek-V3 with 256 experts
            (32 experts per group).
        topk_group (int): Number of top groups to select. Must be <= n_group. Typical
            value is 4, meaning the top 4 groups are selected from 8 groups.
        topk (int): Number of top experts to select per token. Must be <= num_experts.
            Typical value is 8, meaning 8 experts are routed per token.
        routed_scaling_factor (float): Scaling factor applied to normalized expert
            weights. The final output weights are:
            sigmoid_scores / sum(sigmoid_scores) * routed_scaling_factor.
        topk_values (torch.Tensor): Pre-allocated output tensor of shape
            (num_tokens, topk) for the normalized expert weights. Must be float32.
            This tensor is mutated in-place.
        topk_indices (torch.Tensor): Pre-allocated output tensor of shape
            (num_tokens, topk) for the selected expert indices. Must be int32 or int64.
            This tensor is mutated in-place.
        launch_with_pdl (bool, optional): Whether to launch the kernel using Persistent
            Device-side Launch. Defaults to True.

    Returns:
        None: Results are written directly to `topk_values` and `topk_indices` tensors.

    Note:
        - The kernel uses float32 internally for all computations to ensure numerical
          precision, even when inputs are float16 or bfloat16.
        - This implementation is optimized for Hopper (compute capability 90, 100),
          Ada (compute capability 89), and Blackwell (compute capability 120, 121)
          architectures.
        - The "NoAux" prefix indicates this variant does not compute auxiliary losses
          (e.g., load balancing loss) during routing.
        - The "Tc" suffix indicates the use of Tensor Core optimizations in the
          underlying CUDA kernel.
    N)r4   r+   )	r   r   r   r   r   r   r   r   r   s	            r#   fused_topk_deepseekr7   w   sH    F "##++
 
 
 
 
r%   r-   )flashinfer.api_loggingr   flashinfer.jitr   	functoolstypesr   r/   flashinfer.utilsr   r   r   r$   cacher4   r0   r1   r2   r3   r7    r%   r#   <module>r?      s   1 1 1 1 1 1 8 8 8 8 8 8     ! ! ! ! ! !           :::;;B B <;BJ      F R&IJJJ !K KLK
,K K 	K
 K !K K ,K K 
K K K  KJK K Kr%   