
    )`i                         d dl mZ ddlmZ ddlZddlmZ ddlZddlm	Z	m
Z
mZ d Z e
dg          d	             Z e
dg          d
             Zej        d             Z ei e          e	 ddej        dej        dej        deddf
d                        Z ei e          e	 ddej        dej        dej        deddf
d                        ZdS )   )flashinfer_api    )gen_dsv3_router_gemm_moduleN)SimpleNamespace)register_custom_opsupported_compute_capabilitybackend_requirementc                    |                                  dk    rt          d          |                                 dk    rt          d          |                                 dk    rt          d          |                     d          dk    rt          d          |                    d          dk    rt          d          |                    d          dk    rt          d	          | j        d         |j        d         k    rt          d
          |j        d         | j        d         k    rt          d          |j        d         |j        d         k    rt          d          d}d}d}| j        d         |k     s| j        d         |k    rt          d| d|           | j        d         |k    rt          d|           |j        d         |k    rt          d|           | j        t
          j        k    rt          d          |j        t
          j        k    rt          d          |j        |k    rt          d| d          dS )Nr   zmat_a must be a 2D tensorzmat_b must be a 2D tensorzout must be a 2D tensor   zmat_a must be row-majorzout must be row-majorr   zmat_b must be column-majorz.mat_a.shape[1] must be equal to mat_b.shape[0]z,out.shape[0] must be equal to mat_a.shape[0]z,out.shape[1] must be equal to mat_b.shape[1]i      z,mat_a.shape[0] (num_tokens) must be between z and z-mat_a.shape[1] (hidden_dim) must be equal to z.mat_b.shape[1] (num_experts) must be equal to zmat_a must be a bfloat16 tensorzmat_b must be a bfloat16 tensorzout must be a z tensorT)dim
ValueErrorstrideshapedtypetorchbfloat16)	mat_amat_boutlaunch_with_pdlexpected_num_expertsexpected_out_dtypeexpected_hidden_dim
min_tokens
max_tokenss	            s/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/gemm/routergemm_dsv3.py_mm_M1_16_K7168_shape_checksr      sp    yy{{a4555yy{{a4555
wwyyA~~2333 ||A!2333
zz!}}0111||A!5666{1~Q''IJJJ
y|u{1~%%GHHH
y|u{1~%%GHHH JJ{1~
""ek!nz&A&AX:XXJXX
 
 	
 {1~,,,Q<OQQ
 
 	
 {1~---S=QSS
 
 	

 {en$$:;;;{en$$:;;;
y&&&E*<EEEFFF4    d   c                 @    t          | |||dt          j                  S )N   r   r   )r   r   float32r   r   r   r   s       r   !_mm_M1_16_K7168_N256_shape_checksr&   D   s-    '  =   r   c                 @    t          | |||dt          j                  S )N   r#   )r   r   r   r%   s       r   !_mm_M1_16_K7168_N128_shape_checksr)   Q   s-    '  >   r   c                     t                                                      t          ddg          	 ddt          j        dt          j        dt          j        dt
          dd f
fd	            } t          d
dg          	 ddt          j        dt          j        dt          j        dt
          dd f
fd            }t          | |          S )Nzflashinfer::ml3_router_gemm_opr   )mutates_argsFr   r   r   returnc                 8                         | |||           d S N)ml3_router_gemm_opr   r   r   r   modules       r   mm_M1_16_K7168_N128z8get_dsv3_router_gemm_module.<locals>.mm_M1_16_K7168_N128a   s%     	!!%_EEEEEr   zflashinfer::dsv3_router_gemm_opc                 8                         | |||           d S r.   )dsv3_router_gemm_opr0   s       r   mm_M1_16_K7168_N256z8get_dsv3_router_gemm_module.<locals>.mm_M1_16_K7168_N256m   s%     	""5%oFFFFFr   )r2   r5   F)r   build_and_loadr   r   Tensorboolr   )r2   r5   r1   s     @r   get_dsv3_router_gemm_moduler:   ]   sS   (**99;;F(W   !&	F F|F|F \F 	F
 
F F F F F	 F )W   !&	G G|G|G \G 	G
 
G G G G G	 G //   r   )common_checkFr   r   r   r   r,   c                 N    t                                          | |||           dS )a0  Optimized GEMM for the router operation in Mistral Large 3.

    This function performs a highly optimized matrix multiplication specifically tailored
    for the expert routing GEMM in Mistral Large 3's Mixture of Experts (MoE) architecture.
    It computes out = mat_a @ mat_b where mat_a contains token embeddings and mat_b
    contains expert routing weights.

    The implementation is optimized for the specific problem dimensions used in Mistral Large 3:
    - Hidden dimension (K): 7168
    - Number of experts (N): 128
    - Number of tokens (M): 1-16

    Args:
        mat_a (torch.Tensor): Input token embeddings of shape (M, K) where M is the number
            of tokens (1-16) and K is the hidden dimension (7168). Must be bfloat16,
            row-major (contiguous).
        mat_b (torch.Tensor): Expert routing weights of shape (K, N) where K is the hidden
            dimension (7168) and N is the number of experts (128). Must be bfloat16,
            column-major (transposed layout).
        out (torch.Tensor): Pre-allocated output tensor of shape (M, N) containing the
            routing scores. Must be bfloat16, row-major (contiguous). This tensor is
            mutated in-place.
        launch_with_pdl (bool, optional): Whether to launch the kernel using Persistent
            Device-side Launch. Defaults to False.

    Returns:
        None: The result is written directly to the `out` tensor.

    Raises:
        ValueError: If tensor dimensions, strides, or data types do not match the
            expected Mistral Large 3 router configuration.

    Note:
        This kernel is specialized for compute capability 10.0 (Blackwell architecture).
        The specific problem size optimization makes this significantly faster than
        general-purpose GEMM implementations for the router operation.
    N)r:   r2   r%   s       r   r2   r2      6    Z  !!55uc?    r   c                 N    t                                          | |||           dS )a  Optimized GEMM for the router operation in DeepSeek-V3.

    This function performs a highly optimized matrix multiplication specifically tailored
    for the expert routing GEMM in DeepSeek-V3's Mixture of Experts (MoE) architecture.
    It computes out = mat_a @ mat_b where mat_a contains token embeddings and mat_b
    contains expert routing weights.

    The implementation is optimized for the specific problem dimensions used in DeepSeek-V3:
    - Hidden dimension (K): 7168
    - Number of experts (N): 256
    - Number of tokens (M): 1-16

    Args:
        mat_a (torch.Tensor): Input token embeddings of shape (M, K) where M is the number
            of tokens (1-16) and K is the hidden dimension (7168). Must be bfloat16,
            row-major (contiguous).
        mat_b (torch.Tensor): Expert routing weights of shape (K, N) where K is the hidden
            dimension (7168) and N is the number of experts (256). Must be bfloat16,
            column-major (transposed layout).
        out (torch.Tensor): Pre-allocated output tensor of shape (M, N) containing the
            routing scores. Must be float32, row-major (contiguous). This tensor is
            mutated in-place.
        launch_with_pdl (bool, optional): Whether to launch the kernel using Persistent
            Device-side Launch. Defaults to False.

    Returns:
        None: The result is written directly to the `out` tensor.

    Raises:
        ValueError: If tensor dimensions, strides, or data types do not match the
            expected DeepSeek-V3 router configuration.

    Note:
        This kernel is specialized for compute capability 10.0 (Blackwell architecture).
        The specific problem size optimization makes this significantly faster than
        general-purpose GEMM implementations for the router operation.
    N)r:   r5   r%   s       r   r5   r5      r=   r   r6   )api_loggingr   flashinfer.jitr   	functoolstypesr   r   flashinfer.utilsr   r   r	   r   r&   r)   cacher:   r8   r9   r2   r5    r   r   <module>rF      s   ( ( ( ( ( ( 6 6 6 6 6 6     ! ! ! ! ! !          3 3 3n se$$  %$ se$$  %$   B R&GHHH
 "	- -<-<- 
- 	-
 
- - -  IH-` R&GHHH
 "	- -<-<- 
- 	-
 
- - -  IH- - -r   