
    )`ig                    +   d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
 ddlmZ ddlZddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZ ddlmZmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$ ddl#m%Z% ddl#m&Z& ddl#m'Z' ddl#m(Z( ddl#m)Z) ddl#m*Z* ddl#m+Z+ ddl#m,Z, ddl#m-Z- ddl#m.Z. ddl/m0Z0 ddl#m1Z1 dZ2	 ddl3Z3dZ2nM# e4$ r Y nFe5$ r?Z6 e7e6          8                                Z9 e:d dD                       Z;e;s Y dZ6[6ndZ6[6ww xY wddl<m=Z= ddlm>Z>m?Z?m@Z@mAZAmBZBmCZCmDZD dZEd ZFd!ejG        d"eHe7         fd#ZIejJ        d$             ZK e"d%d&g          dejL        ddd'fd(ejM        d)ejM        d*e	ejM                 d+ejN        d,e	ejM                 d-eOd.ed/         fd0            ZP e"d%d&g          dejL        ddd'fd(ejM        d)ejM        d*e	ejM                 d+ejN        d,e	ejM                 d-eOd.ed/         fd1            ZQdddejL        d'fd(ejM        d)ejM        d,e	ejM                 d-eOd*e	ejM                 d+ejN        d.ed/         fd2ZRdddejL        d'fd3ee7         d(ejM        d)ejM        d,e	ejM                 d-eOd*e	ejM                 d+ejN        d.ed/         fd4ZS e!ePeQd5eReS6          edddejL        d'fd(ejM        d)ejM        d,e	ejM                 d-eOd*e	ejM                 d+ejN        d.ed/         d7ejM        fd8                        ZT e"d%d&g          dejL        d9fd:ejM        d;ejM        d*e	ejM                 d+ejN        d.ed9         f
d<            ZUdejL        d9fd:ejM        d;ejM        d*e	ejM                 d+ejN        d.ed9         f
d=ZVdejL        d9fd3ee7         d:ejM        d;ejM        d*e	ejM                 d+ejN        d.ed9         fd>ZW e!d9eUieVeW6          edejL        d9fd:ejM        d;ejM        d*e	ejM                 d+ejN        d.ed9         d7ejM        fd?                        ZXejJ        d@             ZYejJ        dA             ZZejJ        dB             Z[ejJ        dC             Z\ejJ        dD             Z] e edEdFee          f edGdHdI           fJ          Z^ejJ        dK             Z_ e edEdFee          f edGdHdL           fJ          Z`d(ejM        d)ejM        d,ejM        d-eOd*ejM        dMejM        dNee7         d7dfdOZad(ejM        d)ejM        dPejM        dQejM        d*ejM        dMejM        dNee7         d7dfdRZbdSe7dTe7fdUZcejJ        dV             ZdejJ        dW             ZedXeffdYZgejJ        ejL        dfdZejN        d[eOfd\            Zhe	 	 dd(ejM        d)ejM        d,ejM        d-eOd*e	ejM                 d7ejM        fd]            ZiejJ        d^             Zj	 dd_ejM        d`ejM        daejM        dbeOdcefddejM        dee	ejM                 fdfZk	 dd_ejM        d`ejM        daejM        dbeOdcefddejM        dee	ejM                 fdgZl G dh di          Zm G dj dke          Zndl Zodm Zpdn Zqdardoejs        jt        fdpZudZejN        fdqZvdZejN        fdrZwejJ        ds             ZxejJ        	 ddueffdv            Zy	 ddueffdwZz	 ddueffdxZ{ejJ        dy             Z|dz Z}dZejN        fd{Z~d|ejM        d(ejM        d)ejM        d}ejM        d~ejM        d*e	ejM                 dejN        fdZd Zd Zd ZedejL        ddfd(ejM        d)ejM        de	ejM                 d+ejN        d*e	ejM                 d.ed         fd            ZdejL        ddddtfd(ejM        d)ejM        dejM        dejM        de	ejM                 d+ejN        d*e	ejM                 defdeOdueffdZdejL        dddddtfd(ejM        d)ejM        dejM        dejM        de	ejM                 d+ejN        d*e	ejM                 defdeOdMejM        dueffdZd ZdejL        dddddfd(ejM        d)ejM        dejM        dejM        de	ejM                 d+ejN        d*e	ejM                 defdeOd.ed         deOfdZ e"g d          dejL        dddddfd(ejM        d)ejM        dejM        dejM        de	ejM                 d+ejN        d*e	ejM                 defdeOd.ed         deOfd            Z e"d%d&g          dejL        dddddfd(ejM        d)ejM        dejM        dejM        de	ejM                 d+ejN        d*e	ejM                 defdeOd.ed         deOfd            Z e"g d          dejL        dddddfd(ejM        d)ejM        dejM        dejM        de	ejM                 d+ejN        d*e	ejM                 defdeOd.ed         deOfd            ZdejL        dddddfd3ee7         d(ejM        d)ejM        dejM        dejM        de	ejM                 d+ejN        d*e	ejM                 defdeOd.ed         deOfdZd Z e edEdEee          f eddd            eddd           fJ          Z e edEdEee          f eddd            eddd           fJ          Z e!eeedee6          edejL        dddddfd(ejM        d)ejM        dejM        dejM        de	ejM                 d+ejN        d*e	ejM                 defdeOd.ed         deOd7ejM        fd                        Z e"g d          	 	 dd:ejM        d;ejM        dejM        dejM        dZejN        d*e	ejM                 d.ed         fd            Z e"g d          	 	 dd:ejM        d;ejM        dejM        dejM        dZejN        d*e	ejM                 d.ed         fd            Z e"g d          	 	 dd:ejM        d;ejM        dejM        dejM        dZejN        d*e	ejM                 d.ed         fd            Z	 	 dd:ejM        d;ejM        dejM        dejM        dZejN        d*e	ejM                 d.ed         fdZ	 	 dd3ee7         d:ejM        d;ejM        dejM        dejM        dZejN        d*e	ejM                 d.ed         fdZ e!eeedee6          e	 	 dd:ejM        d;ejM        dejM        dejM        dZejN        d*e	ejM                 d.ed         d7ejM        fd                        Z e"g d          	 	 	 	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        de	ed                  defde
efefeff         d*e	ejM                 d+e	ejN                 d.ed         fd            Z e"d%d&g          	 	 	 	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        de	ed                  defde
efefeff         d*e	ejM                 d+e	ejN                 d.ed         fd            Z	 	 	 	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        de	ed                  defde
efefeff         d*e	ejM                 d+e	ejN                 d.ed         fdZ e!eede          e	 	 	 	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        de	ed                  defde
efefeff         d*e	ejM                 d+e	ejN                 d.ed         d7ejM        fd                        ZejJ        d             Z e"g d          	 	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        de	ed                  defd*e	ejM                 d+e	ejN                 fd            Z e!i e          e	 	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        de	ed                  defd*e	ejM                 d+e	ejN                 d7ejM        fd                        Z e"g d          	 	 	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        dejM        de
efefeff         ded         defd*e	ejM                 d+e	ejN                 fd            Z e!i e          e	 	 	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        dejM        de
efefeff         ded         defd*e	ejM                 d+e	ejN                 d7ejM        fd                        Z e"g d          	 	 	 	 	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        dejM        defdefdefdefdeOd*e	ejM                 d+e	ejN                 fdƄ            Z e!i e          e	 	 	 	 	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        dejM        defdefdefdefdeOd*e	ejM                 d+e	ejN                 d7ejM        fdǄ                        ZeZdejM        fdȄZejJ        dɄ             Z e"d%d&g          	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        dejM        de
efefeff         d*e	ejM                 d+e	ejN                 d7eOfd˄            Z e!i e          e	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        dejM        de
efefeff         d*e	ejM                 d+e	ejN                 fd̄                        Z e"d%d&g          	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        dejM        defde
efefeff         d*e	ejM                 d+e	ejN                 d7eOfdτ            Z e!i e          e	 	 	 dd(ejM        d)ejM        d}ejM        d~ejM        dejM        defde
efefeff         d*e	ejM                 d+e	ejN                 fdЄ                        ZejJ        dф             Ze	 	 	 	 ddejM        dejM        de	ejM                 de	ejM                 d*e	ejM                 d+e	ejN                 d7ejM        fdք            Zdefdefdefdefd7e
efefeff         f
dڄZejJ        dۄ             ZejL        dddtfd(ejM        d)ejM        d+ejN        d*e	ejM                 defdueffd݄ZejL        dddtfd(ejM        d)ejM        dejM        dejM        d+ejN        d*e	ejM                 dMejM        dueffdބZd߄ Zd(ejM        d)ejM        dPejM        dQejM        d*ejM        dMejM        dNee7         d7dfdZ e"d%d&g          	 	 dd:ejM        d;ejM        dejM        dejM        dZejN        d*e	ejM                 d.ed         fd            ZdZejN        fdZ	 	 dd:ejM        d;ejM        dejM        dejM        dZejN        d*e	ejM                 d.ed         fdZ	 	 dd3ee7         d:ejM        d;ejM        dejM        dejM        dZejN        d*e	ejM                 d.ed         fdZ e!deiee6          e	 	 dd:ejM        d;ejM        dejM        dejM        dZejN        d*e	ejM                 d.ed         d7ejM        fd                        ZdS )a3  
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)EnumSimpleNamespace)ListLiteralOptionalTuple)trtllm_low_latency_gemm   )flashinfer_api)	AutoTunerConstraintSpecDynamicTensorSpecOptimizationProfileTunableRunnerTuningConfig)&get_last_power_of_2_num_tokens_bucketslast_positive_power_of_2)get_native_fp4_dtypeis_sm100a_supportedis_sm100f_supportedis_sm120a_supportedis_sm121a_supportedLibraryErrorbackend_requirementsupported_compute_capability)gen_gemm_sm90_module)gen_gemm_module)gen_gemm_sm100_module)gen_gemm_sm120_module)!gen_gemm_sm120_module_cutlass_fp4)!gen_gemm_sm100_module_cutlass_fp4)!gen_gemm_sm100_module_cutlass_fp8)"gen_gemm_sm100_module_cutlass_bf16)gen_trtllm_gen_gemm_module)gen_tgv_gemm_sm10x_modulegen_deepgemm_sm100_module)get_cuda_version)#gen_fp8_blockscale_gemm_sm90_moduleFTc              #   (   K   | ]}|t           v V  d S N)	error_msg).0exts     m/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/gemm/gemm_base.py	<genexpr>r1   H   s'      EEc	)EEEEEE    )z.soz.dll)setup_cubin_loader)_get_cache_bufdetermine_gemm_backend
get_indptr	is_float8register_custom_opregister_fake_opget_compute_capabilityi   zecudnn FP4 GEMM with mxfp4 quantization is not supported on SM120 with cuDNN backend version < 9.14.0.device
sm_versionc                 @    t          |           \  }}|dz  |z    }||v S )N
   )r:   )r;   r<   majorminordevice_archs        r0   _match_sm_versionrB   ^   s/    )&11LE5RZ%')K*$$r2   c                     t                                                      fd} t          dd          dt          j        dt          j        dt          j        dt          j        d	t          j        d
t          j        dt          j        dt          j        dt          j        dt          j        dt
          dd ffd            }t          d          dt          j        dt          j        dt          j        dt          j        d	t          j        d
t          j        dt          j        dt          j        dt          j        dt          j        dt
          dd fd            }t          | |          }|S )Nc                  B     G fddt                     }  |             S )Nc            	           e Zd Zdeej                 dedee         fdZ	 	 ddeej                 dede	dej        f fd	Z
d
S )Lget_gemm_module.<locals>.cublas_fp8_gemm_runner.<locals>.CublasFp8GemmRunnerinputsprofilereturnc                     dgS Nr    selfrG   rH   s      r0   get_valid_tacticsz^get_gemm_module.<locals>.cublas_fp8_gemm_runner.<locals>.CublasFp8GemmRunner.get_valid_tacticsk   s     s
r2   Ftacticdo_preparationc           	          t           j                                        }|\  }}}}	}
}                    |||
||	||           |
S r,   )torchcudacurrent_blas_handlebmm_fp8)rN   rG   rQ   rR   kwargscublas_handleabscale_ascale_boutworkspace_buffermodules               r0   forwardzTget_gemm_module.<locals>.cublas_fp8_gemm_runner.<locals>.CublasFp8GemmRunner.forwards   sX     !&
 > > @ @@F=1gw-=q#w1A=   
r2   NrP   F__name__
__module____qualname__r   rT   Tensorr   intrO   boolra   r`   s   r0   CublasFp8GemmRunnerrF   j   s        U\* - c	    !',	 U\*  !%	        r2   rk   r   )rk   r`   s    r0   cublas_fp8_gemm_runnerz/get_gemm_module.<locals>.cublas_fp8_gemm_runneri   sH    	 	 	 	 	 	 	- 	 	 	. #"$$$r2   z flashinfer::cutlass_segment_gemmymutates_argsr_   all_problemsx_dataw_datay_datax_ldw_ldy_ldempty_x_dataweight_column_majorrI   c                 D                         | ||||||||	|

  
         d S r,   )cutlass_segment_gemm)r_   rq   rr   rs   rt   ru   rv   rw   rn   rx   ry   r`   s              r0   r{   z-get_gemm_module.<locals>.cutlass_segment_gemm   sE     	##	
 	
 	
 	
 	
r2   c                     d S r,   rL   )r_   rq   rr   rs   rt   ru   rv   rw   rn   rx   ry   s              r0   _fake_cutlass_segment_gemmz3get_gemm_module.<locals>._fake_cutlass_segment_gemm   s	     	r2   )rm   r{   )r   build_and_loadr8   rT   rg   ri   r9   r   )rm   r{   r}   _gemm_moduler`   s       @r0   get_gemm_moduler   d   s   --//F% % % % %8 :#OOO
,
l
 
 	

 
 l
 l
 l
 <
 l
 "
 

 
 
 
 
 PO
4 899,l  	
  l l l < l " 
   :9  #51  L
 r2   d   g   tgvrZ   r[   r^   	out_dtypebiaspdlbackend)cutlassr   autoc                 h    |t          d          |rt          d          t          |           dS )NzLYou cannot use the CUTLASS backend with a bias. Use the TGV backend instead.zFThe CUTLASS backend does not support PDL. Use the TGV backend instead.T)
ValueError_validate_bf16_output_dtyperZ   r[   r^   r   r   r   r   s          r0   _cutlass_mm_bf16_requirementr      sS     Z
 
 	
  
T
 
 	
  	***4r2   c                 D    |t           j        k    rt          d          dS )NzWYou cannot provide an output dtype to the TGV backend. Use the CUTLASS backend instead.T)rT   bfloat16r   r   s          r0   _tgv_gemm_requirementr      s-     EN""e
 
 	
 4r2   c                    | j         t          j        k    rt          d| j          d          |j         t          j        k    rt          d|j          d          |-|j         t          j        k    rt          d|j          d          dS )N#First tensor has unsupported dtype . Only bfloat16 is supported.$Second tensor has unsupported dtype z"Bias tensor has unsupported dtype TdtyperT   r   r   )rZ   r[   r   r   r^   r   r   s          r0   _check_mm_bf16_problem_sizer      s     	w%.  X!'XXX
 
 	
 	w%.  Y17YYY
 
 	
 DJ%.88ZZZZ
 
 	
 4r2   suitable_backendsc                     g }||rd| v r|                     d           n2d| v r|                     d           d| v r|                     d           |S )Nr   r   append)	r   rZ   r[   r   r   r^   r   r   heuristic_backendss	            r0   _heuristic_func_mm_bf16r      s|     3%%%%%e,,,)))%%i000%%%%%e,,,r2   )r   r   )common_checkheuristic_funcrI   c           
         |5t          j        | j        d         |j        d         f| j        |          }n|j        | j        d         |j        d         fk    r3t	          d| j        d         |j        d         f d|j         d          |j        | j        k    r t	          d| j         d|j         d          |j        |k    rt	          d	| d|j         d          t          d
t          | j                  }|dk    rt          j	        }n?|dk    rt          dg| |dd|||          }n!|dk    rt          dg| ||||||          }n|g}t          | ||||||           |S )a{  MM BF16

    Parameters
    ----------
    a: torch.Tensor
        Input tensor, shape (m, k), bf16 in row-major layout.

    b: torch.Tensor
        Weight tensor, shape (k, n), bf16 in column-major layout.

    bias: Optional[torch.Tensor]
        Optional bias tensor, shape (n,). If provided, can only be used with the TGV backend. Defaults to ``None``.

    pdl: bool
        Whether to use persistant data loader mode. Can only be used with the TGV backend. Defaults to ``False``.

    out: Optional[torch.Tensor]
        Out tensor, shape (m, n), bf16 or fp16. If provided, can only be used with the CUTLASS backend. Defaults to ``None``.

    out_dtype: torch.dtype
        Output dtype, bf16 or fp16. If provided, can only be used with the CUTLASS backend. Defaults to ``torch.bfloat16``.

    backend: Literal["cutlass", "tgv", "auto"]
        The backend to use for the operation. Defaults to ``"tgv"``.
        ``"auto"`` allows selecting the best tactic from all available backends when autotune is enabled.

    Returns
    -------
    torch.Tensor
        Out tensor, shape (m, n), bf16 or fp16 in row-major layout.

    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> # Using the TGV backend
    >>> a = torch.randn([48, 64], device="cuda", dtype=torch.bfloat16)
    >>> b = torch.randn([80, 64], device="cuda", dtype=torch.bfloat16).transpose(-2, -1)
    >>> bias = torch.randn([80], device="cuda", dtype=torch.bfloat16)
    >>> out = flashinfer.mm_bf16(a, b, bias=bias, pdl=True, backend="tgv")
    >>> out.shape
    torch.Size([48, 80])
    >>> out.dtype
    torch.bfloat16
    >>> # Using the CUTLASS backend
    >>> fp16_out = torch.empty([48, 80], device="cuda", dtype=torch.float16)
    >>> out = flashinfer.mm_bf16(a, b, out=fp16_out, out_dtype=torch.float16, backend="cutlass")
    >>> out.shape
    torch.Size([48, 80])
    >>> out.dtype
    torch.float16
    Nr      r;   r    Output shape mismatch. Expected , got .!Output device mismatch. Expected  Output dtype mismatch. Expected mm_bf16_workspacer   r   Fr   )rT   emptyshaper;   r   r   r4   DEFAULT_WORKSPACE_SIZEmm_bf16suitable_auto_backendsr   bf16_gemm_sm100)	rZ   r[   r   r   r^   r   r   r_   backendss	            r0   r   r     s   N {kWQZ$8
 
 
 9QWQZ000_AGAJ
3K__SVS\___   :!!QAHQQCJQQQ   9	!!P9PPCIPPP   &3QX  &1	I		*KAtUCG
 
 
E		*GQ4c9g
 
 9Aq$S*:HEEEJr2   r   ABc                 $    t          |           dS NT)r   r   r   r^   r   r   s        r0   _cutlass_bmm_bf16_requirementr     s      	***4r2   c                     | j         t          j        k    rt          d| j          d          |j         t          j        k    rt          d|j          d          dS )Nr   r   r   Tr   r   s        r0   _check_bmm_bf16_problem_sizer     so     	w%.  X!'XXX
 
 	
 	w%.  Y17YYY
 
 	
 4r2   c                 <    g }d| v r|                     d           |S )Nr   r   )r   r   r   r^   r   r   r   s          r0   _heuristic_func_bmm_bf16r     s0     %%%!!),,,r2   c           	         | j         d         | j         d         |j         d         f}|t          j        || j        |          }n||j         |k    rt	          d| d|j          d          |j        | j        k    r t	          d	| j         d|j         d          |j        |k    rt	          d
| d|j         d          t          dt          | j                  }t          | |dd||dg           |S )a  BMM BF16

    Parameters
    ----------
    A: torch.Tensor
        Input tensor, shape (b, m, k), bf16 in row-major layout.

    B: torch.Tensor
        Weight tensor, shape (b, k, n), bf16 in column-major layout.

    out: Optional[torch.Tensor]
        Out tensor, shape (b, m, n), bf16 or fp16, defaults to ``None``.

    out_dtype: torch.dtype
        Output dtype, bf16 (default) or fp16.

    backend: Literal["cutlass"]
        Backend to use, defaults to "cutlass".

    Returns
    -------
    torch.Tensor
        Out tensor, shape (b, m, n), bf16 or fp16 in row-major layout.

    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> input = torch.randn([16, 48, 64], device="cuda", dtype=torch.bfloat16)
    >>> weight = torch.randn([16, 80, 64], device="cuda", dtype=torch.bfloat16).transpose(-2, -1)
    >>> out = flashinfer.bmm_bf16(input, weight)
    >>> out.shape
    torch.Size([16, 48, 80])
    >>> out.dtype
    torch.bfloat16
    r   r   r   Nr   r   r   r   r   r   bmm_bf16_workspaceFr   )	r   rT   r   r;   r   r   r4   r   r   )r   r   r^   r   r   expected_shaper_   s          r0   bmm_bf16r     s:   h gaj!'!*agaj9N
{k8
 
 
 9&&U>UUUUU   :!!QAHQQCJQQQ   9	!!P9PPCIPPP   &4ah  Aq$s,<ykJJJJr2   c                  F    t                                                      } | S r,   )r   r~   rj   s    r0   get_gemm_sm100_moduler         "$$3355FMr2   c                  F    t                                                      } | S r,   )r    r~   rj   s    r0   get_gemm_sm120_moduler     r   r2   c                  J    t                      fd} t          |           S )zJGet CUTLASS FP8 runner for SM120/SM121 using the groupwise scaling kernel.c                  B     G fddt                     }  |             S )Nc            	           e Zd Zdeej                 dedee         fdZ	 	 ddeej                 dede	dej        f fd	Z
d
S )`get_gemm_sm120_module_cutlass_fp8.<locals>.cutlass_fp8_gemm_runner.<locals>.CutlassFp8GemmRunnerrG   rH   rI   c                     dgS NrP   rL   rM   s      r0   rO   zrget_gemm_sm120_module_cutlass_fp8.<locals>.cutlass_fp8_gemm_runner.<locals>.CutlassFp8GemmRunner.get_valid_tactics  s     tr2   rP   FrQ   rR   c                    |\  }}}}}	}
|                                 dk    r|                    dd          }n|}|                                 dk    r*|j        d         }|j        d         }|j        d         }d}n4|j        d         }|j        d         }|j        d         }|j        d         }d}d}d}d } |||          } |||          }||k    }||k    }|s|s|}|}n;|                                 dk    r|}|r<t          j        j                            |                                d||z
  f          }t          j        ||f|j	        |j
                  }|d |d |f                             |           n|}|r<t          j        j                            |                                d||z
  f          }t          j        |||f|j	        |j
                  }|                    dd          }|d d d |d |f                             |           |ra|                                 dk    r$t          j        ||f|	j
        |	j	        	          }n't          j        |||f|	j
        |	j	        	          }n|	}|                                dk    rV||z  |z   dz
  |z  }||z   dz
  |z  }|                    dd                              ||                                          }n|}|                                dk    rV||z  |z   dz
  |z  }||z   dz
  |z  }|                    dd                              ||                                          } n|} !                    |
|||| ||||d

  
         |r^|                                 dk    r"|	                    |d d d |f                    n$|	                    |d d d d d |f                    |	S )Nr   rP   r   r      c                     | |z   dz
  |z  |z  S Nr   rL   )xmultiples     r0   _pad_to_multiplezget_gemm_sm120_module_cutlass_fp8.<locals>.cutlass_fp8_gemm_runner.<locals>.CutlassFp8GemmRunner.forward.<locals>._pad_to_multiple?  s    \A-(:hFFr2   r   r;   r   MN)dim	transposer   rT   nn
functionalpad
contiguouszerosr   r;   copy_r   numelviewexpandgemm_fp8_nt_groupwise)"rN   rG   rQ   rR   rX   rZ   r[   r\   r]   r^   r_   b_col_majorn_dimm_dimk_dim
batch_sizescale_gran_mscale_gran_nscale_gran_kr   n_paddedk_paddedneeds_n_paddingneeds_k_paddinga_paddedb_col_major_paddedb_underlying_padded
out_paddedscale_m_countscale_k_countscale_a_expandedscale_n_countscale_b_expandedr`   s"                                    r0   ra   zhget_gemm_sm120_module_cutlass_fp8.<locals>.cutlass_fp8_gemm_runner.<locals>.CutlassFp8GemmRunner.forward  s    AG=1gw-= 5577a<< #$++b""5"5KK #$K 5577a<<'-a0EGAJEGAJE!"JJ'-a0EGAJEGAJE!"J ""G G G ,+E<@@++E<@@"*e"3"*e"3& Q Q H)4&& uuww!||#$* ',x':'>'> ( 3 3 5 58e;K7L( (H .3[%x0"-"3#.#5. . .*
 +6E66E6>:@@MMMM#$* ',x':'>'> ( 3 3 5 58e;K7L( (H /4k'8<"-"3#.#5/ / /+
 .A-J-J2r-R-R**111fuffuf+<=CCKPPP # %uuww!||%*["H-cj	& & &

 &+['9#&:"%)& & &

 "%J ==??a''"U*\9A=%%&M &.%<q%@\$QMQ**}==# %$ (/$==??a''"X-<q@%%&M &.%<q%@\$QMQ**}==# %$ (/$ ,,$&$$      # <uuww!||		*QQQY"78888		*QQQ6E6\":;;;
r2   Nrb   rc   rj   s   r0   CutlassFp8GemmRunnerr     s        U\* - c	    !',	Q QU\*Q Q !%	Q Q Q Q Q Q Q Q Qr2   r   rl   r   r`   s    r0   cutlass_fp8_gemm_runnerzBget_gemm_sm120_module_cutlass_fp8.<locals>.cutlass_fp8_gemm_runner  sS    Z	 Z	 Z	 Z	 Z	 Z	 Z	= Z	 Z	 Z	x $#%%%r2   r   )r   r   r   r`   s    @r0   !get_gemm_sm120_module_cutlass_fp8r     sK     #$$F]& ]& ]& ]& ]&@  7   r2   c                      t                      } |                                 }t          |                                            |S r,   )r%   r~   r3   get_library_path)modops     r0   get_trtllm_gemm_moduler     s=    
$
&
&C					Bs++--...Ir2   c                  n    t                                                      fd} t          |           S )Nc                  B     G fddt                     }  |             S )Nc            	           e Zd Zdeej                 dedee         f fdZ	 	 ddeej                 dede	dej        f fd	Z
d
S )`get_gemm_sm100_module_cutlass_fp8.<locals>.cutlass_fp8_gemm_runner.<locals>.CutlassFp8GemmRunnerrG   rH   rI   c                 `    t          t                                                              S r,   )listrangefp8_gemm_tactic_numrN   rG   rH   r`   s      r0   rO   zrget_gemm_sm100_module_cutlass_fp8.<locals>.cutlass_fp8_gemm_runner.<locals>.CutlassFp8GemmRunner.get_valid_tactics  '    
 E&"<"<">">??@@@r2   rP   FrQ   rR   c           	      x    |\  }}}}}	}
                     ||                    dd          |||	|
|           |	S Nr   rP   )fp8_gemmr   )rN   rG   rQ   rR   rX   rZ   r[   r\   r]   r^   r_   r`   s              r0   ra   zhget_gemm_sm100_module_cutlass_fp8.<locals>.cutlass_fp8_gemm_runner.<locals>.CutlassFp8GemmRunner.forward  sZ     AG=1gw-=KKB''$   
r2   Nrb   rc   rj   s   r0   r   r    s        AU\*A -A c	A A A A A A !',	 U\*  !%	        r2   r   rl   r   s    r0   r   zBget_gemm_sm100_module_cutlass_fp8.<locals>.cutlass_fp8_gemm_runner  sH    	 	 	 	 	 	 	= 	 	 	6 $#%%%r2   r   )r#   r~   r   r   s    @r0   !get_gemm_sm100_module_cutlass_fp8r    sO    .00??AAF& & & & &>  7   r2   )r   r      r   c                     | d         d         S Nr   r   rL   shapess    r0   <lambda>r        6!9R= r2   dynamic_tensor_specsconstraint_specsc                  n    t                                                      fd} t          |           S )Nc                  B     G fddt                     }  |             S )Nc            	           e Zd Zdeej                 dedee         f fdZ	 	 ddeej                 dede	dej        f fd	Z
d
S )cget_gemm_sm100_module_cutlass_bf16.<locals>.cutlass_bf16_gemm_runner.<locals>.CutlassBf16GemmRunnerrG   rH   rI   c                 `    t          t                                                              S r,   )r  r  bf16_gemm_tactic_numr  s      r0   rO   zuget_gemm_sm100_module_cutlass_bf16.<locals>.cutlass_bf16_gemm_runner.<locals>.CutlassBf16GemmRunner.get_valid_tactics  s'    
 E&"="="?"?@@AAAr2   rP   FrQ   rR   c                 t    |\  }}}}}}	
                     ||                    dd          ||	|           |S r  )	bf16_gemmr   )rN   rG   rQ   rR   rX   rZ   r[   _r^   r_   r`   s             r0   ra   zkget_gemm_sm100_module_cutlass_bf16.<locals>.cutlass_bf16_gemm_runner.<locals>.CutlassBf16GemmRunner.forward  sT     5;11aC!1  KKB''$   
r2   Nrb   rc   rj   s   r0   CutlassBf16GemmRunnerr    s        BU\*B -B c	B B B B B B !',	 U\*  !%	        r2   r"  rl   )r"  r`   s    r0   cutlass_bf16_gemm_runnerzDget_gemm_sm100_module_cutlass_bf16.<locals>.cutlass_bf16_gemm_runner  sH    	 	 	 	 	 	 	M 	 	 	2 %$&&&r2   )r#  )r$   r~   r   )r#  r`   s    @r0   "get_gemm_sm100_module_cutlass_bf16r$    sO    /11@@BBF' ' ' ' '8 !9   r2   c                     | d         d         S r  rL   r  s    r0   r  r  *  r  r2   r_   runner_namesc                    g }t          | j                  }d|v r3|                    t                                                                 d|v r:|                    t          | j        |                                                     |s
J d            t          j	                    }	| |||||g}
|	
                    d|t          |
          \  }} ||
|           d S )Nr   r   No suitable runners foundr   rG   rQ   )r   r;   r   r$  r#  get_tgv_gemm_sm10x_moduler   tgv_gemm_runnerr   get
choose_one_BF16_GEMM_SM100_TUNING_CONFIG)rZ   r[   r   r   r^   r_   r&  runnersuse_sm_100ftunerrG   runnerrQ   s                r0   r   r   0  s     G%ah//KL  9;;TTVVWWW%ag{;;KKMM	
 	
 	
 /////7MOOED#s$45F%%&	 NFF F&((((((r2   r\   r]   c                 F   g }d|v r3|                     t                                                                 d|v r3|                     t                                                                 d|v r3|                     t	                                                                 d|v r!|                     t                                 |s
J d            t          j                    }| |||||g}	|	                    d|t          |	          \  }
} |
|	|           d S )Ncutlass_sm10xcutlass_sm12xcublascudnnr(  r  r)  )r   r  r   r   r   rm   _cudnn_gemm_fp8_runnerr   r,  r-  _FP8_GEMM_SM100_TUNING_CONFIGrZ   r[   r\   r]   r^   r_   r&  r/  r1  rG   r2  rQ   s               r0   fp8_gemm_sm100r;  O  s0    G,&&8::RRTTUUU,&&8::RRTTUUU<((??AABBB,-//000/////7MOOEGWc+;<F%%%	 NFF F&((((((r2   op_name
tuner_namec                 .      fd}t          |          S )z2Helper function to create cutlass FP4 GEMM module.c                  B     G fddt                     }  |             S )Nc                       e Zd Z fdZdeej                 dedee         f fdZ		 	 ddeej                 ded	e
f fd
ZdS )^_create_cutlass_fp4_gemm_module.<locals>.cutlass_fp4_gemm_runner.<locals>.CutlassFp4GemmRunnerc                      j         | _        d S r,   )fp4_gemm_fp4_gemm_runner)rN   r`   s    r0   __init__zg_create_cutlass_fp4_gemm_module.<locals>.cutlass_fp4_gemm_runner.<locals>.CutlassFp4GemmRunner.__init__t  s    (.%%%r2   rG   rH   rI   c                 `    t          t                                                              S r,   )r  r  fp4_gemm_tactic_numr  s      r0   rO   zp_create_cutlass_fp4_gemm_module.<locals>.cutlass_fp4_gemm_runner.<locals>.CutlassFp4GemmRunner.get_valid_tacticsw  r	  r2   rP   FrQ   rR   c           
         |\
  }}}}}	}
}}
}
}|j         t          j        k    r4|j         t          j        k    r|                    t          j                  }|j         t          j        k    r4|j         t          j        k    r|                    t          j                  }                    ||j        ||j        |	|||           |S r,   )r   rT   uint8float8_e4m3fnr   rC  T)rN   rG   rQ   rR   rX   rZ   r[   	a_descale	b_descalealphar!  r^   r_   r`   s                r0   ra   zf_create_cutlass_fp4_gemm_module.<locals>.cutlass_fp4_gemm_runner.<locals>.CutlassFp4GemmRunner.forward~  s    $ $7ek))ioAT.T.T )u{ ; ;I7ek))ioAT.T.T )u{ ; ;IqsIy{E3@PRX   
r2   Nrb   )rd   re   rf   rE  r   rT   rg   r   rh   rO   ri   ra   rj   s   r0   CutlassFp4GemmRunnerrA  s  s        8 8 8 8 8AU\*A -A c	A A A A A A !',	 U\*  !%	       r2   rO  rl   )rO  r`   s    r0   cutlass_fp4_gemm_runnerz@_create_cutlass_fp4_gemm_module.<locals>.cutlass_fp4_gemm_runnerr  sI    %	 %	 %	 %	 %	 %	 %	= %	 %	 %	N $#%%%r2   )rP  r   )r`   r<  r=  rP  s   `   r0   _create_cutlass_fp4_gemm_modulerQ  o  s9    (& (& (& (& (&T  7   r2   c                  d    t                                                      } t          | dd          S )z&Get the SM100/103/110 FP4 GEMM module.zflashinfer::cutlass_fp4_gemmcutlass_fp4_gemm)r"   r~   rQ  rj   s    r0   !get_gemm_sm100_module_cutlass_fp4rT    s6     /00??AAF*.0B  r2   c                  d    t                                                      } t          | dd          S )z"Get the SM120/121 FP4 GEMM module.z"flashinfer::cutlass_fp4_gemm_sm120cutlass_fp4_gemm_sm120)r!   r~   rQ  rj   s    r0   !get_gemm_sm120_module_cutlass_fp4rW    s6     /00??AAF*46N  r2   sm_majorc                 r    | dv rt                      S | dk    rt                      S t          d|            )N)r>         zUnsupported SM major version: )rT  rW  r   )rX  s    r0   get_cutlass_fp4_gemm_moduler\    sG     80222	R0222D(DDEEEr2   r   r0  c                 r    t          | |                                          fd}t          |          S )a  
    Get and build the TGV GEMM module for the specified dtype.

    Args:
        dtype: Data type for the GEMM operation (torch.bfloat16 or torch.float16)
        use_sm_100f: Whether to compile with SM100f flags (default: False), which makes the compiled kernel
            compatible with both B200 and B300 GPUs. However, it's only available with CUDA 12.9+.

    Returns:
        SimpleNamespace with the runner function
    c                  B     G fddt                     }  |             S )Nc            	           e Zd Zdeej                 dedee         f fdZ	 	 ddeej                 dede	dej        f fd	Z
d
S )Iget_tgv_gemm_sm10x_module.<locals>.tgv_gemm_runner.<locals>.TGVGemmRunnerrG   rH   rI   c                 Z    j         }t          t           |                                S r,   )tgv_gemm_tactic_numr  r  )rN   rG   rH   	tactic_fnr`   s       r0   rO   z[get_tgv_gemm_sm10x_module.<locals>.tgv_gemm_runner.<locals>.TGVGemmRunner.get_valid_tactics  s)     #6	E))++..///r2   rP   FrQ   rR   c                     |^}}}}}	}
j         } ||                                |                                |||	|           |	S r,   )tgv_gemmt)rN   rG   rQ   rR   rX   rZ   r[   r   r   r^   r!  gemm_fnr`   s               r0   ra   zQget_tgv_gemm_sm10x_module.<locals>.tgv_gemm_runner.<locals>.TGVGemmRunner.forward  sO     ,2(1dCq
 !/qssuudFC===
r2   Nrb   rc   rj   s   r0   TGVGemmRunnerr`    s        0U\*0 -0 c	0 0 0 0 0 0 !',	 U\*  !%	        r2   rh  rl   )rh  r`   s    r0   r+  z2get_tgv_gemm_sm10x_module.<locals>.tgv_gemm_runner  sE    	 	 	 	 	 	 	M 	 	 	6 }r2   )r+  )r&   r~   r   )r   r0  r+  r`   s      @r0   r*  r*    sU     'uk::IIKKF    > '   r2   c                    t          | j        ddg          st          d          | j        t          j        t          j        fvrt          d| j         d          | j        |j        k    r t          d| j         d|j         d          |:t	          j        | j        d
         |j        d         f| j        | j                  }n|j        | j        d
         |j        d         fk    r3t          d| j        d
         |j        d         f d|j         d          |j        | j        k    r t          d| j         d|j         d          |j        | j        k    r t          d| j         d|j         d          g }t          | j                  }|
                    t          | j        |                                                     t          j                    }d
}t          t!          |fdt"          t$                    ft'          ddd           f          }	| ||||g}
| j        t          j        k    rdnd}|                    | d||	|
          \  }} ||
|          S )a+  
    Perform TGV GEMM on SM100 architecture with automatic dtype detection.

    Computes: A @ B + bias

    Args:
        a: First input tensor of shape (M, K) in row-major layout
        b: Second input tensor of shape (K, N) in column-major layout
        bias: Bias tensor of shape (N,)
        pdl: Whether to use PDL (persistent data loader), defaults to False
        out: Optional output tensor, shape (M, N), defaults to None.

    Returns:
        Output tensor of shape (M, N) in row-major layout

    Supported dtypes:
        - torch.bfloat16
        - torch.float16

    Note:
        - Requires SM100, SM103, or SM110 architecture
        - Input tensors a and b must have the same dtype
        - Tensor b is expected to be in column-major layout (transposed from typical PyTorch row-major)
    100103z+TGV GEMM requires SM100, SM103 architecturezUnsupported dtype z*. Only bfloat16 and float16 are supported.z,Input tensors must have the same dtype. Got  and r   Nr   r   r   r   r   r   r   r  r  r   c                     | d         d         S r  rL   r  s    r0   r  z tgv_gemm_sm100.<locals>.<lambda>I  s    vay} r2   r  bf16fp16	_tgv_gemmr)  )rB   r;   r   r   rT   r   float16r   r   r   r   r*  r+  r   r,  r   r   r   r   r   r-  )rZ   r[   r   r   r^   r/  r0  r1  a_tensor_indextuning_configrG   	dtype_strr2  rQ   s                 r0   tgv_gemm_sm100ru    s   B QXu~66 HFGGG 	wu~u}555TTTT
 
 	
 	w!'S17SSSSS
 
 	
 {kWQZ$8'
 
 
 9QWQZ000_AGAJ
3K__SVS\___   :!!QAHQQCJQQQ   9N17NN#)NNN   G%ah//KNN,QWkBBRRTTUUUMOOEN !6(	 
 ,, 
  M$ D#s#F'U^33I%%	 NFF 6////r2   c                  4   t                                                      t          dd          dt          j        dt          j        dt          j        dt          j        dt          j        d	t          j        d
t          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt
          dd ffd            } t          d          dt          j        dt          j        dt          j        dt          j        dt          j        d	t          j        d
t          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt
          dd fd            }t          |           S )Nz%flashinfer::cutlass_segment_gemm_sm90)r_   rn   ro   r_   int_workspace_bufferrq   rr   rs   rt   x_stridew_stridey_stridern   rx   empty_y_datary   rI   c                 H                         | |||||||||
||           d S r,   cutlass_segment_gemm_sm90)r_   rw  rq   rr   rs   rt   rx  ry  rz  rn   rx   r{  ry   r`   s                r0   r~  z7get_gemm_sm90_module.<locals>.cutlass_segment_gemm_sm90`  sK    & 	(( 	
 	
 	
 	
 	
r2   c                     d S r,   rL   )r_   rw  rq   rr   rs   rt   rx  ry  rz  rn   rx   r{  ry   s                r0   _fake_cutlass_segment_gemm_sm90z=get_gemm_sm90_module.<locals>._fake_cutlass_segment_gemm_sm90  s	      	r2   r}  )r   r~   r8   rT   rg   ri   r9   r   )r~  r  r`   s     @r0   get_gemm_sm90_moduler  Z  s   !##2244F /.  
,
#l
 l
 	

 
 
 ,
 ,
 ,
 <
 l
 l
 "
 

 
 
 
 
	 
< =>>,#l l 	
   , , , < l l " 
   ?>$ ";   r2   r   weightsrn   w_column_majorr   
seg_indptrweight_indicesc                    | j         }t          j        }t          j        }	t          j        }
|                    |	          }||                    |	          }|r|                    d          n|                    d          }|r|                    d          n|                    d          }t          j        |df||          }t          j        ||	|          }t          j        ||	|          }t          j        ||	|          }t          j        ||
|          }t          j        ||
|          }t          j        ||
|          }ddlm}  ||f         |||||||| |||||||           |||||||fS )Nr   r      r   )compute_sm80_group_gemm_args)	r;   rT   int32int64tosizer   triton.gemmr  )r   r  rn   r  r   r  r  r;   	prob_typeptr_typeld_typed_outd_inrq   rr   rs   rt   x_stride_dataw_stride_datay_stride_datar  s                        r0   #launch_compute_sm80_group_gemm_argsr    s    XFI{HkGx((J!'**844-BGLLOOO7<<??E,A7<<???',,q//D;
AiOOOL[8FCCCF[8FCCCF[8FCCCFK
'&IIIMK
'&IIIMK
'&IIIM::::::/ */		  & 	 r2   c                    | j         }t          j        }t          j        }	t          j        }
|                    |	          }||                    |	          }|r|                    d          n|                    d          }|r|                    d          n|                    d          }t          j        |df||          }t          j        ||	|          }t          j        ||	|          }t          j        ||	|          }t          j        ||
|          }t          j        ||
|          }t          j        ||
|          }ddlm}  ||f         |||||||| |||||||           |||||||fS )Nr   r   r  r   )compute_sm90_group_gemm_args)	r;   rT   r  r  r  r  r   r  r  )r   r  rn   r  r   r  r  r;   r  r  stride_typer  r  rq   rr   rs   rt   r  r  r  r  s                        r0   #launch_compute_sm90_group_gemm_argsr    s    XFI{H+Kx((J!'**844-BGLLOOO7<<??E,A7<<???',,q//D;
AiOOOL[8FCCCF[8FCCCF[8FCCCFK
+fMMMMK
+fMMMMK
+fMMMM::::::/ */		  & 	 r2   c                      e Zd ZdZ	 ddej        deddfdZdej        dej        ddfd	Ze		 	 	 	 dd
ej        dej        de
dedeej                 deej                 deej                 deej                 dej        fd            ZeZdS )SegmentGEMMWrappera  Wrapper for segment GEMM kernels.

    Example
    -------
    >>> import torch
    >>> from flashinfer import SegmentGEMMWrapper
    >>> # create a 1MB workspace buffer
    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device="cuda")
    >>> segment_gemm = SegmentGEMMWrapper(workspace_buffer)
    >>> seq_lens = torch.tensor([1, 2, 3, 4], dtype=torch.int64, device="cuda")
    >>> # create packed input tensor (10 = 1 + 2 + 3 + 4)
    >>> x = torch.randn(10, 128, device="cuda", dtype=torch.float16)
    >>> # create weight tensor with 4 weights, each with 128 input and 256 output channels, column major
    >>> weights = torch.randn(4, 256, 128, device="cuda", dtype=torch.float16)
    >>> # compute the segment GEMM
    >>> y = segment_gemm.run(x, weights, 4, True, seg_lens=seq_lens)
    >>> y.shape
    torch.Size([10, 256])
    >>> y_ref_0 = torch.matmul(x[:1], weights[0].t())
    >>> torch.allclose(y[:1], y_ref_0)
    True
    >>> y_ref_1 = torch.matmul(x[1:3], weights[1].t())
    >>> torch.allclose(y[1:3], y_ref_1)
    True
    >>> y_ref_2 = torch.matmul(x[3:6], weights[2].t())
    >>> torch.allclose(y[3:6], y_ref_2)
    True
    >>> y_ref_3 = torch.matmul(x[6:], weights[3].t())
    >>> torch.allclose(y[6:], y_ref_3)
    True
    >>>
    >>> # another example with weight indices
    >>> weight_indices = torch.tensor([0, 1, 0, 1], dtype=torch.int64, device="cuda")
    >>> y = segment_gemm.run(x, weights, 4, True, seg_lens=seq_lens, weight_indices=weight_indices)
    >>> y.shape
    torch.Size([10, 256])
    >>> y_ref_0 = torch.matmul(x[:1], weights[0].t())
    >>> torch.allclose(y[:1], y_ref_0)
    True
    >>> y_ref_1 = torch.matmul(x[1:3], weights[1].t())
    >>> torch.allclose(y[1:3], y_ref_1)
    True
    >>> y_ref_2 = torch.matmul(x[3:6], weights[0].t())
    >>> torch.allclose(y[3:6], y_ref_2)
    True
    >>> y_ref_3 = torch.matmul(x[6:], weights[1].t())
    >>> torch.allclose(y[6:], y_ref_3)
    True
    r   float_workspace_bufferr   rI   Nc                 x    t          j        dt           j        |j                  | _        || _        || _        dS )a  Initialize the wrapper.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The workspace buffer for the kernels, we use it for storing intermediate results in cutlass
            segment GEMM kernels. Encouraged size is 128MB.
        )i   r   N)rT   r   int8r;   _int_workspace_buffer_float_workspace_bufferr   )rN   r  r   s      r0   rE  zSegmentGEMMWrapper.__init__I  s?     &+[%*5K5R&
 &
 &
" (>$r2   rw  c                 "    || _         || _        dS )a  Reset the workspace buffer.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The new float workspace buffer for the kernels.
        int_workspace_buffer : torch.Tensor
            The new int workspace buffer for the kernels.
        N)r  r  )rN   r  rw  s      r0   reset_workspace_bufferz)SegmentGEMMWrapper.reset_workspace_bufferZ  s     (>$%9"""r2   r   r  r   ry   r^   seg_lensr  r  c	                 h   ||t          d          |"t          |                    |                    }| t          j        dt          j                  }|                    d          }	|r|                    d          n|                    d          }
|Bt          |          rt          j        }n|j	        }t          j
        |	|
f||j                  }n)|j        |	|
fk    rt          d|	|
f d	|j                   t          j        d|j	        |j                  }t          j        d|j	        |j                  }| j        d
k    rt          |j                  }n| j        }|dk    rUt          |||||||          \  }}}}}}}t!                                          | j        | j        |||||||||||           nf|dk    rNt)          |||||||          \  }}}}}}}t+                                          | j        ||||||||||           nt          d|           |S )a  Run the segment GEMM kernel.

        Compute the matrix multiplication between a batch of input tensor (with variable number of rows, but fixed
        number of columns) and a batch of weight tensor with fixed number of rows and columns:

        .. math::

            y[i] = x[i] \times W[i]

        if :attr:`weight_indices` is provided, we will select the weight tensor based on the indices in the
        :attr:`weight_indices` tensor:

        .. math::

            y[i] = x[i] \times W[\text{weight_indices}[i]]

        We use Ragged Tensor to represent the input tensor :attr:`x` and the output tensor :attr:`y`, and each x[i]
        is a segment of the concatenated tensor. Please see :ref:`Ragged Tensor tutorial <kv-layout>` for more details.
        We use a ``seg_len`` or ``seg_indptr`` tensor (either would work) to indicate the start and end of each segment,
        where the ``seg_indptr`` is the cumulative sum of the ``seg_lens`` tensor (with an additional 0 at the beginning):

        .. math::

            \text{seg_indptr}[i] = \sum_{j=0}^{i-1} \text{seg_lens}[j], \quad \text{seg_indptr}[0] = 0

        - If ``seg_lens`` is provided, then :attr:`x` has shape ``(sum(seg_lens), d_in)`` and :attr:`y` has shape
            ``(sum(seg_lens), d_out)``, where ``d_in`` is the number of columns of the input tensor and ``d_out`` is the
            number of columns of the output tensor.
        - If ``seg_indptr`` is provided, then :attr:`x` has shape ``(seg_indptr[-1], d_in)`` and :attr:`y` has shape
            ``(seg_indptr[-1], d_out)``.

        Parameters
        ----------
        x : torch.Tensor
            The input tensor with shape ``(sum(seg_lens), d_in)``.
        weights : torch.Tensor
            The 3D weight tensor with shape ``(num_weights, d_in, d_out)`` if :attr:`weight_column_major` is ``False``,
            or ``(num_weights, d_out, d_in)`` if :attr:`weight_column_major` is ``True``.
        batch_size : int
            The number of segments.
        weight_column_major : bool
            Whether the weight tensor is column major.
        out : Optional[torch.Tensor]
            The output tensor, with shape ``(sum(seg_lens), d_out)``.
            If not provided, a new tensor will be created internally.
        seg_lens : Optional[torch.Tensor]
            The length of each segment, with shape ``(batch_size,)``, expects a 1D tensor of dtype ``torch.int64``.
        seg_indptr : Optional[torch.Tensor]
            The indptr of the segments, with shape ``(batch_size + 1,)``, expects a 1D tensor of dtype ``torch.int64``.
            If this is provided, then :attr:`seg_lens` will be ignored, otherwise ``seg_indptr`` will be computed
            internally from :attr:`seg_lens`.
        weight_indices : Optional[torch.Tensor]
            The indices of the weight tensor to be selected for each segment, with shape ``(batch_size,)``.
            Expects a 1D tensor of dtype ``torch.int64``.
            If this is provided, then the weight tensor will be selected based on the indices in this tensor.

        Returns
        -------
        torch.Tensor
            The output tensor with shape ``(sum(seg_lens), d_out)``.
        Nz1Either seg_lens or seg_indptr should be provided.r   r   r   r   r   z'Output tensor shape mismatch, expected r   r   sm90sm80zUnsupported gemm backend: )r   r6   r  rT   r   r  r  r7   r   r   r   r;   r   r   r5   r  r  r~  r  r  r  r   r{   )rN   r   r  r   ry   r^   r  r  r  cumulative_batch_sizer  r   rx   r{  r   rq   rr   rs   rt   r  r  r  	x_ld_data	w_ld_data	y_ld_datas                            r0   runzSegmentGEMMWrapper.runi  s   R 
 2PQQQ#HKKNN33J!"[%+>>>N !q		#6KQGLLOO;|| $!N		G	+&.i  CC y2E::: m>SUZ=[mmbebkmm   {1AGAHEEE{1CIcjIII<6!!,QX66GGlGf 4#  !""<<,*#     4#  22*#    C'CCDDD
r2   )r   NNNN)rd   re   rf   __doc__rT   rg   strrE  r  r   rh   ri   r   r  ra   rL   r2   r0   r  r    sF       0 0f DJ &+l=@	   ":&+l:JO,:	: : : :  '++/-115h h<h h 	h
 "h el#h 5<(h U\*h !.h 
h h h ^hT GGGr2   r  c                   2    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
S )UIDszUIDs for CUDNN graph tensorsr   r   r   r  r           N)rd   re   rf   r  A_UIDB_UID	ALPHA_UIDBLOCK_DESCALE_A_UIDBLOCK_DESCALE_B_UIDA_SCALE_UIDB_SCALE_UIDO_UIDrL   r2   r0   r  r    s>        &&EEIKKEEEr2   r  c                  2    t           st          d          dS )z7Check if cuDNN is available and raise exception if not.zcuDNN is not available. Please install cuDNN to use FP8 GEMM functions. You can install it with: pip install nvidia-cudnn-cu12 nvidia-cudnn-frontendN)CUDNN_AVAILABLERuntimeErrorrL   r2   r0   _check_cudnn_availabilityr  $  s,     
[
 
 	

 
r2   c                     t                       	 t          j        } t          t          |                     d          dd                   \  }}||fdk     rt          d|  d          n5# t          t          t          t          f$ r}t          d          |d}~ww xY w	 t          j                    }|dk     rt          d	| d
          dS # t          t          f$ r}t          d          |d}~ww xY w)zCCheck if cuDNN FP4 support is available and raise exception if not.r   Nr   )r      z(cuDNN FP4 requires version 1.13+, found zH. Upgrade: pip install --upgrade nvidia-cudnn-cu12 nvidia-cudnn-frontendz<Unable to determine cuDNN version. FP4 requires cuDNN 1.13+.izc z3cuDNN FP4 requires backend version >= 91002, found z. Please upgrade cuDNN backend.zIUnable to determine cuDNN backend version. FP4 requires backend >= 91002.)r  r7  __version__maprh   splitr  ImportErrorAttributeErrorr   
IndexErrorbackend_version	TypeError)version_strr?   r@   er  s        r0   _check_cudnn_fp4_availabilityr  -  sS   '3 1 1# 6 6rr :;;u5>G##Z; Z Z Z   $
 Z@   J
 
	
/11U""1o 1 1 1   #"
 I&   W
 
	s0   AA+ +BBB!,C C5 C00C5c                  H    t          j                    } d}d}| |k    p| |k    S )z;Check if cuBLAS backend for FP4 GEMM is available in cuDNN.ic id )r7  r  )r  CUDNN_VERSION_9_11_1CUDNN_VERSION_9_13_0s      r0   !_is_cublas_fp4_available_in_cudnnr  N  s:     +--O  // 	322r2   streamc                     t           !t                       t          j                    a t          j        t           | j                   t           S )z(Create and return a cached cuDNN handle.)_cudnn_handler  r7  create_handle
set_streamcuda_stream)r  s    r0   _get_cudnn_handler  _  s@     !###+--	]F$6777r2   c                 `    | t           j        t           j        fvrt          d|  d          dS )6Validate that the output dtype is either bf16 or fp16.Unsupported output dtype: zN. Only torch.bfloat16 and torch.float16 are supported for FP8 GEMM operations.NrT   r   rq  r   r  s    r0   _validate_fp8_output_dtyper  i  sJ    U^U]333\ \ \ \
 
 	
 43r2   c                 `    | t           j        t           j        fvrt          d|  d          dS )r  r  zO. Only torch.bfloat16 and torch.float16 are supported for BF16 GEMM operations.Nr  r  s    r0   r   r   r  sJ    U^U]333] ] ] ]
 
 	
 43r2   c                 x   t           j                            |          }t          j        t          |                    5 \  }}|rt          j        j        nt          j        j        }|	                    d| ||          }|	                    d|||          }|	                    d|||t          j
        j                  }|	                    d|||t          j
        j                  }|                    ||d|
gd	          }|                    t          j        j                   |                    |||
dgd
	          }|                    t          j        j                   |                    ||t          j        j        d          }|                    t          j        j                   |}|rt|	                    dddt          j        j                  }|                    d||t          j        j                  }|                    t$          j        j                   |                    d                              d                              |	           |                    t$          j        j                   |                    t$          j        j                   |                    t$          j        j                   |                    t$          j        j                   |                    t$          j        j                   |                                 |                                 |                    t          j        j         t          j        j!        g           |r$tE                      s|#                    dg           |cd d d            S # 1 swxY w Y   d S )NrZ   namer   stride	data_typer[   block_descale_ar  r   r  r  reordering_typeblock_descale_br   	dequant_a
block_sizer  	dequant_bgemmcompute_data_typer  global_scaler   r   r   	scale_mulr  rZ   r[   r  c_finalTeng0)$rT   rU   current_streamr7  graphr  r  FP8_E4M3FP8_E8M0tensortensor_reorderingF8_128x4block_scale_dequantizeset_data_typeFLOATmatmulmulset_uidr  r  valueset_name
set_outputr  r  r  r  r  validatebuild_operation_graphcreate_execution_plans	heur_moder   r   r  deselect_engines)a_shapea_strideb_shapeb_stridea_descale_shapea_descale_strideb_descale_shapeb_descale_strideab_typeo_typer  r;   alpha_is_not_none	use_nvfp4r  r  r!  
scale_typea_cudnn_tensorb_cudnn_tensorblock_descale_a_cudnn_tensorblock_descale_b_cudnn_tensordequant_a_tensordequant_b_tensorc_tensorc_final_cudnn_tensorglobal_scale_cudnn_tensors                              r0   %create_cudnn_execution_plans_fp4_gemmr!  {  s   " Z&&v..F	&v..	/	/ P:E11:XU_--@X
'(g & 
 
 '(g & 
 
 (-||"# !3< (4 (
 (
$ (-||"# !3< (4 (
 (
$ !77(:	 8 
 
 	&&u'<=== 77("A	 8 
 
 	&&u'<===<<#o3	   
 
 	u4555' 	D(-# //	 )5 ) )% $)99 +"'/"7	 $- $ $  &--dn.BCCC%%i00;;DAAOOPVWWWtz/000tz/000$,,T-E-KLLL$,,T-E-KLLL$$TZ%5666##%%%$$eo&79J%KLLL  	-(I(K(K 	-""F8,,,aP P P P P P P P P P P P P P P P P Ps   M!N//N36N3rP   rQ   c                     t          | |||||||||	|
|||          }|                                 |dk    r|                    |           n|                                 |S r   )r!  check_supportbuild_plan_at_indexbuild_plans)r  r  r  r  r  r  r  r  r  r  r  r;   rN  r  rQ   r  s                   r0    build_plans_cudnn_fp4_gemm_graphr&    s    & 2 E" 
||!!&))))Lr2   c	           
      H   t           j        j        |                    t	                                t           j        j        |                    t	                                t           j        j        |t           j        j        |t           j        j        |i}	|1|                    t          j
                  |	t           j        j        <   |                                |                                 k     r8t          j        |                                 |j        t          j                  }t          j                            |j                  }
|dk    r'|                     |	|t)          |
                     d S |                     |	||t)          |
                     d S Nr   rP   handle)r  r  r  r   r   r  r  r  r  rT   floatr  r   get_workspace_sizer   r;   rI  rU   r  executer  execute_plan_at_index)r  rZ   r[   rL  rM  rN  r  r_   rQ   variant_packr  s              r0   execute_cudnn_gemm_fp4_graphr0    s_    	
!&&!5!7!788
!&&!5!7!788 &	 &	
'L -2ZZ-D-DT^)*%":":"<"<<< ;$$&&qxu{
 
 
 Z&&qx00F||l$4=Nv=V=VWWWWW##*F;LV;T;T 	$ 	
 	
 	
 	
 	
r2   c           
      F   t           j        j        |t           j        j        |t           j        j        |t           j        j        |t           j        j        |i}|                                 }	|                                |	k     r&t          j
        |	|j        t          j                  }t          j                            |j                  }
|dk    r'|                     ||t!          |
                     d S |                     |||t!          |
                     d S r(  )r  r  r  r  r  r  r  r,  r   rT   r   r;   rI  rU   r  r-  r  r.  )r  rZ   r[   rL  rM  r  r_   rQ   r/  workspace_sizer  s              r0   execute_cudnn_gemm_mxfp8_graphr3  1  s    	
!
! &	 &	
'L --//N.00 ;185;
 
 
 Z&&qx00F||l$4=Nv=V=VWWWWW##*F;LV;T;T 	$ 	
 	
 	
 	
 	
r2   c                 &   t                       t          j                            |          }t	          j        t          |                    5 \  }	}
|	                    d| ||          }|	                    d|||          }|	                    dddt          j        j	                  }|	                    dddt          j        j	                  }|	
                    d||t          j        j	                  }|                    d	                              t          j        j	                   |	                    d
||t          j        j	                  }|	                    d||t          j        j	                  }|                    d                              d                              |           |                    t           j        j                   |                    t           j        j                   |                    t           j        j                   |                    t           j        j                   |                    t           j        j                   |	                                 |	                                 |	                    t          j        j        t          j        j        g           |	                                 |	                                 |	cddd           S # 1 swxY w Y   dS )a  Build a cuDNN graph for GEMM with per-tensor quantization.

    This function is cached to avoid rebuilding identical graphs.

    Args:
        a_shape: Shape of tensor A
        a_stride: Stride of tensor A
        b_shape: Shape of tensor B
        b_stride: Stride of tensor B
        a_type: Data type for input tensor A
        b_type: Data type for input tensor B
        o_type: Data type for output tensor

    Returns:
        cuDNN graph object
    rZ   r  r[   a_scaler  b_scaler   )r  r   r   r  cscale_mul_ar  scale_mul_br  TN)r  rT   rU   r  r7  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r   FALLBACKr#  r%  )r  r  r  r  a_typeb_typer  r;   r  r  r!  r  r  a_scale_cudnn_tensorb_scale_cudnn_tensorc_cudnn_tensorc_after_scale_a_cudnn_tensorc_after_scale_b_cudnn_tensors                     r0   (build_cudnn_gemm_with_per_tensor_q_graphrB  T  s   ( Z&&v..F	&v..	/	/ 7:E1'(f & 
 
 '(f & 
 
  %||o+	  ,  
  
  %||o+	  ,  
  
 #o3	 & 
 
 	$$225?3HIII',yy"#o3	 (1 (
 (
$ (-yy*"#o3	 (1 (
 (
$ 	%--i88CCDIIWW	
 	
 	
 	tz/000tz/000$$T%5%;<<<$$T%5%;<<<$,,TZ-=>>>##%%%$$eo&79Q%RSSSo7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7s   J*LL
L
c           
         t           j        j        |t           j        j        |t           j        j        |t           j        j        |t           j        j        |i}t          j        	                    |j
                  }t          |          }	|                                |                                 k     r8t          j        |                                 |j
        t          j                  }|                     |||	           d S )Nr   r)  )r  r  r  r  r  r  r  rT   rU   r  r;   r  r   r,  r   rI  r-  )
r  rZ   r[   r5  r6  r  	workspacer/  r  cudnn_handles
             r0   *execute_cudnn_gemm_with_per_tensor_q_graphrF    s     	
!
!
'L Z&&qx00F$V,,L5335555K$$&&qxu{
 
 
	 
MM,	,M?????r2   c                 .   | t           j        k    rt          j        j        S | t           j        k    rt          j        j        S | t           j        k    rt          j        j        S | t           j	        k    rt          j        j
        S t          d|            )NzUnsupported dtype: )rT   r   r7  r  BFLOAT16rq  HALFrJ  r  float8_e5m2FP8_E5M2r   r  s    r0   #_torch_data_type_to_cudnn_data_typerL    s{    ''	%-		##	%%	%	%''	%#	#	#''6u66777r2   rD  r5  r6  torch_out_dtypec                 @   t                       t          |j        |                                |j        |                                t	          |j                  t	          |j                  t	          |          |j                  }t          |||||||            |S r,   )r  rB  r   r  rL  r   r;   rF  )rD  rZ   r[   r5  r6  r^   rM  r  s           r0   _cudnn_gemm_fp8rO    s     4		

		

+AG44+AG44+O<<		 	E /q!WgsI   Jr2   c                  <     G d dt                     }  |             S )Nc            	           e Zd Zdeej                 dedee         fdZ	 	 ddeej                 dede	dej        fd	Z
d
S )2_cudnn_gemm_fp8_runner.<locals>.CudnnFp8GemmRunnerrG   rH   rI   c                     dgS rK   rL   rM   s      r0   rO   zD_cudnn_gemm_fp8_runner.<locals>.CudnnFp8GemmRunner.get_valid_tactics  s     3Jr2   rP   FrQ   rR   c           	      L    |\  }}}}}	}
t          |
|||||	|	j                   |	S r,   )rO  r   rN   rG   rQ   rR   rX   rZ   r[   r\   r]   r^   r_   s              r0   ra   z:_cudnn_gemm_fp8_runner.<locals>.CudnnFp8GemmRunner.forward  s:     =C9Aq'7C)9,aGWc39UUUJr2   Nrb   rc   rL   r2   r0   CudnnFp8GemmRunnerrR    s        	&	 )	 #Y		 	 	 	 #(			 		&		 		 !			 \		 		 		 		 		 		r2   rV  rl   )rV  s    r0   r8  r8    s=        ]   ( r2   c                    |                      d          dk    }t          | j                  }t          |                                            }t          |          dk    r>|                    dd           |                    d|                                            ||rdndxx         dz  cc<   |rC|dxx         dz  cc<   t          t          |          dz
            D ]}||xx         dz  cc<   n2t          t          |          dz
            D ]}||xx         dz  cc<   t          |          t          |          fS )Nr   r   r   r   rP   )r  r  r   leninsertr   r  tuple)packed_fp4_tensoris_column_major
real_shapereal_strideis        r0   %_get_real_fp4_shape_from_packed_uint8r`    sp   '..r22a7O'-..J(//1122K :!!Q1/5577888 _,rr"---2---  B1s;''!+,, 	  	 ANNNaNNNN	  s;''!+,, 	  	 ANNNaNNNN*u[1122r2   c                    t          | j                  }t          |                                           }t          |          dk    r|                    d|           |                    dd           |                     d          dk    }|rdnd}||         |z  dk    sJ ||         |z  ||<   ||         ||         z  |d<   n't          |          dk    rnt          d| d          t          |          t          |          fS )Nr   r   r   r   r  z&Unsupported block scale tensor shape: z, expected 2d or 3d.)r  r   r  rX  rY  r   rZ  )block_scale_tensorr   block_scale_shapeblock_scale_strider\  
expand_dims         r0    _expand_block_scale_tensor_shaperf    s@   /56607799::
""  J///!!!Q''' -33B771<)0QQq
 ,z9Q>>>>(9*(E(S*%z*->z-JJ 	1 
		1	$	$\5F\\\
 
 	
 #$$e,>&?&?@@r2   trtllm_low_latencyrN  c                 
   t           j        f}d}|dk    r| j        d         }|j        d         }	nt          d| d| d          |9||vrt          d	| d| d          t          j        ||	f| j        |
          }n|j        |vrt          d	|j         d| d          |j        | j        d         |j        d         fk    r3t          d| j        d         |j        d         f d|j         d          |j        | j        k    r t          d| j         d|j         d          |&|j        |k    rt          d| d|j         d          |dk    rt          | |||           nt          d| d| d          |S )aV  FP8 matrix multiplication.

    Parameters
    ----------
    a: torch.Tensor
        Input tensor, shape (m, k), fp8 e4m3.

    b: torch.Tensor
        - When using "trtllm_low_latency" backend,
          Weight tensor, shape (k // block_size, n, block_size), fp8 e4m3
          B needs to be pre-processed using `prepare_low_latency_gemm_weights`.
          block_size is 128 for e4m3.

    alpha: Optional[torch.Tensor]
        Scale tensor for the output, float. If None, defaults to 1.0 for no scaling.

    out_dtype: torch.dtype
        Output tensor data type. Default is torch.bfloat16.

    out: Optional[torch.Tensor]
        Output tensor, shape (m, n). If None, a new tensor will be allocated.

    backend: Literal["trtllm_low_latency"]
        Backend to use for computation. Default is "trtllm_low_latency".
        - "trtllm_low_latency": optimized for small M dimension.

    Returns
    -------
    torch.Tensor
        Output tensor of shape (m, n) with dtype `out_dtype`.

    Examples
    --------
    >>> import torch
    >>> from flashinfer import mm_fp8, prepare_low_latency_gemm_weights
    >>> m = 16
    >>> n = 2560
    >>> k = 32768
    >>> a = torch.randn([m, k], device="cuda", dtype=torch.bfloat16)
    >>> a_fp8, a_inv_s = to_float8(a, dtype=torch.float8_e4m3fn)
    >>> b = torch.randn([n, k], device="cuda", dtype=torch.bfloat16)
    >>> b_fp8, b_inv_s = to_float8(b, dtype=torch.float8_e4m3fn)
    >>> prepared_b = prepare_low_latency_gemm_weights(b_fp8)
    >>> alpha = a_inv_s * b_inv_s
    >>> out = mm_fp8(a_fp8, prepared_b, alpha)
    >>> out.shape
    torch.Size([16, 2560])
    )rg  rg  r   r   zUnsupported backend: z. Only z' are supported for FP8 GEMM operations.Nr  r   r   r   r   r   r   )rT   r   r   r   r   r;   r   r
   )
rZ   r[   rN  r   r^   r   supported_out_dtypessupported_backendsmns
             r0   mm_fp8rm  1  sx   t "N,0&&&GAJGAJPG P P&P P P
 
 	
 {000VY V V,V V V   kF8
 
 
 9000VSY V V,V V V   9QWQZ000]171:qwqz2J]]QTQZ]]]   :!!QAHQQCJQQQ    SY)%;%;P9PPCIPPP   &&&1eS1111PG P P&P P P
 
 	
 Jr2      rL  rM  r  r  c
                 ,   t          |           \  }
}t          |          \  }}|
d         }t          ||          \  }}t          ||          \  }}t          |
|||||||t          j        j        t          |          || j        |d u||	          }|S )Nr   rQ   )r`  rf  r&  r7  r  FP4_E2M1rL  r;   )rZ   r[   rL  rM  rN  r   r^   r  r  rQ   real_a_shapereal_a_stridereal_b_shapereal_b_stridebatchexpanded_a_descale_shapeexpanded_a_descale_strideexpanded_b_descale_shapeexpanded_b_descale_strider  s                       r0   _get_cudnn_fp4_gemm_graphr{    s     #H"J"JL-"G"J"JL-OE(E:: 87 	)E:: 87 - ! ! +I66	T  E" Lr2   c                 h    t          | |||||||||

  
        }t          || ||||||	|
	  	         d S )N
rZ   r[   rL  rM  rN  r   r^   r  r  rQ   rp  )r{  r0  )rZ   r[   rL  rM  rN  r   r^   r  r  r_   rQ   r  s               r0   _cudnn_gemm_fp4r~    sn     &

  E !q!Y	5#7GPV     r2   c                  <     G d dt                     }  |             S )Nc            	           e Zd Zdeej                 dedee         fdZ	 	 ddeej                 dede	dej        fd	Z
d
S )2_cudnn_gemm_fp4_runner.<locals>.CudnnFp4GemmRunnerrG   rH   rI   c                     |\
  }}}}}}}	}
}}t          |||||||	|
|d
  
        }|                                }t          t          |                    S )NrP   r}  )r{  get_execution_plan_countr  r  )rN   rG   rH   rZ   r[   rL  rM  rN  r   r^   r  r  r_   r  	num_planss                  r0   rO   zD_cudnn_gemm_fp4_runner.<locals>.CudnnFp4GemmRunner.get_valid_tactics  s    "   .###%#  E 6688Ii(()))r2   rP   FrQ   rR   c                 T    |\
  }}}}}	}
}}}}t          |||||	|
|||||           d S )Nrp  )r~  )rN   rG   rQ   rR   rX   rZ   r[   rL  rM  rN  r   r^   r  r  r_   s                  r0   ra   z:_cudnn_gemm_fp4_runner.<locals>.CudnnFp4GemmRunner.forward	  sl    $        r2   Nrb   rc   rL   r2   r0   CudnnFp4GemmRunnerr    s        "	*&"	* )"	* #Y	"	* "	* "	* "	*N #(		 	&	 	 !		 \	 	 	 	 	 	r2   r  rl   )r  s    r0   _cudnn_gemm_fp4_runnerr    sF    D D D D D] D D DL r2   r   use_8x4_sf_layout)r7  trtllmr   r   c                    | j         dk    s|j         dk    rt          d| j         d|j                   | j        d         |j        d         k    r+t          d| j        d          d|j        d                    | j        t          j        t                      hvs!|j        t          j        t                      hvr t          d| j         d|j         d	          |j        t          j        t          j        hvs|j        t          j        t          j        hvr t          d
|j         d|j         d	          |,|j        t          j        k    rt          d|j                   |<|	                                dk    r$t          d|	                                           |t          j
        t          j        fvrt          d| d          |
r|dk    rt          d          |
s|dk    rt          d          dS )Nr   zmm_fp4 accepts 2d tensors, got rl  r   r   z1K dimension mismatch in mm_fp4. got a.shape[1] = z, b.shape[0] = z:a and b must have float4_e2m1fn_x2 packed into uint8. Got r   zIa_descale and b_descale must have float8_e4m3fnx2 packed into uint8. Got z"alpha must be a float tensor, got zalpha must be a scalar, got r  zN. Only torch.bfloat16 and torch.float16 are supported for FP4 GEMM operations.rn  z$nvfp4 only supports block_size = 16.    z$mxfp4 only supports block_size = 32.T)ndimr   r   r   rT   rI  r   rJ  r+  r   r   rq  rZ   r[   rL  rM  rN  r   r^   r  r  r   r  s              r0   _check_mm_fp4_problem_sizer  9	  sj    	v{{afkkR17RRRRSSSwqzQWQZg
gg[\[bcd[egg
 
 	
 	wu{$8$:$:;;;qwO @ @ ,7, ,!", , ,
 
 	
    
!4ek B	B	B<?< <)2< < <
 
 	
 U[EK77KekKKLLLU[[]]a//GGGHHH777\ \ \ \
 
 	

  AZ2%%?@@@ Ar))?@@@4r2   )r   r   n   x   y   c                    |rt          d          |
sAt          | j        dg          r+t          j                    dk     rt          t                    t                       t          |           \  }}t          |          \  }}|d         }t          ||          \  }}t          ||          \  }}t          ||||||||t          j        j        t          |          || j        ||
          }|                                 dS )N6Only TRTLLM FP4 GEMM supports 8x4 scale factor layout.120ie r   T)r   rB   r;   r7  r  r   )CUDNN_FP4_MXFP4_SM120_CUDNN_VERSION_ERRORr  r`  rf  r!  r  rq  rL  r#  )rZ   r[   rL  rM  rN  r   r^   r  r  r   r  rr  rs  rt  ru  rv  rw  rx  ry  rz  r  s                        r0   _cudnn_gemm_fp4_requirementr  q	  s2     SQRRRFah00F !##e++DEEE!###
 #H"J"JL-"G"J"JL-OE(E:: 87 	)E:: 87 2 ! ! +I66	 E  
4r2   c                 n    |
st          d          |t          j        k    rt          d| d          dS )N9Only cudnn and auto FP4 GEMM supports mxfp4 quantization.r  zB. Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations.T)r   rT   r   r  s              r0   _trtllm_gemm_fp4_requirementr  	  s\      VTUUUEN""P P P P
 
 	
 4r2   c                 J    |rt          d          |
st          d          dS )Nr  r  Tr   r  s              r0   _cutlass_gemm_fp4_requirementr  	  s9      SQRRR VTUUU4r2   r7  c                      t                      j        }t          r |dk    rt          j                    dk    rd}nd} fd|D             S )a  
    Heuristic function for mm_fp4 backend selection. Routes to either cudnn or cutlass.
    Note: trtllm is not considered in the backend selection because it requires a specific
    input quantization (swizzling/shuffling) that differs from the preparation used
    for cudnn and cutlass backends.

    Logic for which comes first:
    - If cuda version is 12 - use cutlass.
    - If cuda version is 13 and cudnn version is less than 9.15 - use cutlass.
    - If cuda version is 13 and cudnn version is 9.15 or greater - use cudnn.

    r  ile )r7  r   )r   r7  c                     g | ]}|v |	S rL   rL   )r.   r7  r   s     r0   
<listcomp>z*_heuristic_func_mm_fp4.<locals>.<listcomp>	  s$    DDD!Q2C-C-CA-C-C-Cr2   )r)   r?   r  r7  r  )r   rZ   r[   rL  rM  rN  r   r^   r  r  r   r  
cuda_majorcandidate_backendss   `             r0   _heuristic_func_mm_fp4r  	  si    4 "##)J  2:++0E0G0G50P0P1 2 EDDD)DDDDr2   c                     | |z   dz
  |z  |z  S r   rL   )r   rn   s     r0   _pad_upr  
  s    UQY1!!r2   c                 :    t          | d         d         d          S )Nr      r  r  s    r0   r  r  
  s    76!9Q<33 r2   r  c                     | d         d         S rK   rL   r  s    r0   r  r  
      6!9Q< r2   c                 :    t          | d         d         d          S )Nr   r   r  r  s    r0   r  r  +
  s    76!9Q<55 r2   c                     | d         d         S rK   rL   r  s    r0   r  r  0
  r  r2   r7  r  r   c           
         |4t          j        | j        d         |j        d         f| j        |          }t	          dt
          | j                  }|	dk    rt          j        }n|	g}t          | j                  \  }d fdfd	d
fd|D             }t          j
                    }rt          nt          }| ||||||||
|g
}|                    d|||          \  }} |||           |S )a  MM FP4

    Parameters
    ----------
    a: torch.Tensor
        Input tensor, shape (m, k), fp4 e2m1fn_x2 or uint8.

    b: torch.Tensor
        Mat2 tensor, shape (k, n), should be column major, fp4 e2m1fn_x2 or uint8.

    a_descale: torch.Tensor
        Block scale tensor for A, shape (m, k // block_size), float8_e4m3fn or uint8.

    b_descale: torch.Tensor
        Block scale tensor for B, shape (k, n // block_size), float8_e4m3fn or uint8.

    alpha: Optional[torch.Tensor]
        Global scale tensor, float scalar.

    out_dtype: torch.dtype
        Output dtype, bf16 or fp16. When ``backend="trtllm"``, only ``bf16`` is supported.

    out: Optional[torch.Tensor]
        Out tensor, shape (m, n), bf16 or fp16, defaults to ``None``.

    block_size: int
        Block size for FP4 quantization, only 16 and 32 are supported. 16 in case of nvfp4 quantization. 32 in case of mxfp4 quantization.

    use_8x4_sf_layout: bool
        Whether to use 8x4 scale factor layout or 128x4 scale factor layout, defaults to False.

    backend: Literal["cudnn", "trtllm", "cutlass", "auto"]
        Backend to use, defaults to ``"auto"``, which automatically selects the best
        backend between ``"cudnn"`` and ``"cutlass"`` based on the current CUDA and
        cuDNN versions. The ``"trtllm"`` backend is never selected when
        ``backend="auto"`` because it requires different weight preparation.

    use_nvfp4: bool
        Whether to use nvfp4 quantization or mxfp4 quantization, defaults to ``True``.
        See the ``block_size`` parameter for related constraints.

    Notes
    -----
    When cudnn/cutlass backend is used, both a and b should quantized with nvfp4_quantize using the 128x4 scale factor layout and do_shuffle=False.
    When trtllm backend is used, b must be quantized with 128x4 layout and `do_shuffle=True`. a can be quantized with either 128x4 or 8x4 layout (controlled by `use_8x4_sf_layout`) and `do_shuffle=False`.

    Returns
    -------
    out: torch.Tensor
        Out tensor, shape (m, n), bf16 or fp16.

    Examples
    --------
    >>> import torch
    >>> from flashinfer import nvfp4_quantize, mm_fp4, SfLayout
    >>> a = torch.randn([48, 128], device="cuda", dtype=torch.bfloat16)
    >>> b = torch.randn([256, 128], device="cuda", dtype=torch.bfloat16)
    >>> a_global_sf = (448 * 6) / a.float().abs().nan_to_num().max()
    >>> b_global_sf = (448 * 6) / b.float().abs().nan_to_num().max()
    >>> a_fp4, a_sf = nvfp4_quantize(a, a_global_sf, sfLayout=SfLayout.layout_128x4, do_shuffle=False)
    >>> b_fp4, b_sf = nvfp4_quantize(b, b_global_sf, sfLayout=SfLayout.layout_128x4, do_shuffle=True)
    >>> out = mm_fp4(a_fp4, b_fp4.T, a_sf, b_sf.T, 1.0/(a_global_sf * b_global_sf), torch.bfloat16, None, backend="trtllm")
    >>> out.shape
    torch.Size([48, 256])
    Nr   r   r   mm_fp4_workspacer   c                      t                      S r,   )r  rL   r2   r0   r  zmm_fp4.<locals>.<lambda>
  s    /11 r2   c                  F    t                                                     S r,   )get_trtllm_fp4_gemm_moduletrtllm_fp4_gemm_runner)r  s   r0   r  zmm_fp4.<locals>.<lambda>
  s!    466MM
 
 r2   c                  F    t                                                     S r,   )r\  rP  )r?   s   r0   r  zmm_fp4.<locals>.<lambda>
  s    6u==UUWW r2   r  c                 0    g | ]} |                     S rL   rL   )r.   cur_backendbackend_to_runner_factorys     r0   r  zmm_fp4.<locals>.<listcomp>
  s(    TTTK5(577TTTr2   rC  r)  )rT   r   r   r;   r4   r   mm_fp4r   r:   r   r,  _MM_FP4_TUNING_CONFIG_8x4_MM_FP4_TUNING_CONFIG_128x4r-  )rZ   r[   rL  rM  rN  r   r^   r  r  r   r  r_   r   r!  r/  r1  rs  rG   r2  rQ   r  r?   s           `           @@r0   r  r  6
  so   t {kWQZ$8
 
 
 &2AH 
 &09 &ah//HE1 21
 
 
 
 XWWW! ! UTTT8TTTG MOOE &7W!!<W 
 	
	F %%	 NFF F&((((Jr2   )Y   Z   r   r   r  r  r6  A_scaleB_scale)r7  r6  r   r   c                 "    t                       dS r   r  r   r   r  r  r   r^   r   s          r0   _cudnn_bmm_fp8_requirementr  
       4r2   c                     dS r   rL   r  s          r0   _cublas_bmm_fp8_requirementr  
  s	     4r2   c                 x    | j         t          j        k    s|j         t          j        k    rt          d          dS )Nz6e5m2 is not supported for bmm_fp8 with cutlass backendT)r   rT   rJ  r   r  s          r0   _cutlass_bmm_fp8_requirementr  
  s7     	w%###qw%2C'C'CQRRR4r2   c                 $    t          |           dS r   )r  r  s          r0   _check_bmm_fp8_problem_sizer  
  s     u%%%4r2   c                    |j         t          j        k    p|j         t          j        k    }t          |j        g d          }	t          |j        ddg          }
g }d| v r1|s/|	r|                    d           n|
r|                    d           d| v r|                    d           t          rd| v r|                    d           |S )	N)rj  rk  110r  121r   r4  r5  r6  r7  )r   rT   rJ  rB   r;   r   r  )r   r   r   r  r  r   r^   r   is_e5m2is_sm_supportedis_sm120_supportedr   s               r0   _heuristic_func_bmm_fp8r    s     g**Jag9J.JG'2G2G2GHHO*18eU^DD %%%g% 	7%%o6666 	7%%o666$$$!!(+++ +7&777!!'***r2   )r7  r6  r   c           
      t   |@t          j        | j        d         | j        d         |j        d         f| j        |          }t	          dt
          | j                  }|dk    rt          j        }n2|dk    rt          dg| ||||||          }n|d	k    rt          rd	g}n|g}t          | ||||||           |S )
ax  BMM FP8

    Parameters
    ----------
    A: torch.Tensor
        Input tensor, shape (b, m, k), fp8 e4m3 or fp8 e5m2.

    B: torch.Tensor
        Mat2 tensor, shape (b, k, n), should be column major, fp8 e4m3 or fp8 e5m2.

    A_scale: torch.Tensor
        Scale tensor for A, float.

    B_scale: torch.Tensor
        Scale tensor for B, float.

    dtype: torch.dtype
        out dtype, bf16 or fp16.

    out: Optional[torch.Tensor]
        Out tensor, shape (b, m, n), bf16 or fp16, defaults to ``None``.

    backend: Literal["cudnn", "cublas", "cutlass", "auto"]
        The backend to use for the operation. Defaults to ``"cublas"``.
        ``"auto"`` allows selecting the best tactic from all available backends when autotune is enabled.

    Returns
    -------
    out: torch.Tensor
        Out tensor, shape (b, m, n), bf16 or fp16.

    Examples
    --------
    >>> import torch
    >>> import torch.nn.functional as F
    >>> import flashinfer
    >>> def to_float8(x, dtype=torch.float8_e4m3fn):
    ...     finfo = torch.finfo(dtype)
    ...     min_val, max_val = x.aminmax()
    ...     amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
    ...     scale = finfo.max / amax
    ...     x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
    ...     return x_scl_sat.to(dtype), scale.float().reciprocal()
    >>>
    >>> input = torch.randn([16, 48, 64], device="cuda", dtype=torch.bfloat16)
    >>> input_fp8, input_inv_s = to_float8(input, dtype=torch.float8_e4m3fn)
    >>> # column major weight
    >>> weight = torch.randn([16, 80, 64], device="cuda", dtype=torch.bfloat16).transpose(-2, -1)
    >>> weight_fp8, weight_inv_s = to_float8(weight, dtype=torch.float8_e4m3fn)
    >>> out = flashinfer.bmm_fp8(input_fp8, weight_fp8, input_inv_s, weight_inv_s, torch.bfloat16)
    >>> out.shape
    torch.Size([16, 48, 80])
    >>> out.dtype
    torch.bfloat16
    Nr   r   r   r   bmm_fp8_workspacer   r   r7  )rT   r   r   r;   r4   r   rW   r   r  r  r;  )	r   r   r  r  r   r^   r   r_   r   s	            r0   rW   rW   !  s    V {kWQZQWQZ08
 
 
 &3QX  &1	I		*KAwW
 
 
G			991a'30@(KKKJr2   )r   r   r  r  r   r   r   r   scale_major_moder   Kmma_smscale_granularity_mnk)r   r  c
                 (    |t          d          dS )Nz'scale_major_mode is required in CUTLASSTr  
rZ   r[   r5  r6  r  r  r  r^   r   r   s
             r0   *_cutlass_gemm_fp8_nt_groupwise_requirementr    s     BCCC4r2   c
                 p    |dk    rt          d          | j        d         dk     rt          d          dS )Nr  z5scale_granularity_mnk must be (1, 128, 128) in TRTLLMr      z#a.shape[1] must be >= 256 in TRTLLMT)r   r   r  s
             r0   )_trtllm_gemm_fp8_nt_groupwise_requirementr    sC     --PQQQwqzC>???4r2   c
                 L   | j         dk    s|j         dk    rt          d| j         d|j                   | j        d         |j        d         k    r+t          d| j        d          d|j        d                    ||pt          j        }n|j        }t          |           dS )Nr   zShape mismatch. a.shape = z, b.shape = r   Shape mismatch. a.shape[1] = z, b.shape[1] = T)r  r   r   rT   r   r   r  r  s
             r0   )_check_gemm_fp8_nt_groupwise_problem_sizer    s     	v{{afkkTagTT17TTUUUwqzQWQZSAGAJSSqwqzSS
 
 	
 {/		I	y)))4r2   )r   c
                 r   t          dt          | j                  }
||pt          j        }n|j        }|3t          j        | j        d         |j        d         | j        |          }|	dk    rt          | j                  st          | j                  r" t                      j        |
| ||||g||R   n~t          | j                  r$ t                      j        |
| ||||g|||R   nFt          d| j                   |	dk    r)t                                          |
| |||d|dd		  	         |S )
a	  Performs matrix multiplication with FP8 data types using groupwise scaling.

    This function implements a GEMM operation that allows for fine-grained control over
    scale granularity across different dimensions. Currently only supported on NVIDIA
    Blackwell architecture.

    Parameters
    ----------
    a: torch.Tensor
        Row-major input tensor shape (m, k), fp8 e4m3 or fp8 e5m2.

    b: torch.Tensor
        Column-major input tensor shape (n, k), fp8 e4m3 or fp8 e5m2.

    a_scale: torch.Tensor
        if the backend is ``cutlass``:
            Column-major scale tensor for a, shape ``(m, k // block_size)`` if scale_major_mode is ``K``
            or shape ``(k // block_size, m)`` if scale_major_mode is ``MN``
        if the backend is ``trtllm``:
            scale_major_mode should be None, the scale tensor should be (m, k // block_size),
            contiguous on the first dimension

    b_scale: torch.Tensor
        if the backend is ``cutlass``:
            Row-major scale tensor for b, shape ``(n // block_size, k // block_size)`` if scale_major_k is ``K``
            or shape ``(k // block_size, n // block_size)`` if scale_major_mode is ``MN``
        if the backend is ``trtllm``:
            scale_major_mode should be None, the scale tensor should be (k // block_size, n // block_size),
            contiguous on the first dimension

    scale_granularity_mnk: Tuple[int, int, int]
        The granularity of the scale tensor, (m_granularity, n_granularity, k_granularity).

    scale_major_mode: Literal["MN", "K"]
        The layout mode of scale tensor, `MN` for MN-major scale with shape of
        ``(k // block_size, *)`` and `K` for K-major scale with shape of
        ``(*, k // block_size)``

    mma_sm: int
        How many SMs to use for the MMA operation, must be 1 or 2.
        2 is faster when number of rows (M) per group is large (>= 256).

    out: Optional[torch.Tensor]
        Output tensor, shape (m, n). If not specified, we will create an output tensor explicitly.

    out_dtype: Optional[torch.dtype]
        If out is not specified, we will create an output tensor with this dtype.
        Defaults to ``torch.bfloat16``.

    backend: Literal["cutlass", "trtllm"]
        The backend to use for the operation. Defaults to ``"cutlass"``.

    Returns
    -------
    out: torch.Tensor
        Output tensor, shape (m, n).

    Notes
    -----
    The ``m`` should be padded to a multiple of 4 before calling this function, to accommodate the kernel's requirement.
    gemm_fp8_nt_groupwise_workspaceNr   r   r   z!Unsupported device for FP8 GEMM: r  FrP   )r4   r   r;   rT   r   r   r   r   r   r   r   r   r   r   r   r   trtllm_gemm)rZ   r[   r5  r6  r  r  r  r^   r   r   r_   s              r0   r   r     s   d &)+A18  {/		I	 {kGAJGAJ8	
 
 
 )qx(( 	M,?,I,I 	M9!##9 	 '	 !	 	 	 	 	 !** 	M9!##9 
 '
 !
 
 
 
 
 
 KKKLLL	H		  ,,
	
 
	
 
	
 Jr2   c                      t                      } |                                 t          |                                            ddt          ffd}t          |          S )NTr  c                 D     G fddt                     } ||           S )Nc                       e Zd Zddef fdZdeej                 dedee	         f fdZ
	 	 ddeej                 d
e	defdZdS )Wget_trtllm_fp4_gemm_module.<locals>.trtllm_fp4_gemm_runner.<locals>.TrtllmFp4GemmRunnerTr  c                 .    j         | _        || _        d S r,   )r  rD  _use_8x4_sf_layout)rN   r  r   s     r0   rE  z`get_trtllm_fp4_gemm_module.<locals>.trtllm_fp4_gemm_runner.<locals>.TrtllmFp4GemmRunner.__init__g  s    (*%*;'''r2   rG   rH   rI   c           
      $   d}d}|                                 |         }|                                 |         }|d         }|d         }|d         dz  }	|\
  }}}
}}}}}}}d}d}t                              |||	||| j                            S )Nr   r   r   )get_opt_shapesr  trtllm_gemm_tacticsr  )rN   rG   rH   rr  b_tensor_indexrZ   r[   rk  rl  krL  rM  rN  r!  r^   r_   	type_e2m1	type_bf16r   s                     r0   rO   ziget_trtllm_fp4_gemm_module.<locals>.trtllm_fp4_gemm_runner.<locals>.TrtllmFp4GemmRunner.get_valid_tacticsk  s    
 "#!"**,,^<**,,^<aDaDaD1H $		**1aIt7N   r2   rP   FrQ   rR   c                 x    |\
  }}}}}	}
}}
}
}|                      |||j        ||j        |	|| j        |	  	         |S r,   )rD  rK  r  )rN   rG   rQ   rR   rX   rZ   r[   rL  rM  rN  r!  r^   r_   s                r0   ra   z_get_trtllm_fp4_gemm_module.<locals>.trtllm_fp4_gemm_runner.<locals>.TrtllmFp4GemmRunner.forward  sk    $ $%%$CK+
 
 
 
r2   NTrb   )rd   re   rf   ri   rE  r   rT   rg   r   rh   rO   ra   )r   s   r0   TrtllmFp4GemmRunnerr  f  s        < <$ < < < < < <U\* - c	     H !',	 U\*  !%	     r2   r  rl   )r  r  r   s     r0   r  z:get_trtllm_fp4_gemm_module.<locals>.trtllm_fp4_gemm_runnere  sV    D	 D	 D	 D	 D	 D	 D	- D	 D	 D	L #"#4555r2   )r  r  )r%   r~   r3   r   ri   r   )r   r  r   s     @r0   r  r  _  s    
$
&
&C					Bs++--...G6 G6$ G6 G6 G6 G6 G6 G6T 5   r2   r   c                 j    t          | |||||d||d
  
         t          | |||||d||d
  
         dS )Nr   r   r   r   )r  r^   r   r   T)r  r  rZ   r[   r5  r6  r  r  r^   r   s           r0   +_check_gemm_fp8_nt_blockscaled_problem_sizer    sw     .		-    /		-    4r2   c                 2    t          | |||d||||	  	        S )zPerforms matrix multiplication with FP8 data types using block-scaled scaling.

    Block-scaled scaling is a special case of groupwise scaling where the scale granularity
    is (128, 128, 128).
    r  )r  r  r  r^   r   )r   r  s           r0   gemm_fp8_nt_blockscaledr    s6    ( !		-)
 
 
 
r2   m_indptrc
                    | j         t          j        t          j        fvrt	          d| j                    |j         t          j        t          j        fvrt	          d|j                    |j         t          j        fvrt	          d|j                    |j         t          j        fvrt	          d|j                    |j         t          j        fvrt	          d|j                    |dvrt	          d|           |dvrt	          d	|           |j        d
         }
|j        d         }||	t          j        }	nm|	|j         }	|j        | j        d         |
fk    r't	          d|j         d| j        d         |
f           |j         |	k    rt	          d|j          d|	           t          |	           | j        d
         |k    r t	          d| j        d
          d|           |
dz  dk    rt	          d|
           |dz  dk    rt	          d|           |j        d         d
z
  }t          | j                  st          | j                  r|d
k    rt          d          dS )Nz#a must be a float8 tensor, but got z#b must be a float8 tensor, but got z*a_scale must be a float32 tensor, but got z*b_scale must be a float32 tensor, but got )m_indptr must be a int32 tensor, but got r  z5scale_major_mode must be either 'MN' or 'K', but got r   r   &mma_sm must be either 1 or 2, but got r   r   r   zShape mismatch. out.shape = z, (a.shape[0], n) = zdtype mismatch. out.dtype = z, out_dtype = r  z, k = r  z#n must be a multiple of 8, but got rn  z$k must be a multiple of 16, but got zRgroup_gemm_fp8_nt_groupwise has correctness issues for num_groups > 1 on SM120/121T)r   rT   rJ  rJ  r   float32r  r   r   r  r   r;   r   r  )rZ   r[   r5  r6  r  r  r  r  r^   r   rl  r  
num_groupss                r0   /_check_group_gemm_fp8_nt_groupwise_problem_sizer    s    	wu*E,=>>>HqwHHIIIwu*E,=>>>HqwHHIII}U]O++UgmUUVVV}U]O++UgmUUVVV~ek]**UX^UUVVV{**VDTVV
 
 	
 VJ&JJKKK 	

A	
A
{I	I9Q''_sy__qwWXz[\o__   9	!!SsySS	SS   y)))wqzQNNN1NNOOO1uzzBqBBCCC2v{{CCCDDD"Q&J18$$ (;AH(E(E >>d   4r2   c
                 F   t          dt          | j                  }
t          dt          | j                  }||	t          j        }	n	|	|j        }	|j        d         }|j        d         }| j        d         |f}|t          j        ||	| j                  }t          | j                  st          | j                  r& t                      j        |
|| |||||||g
||R   n;t          | j                  r' t                      j        |
|| |||||||g
|||R   |S )a  Perform group GEMM with FP8 data types using groupwise scaling. Currently only supported on NVIDIA
    Blackwell architecture.

    Parameters
    ----------
    a: torch.Tensor
        Row-major input tensor shape ``(cum_m, k)``, data type is ``torch.float8_e4m3fn`` or ``torch.float8_e5m2``.
        ``cum_m`` is the cumulative sum of the segment lengths.

    b: torch.Tensor
        Column-major input tensor shape ``(batch_size, n, k)``, data type is ``torch.float8_e4m3fn`` or ``torch.float8_e5m2``.

    a_scale: torch.Tensor
        Column-major scale tensor for a, shape ``(cum_m, k // block_size)`` if scale_major_mode is ``K``
        or shape ``(k // block_size, cum_m)`` if scale_major_mode is ``MN``, data type is ``torch.float32``.

    b_scale: torch.Tensor
        Row-major scale tensor for b, shape ``(batch_size, n // block_size, k // block_size)`` if scale_major_mode is ``K``
        shape ``(batch_size, k // block_size, n // block_size)`` if scale_major_mode is ``MN``, data type is ``torch.float32``.

    m_indptr: torch.Tensor
        The indptr of the segment lengths, shape ``(batch_size + 1,)``, data type is ``torch.int32``.
        Element element in ``m_indptr`` must be a multiple of 4.

    scale_granularity_mnk: Tuple[int, int, int]
        The granularity of the scale tensor, (m_granularity, n_granularity, k_granularity).

    scale_major_mode: Literal["MN", "K"]
        The layout mode of scale tensor, `MN` for MN-major scale with shape of
        ``(k // block_size, *)`` and `K` for K-major scale with shape of
        ``(*, k // block_size)``

    mma_sm: int
        How many SMs to use for the MMA operation, must be 1 or 2.
        2 is faster when number of rows (M) per group is large (>= 256).

    out: Optional[torch.Tensor]
        The output tensor, shape ``(cum_m, n)``. If not specified, we will create an output tensor explicitly.

    out_dtype: Optional[torch.dtype]
        The data type of the output tensor, must be ``torch.bfloat16`` or ``torch.float16``.

    Returns
    -------
    out: torch.Tensor
        The output tensor, shape ``(cum_m, n)``.

    Notes
    -----
    Each value in ``m_indptr`` should be padded to a multiple of 4 before calling this function,
    to accommodate the kernel's requirement.
    )group_gemm_fp8_nt_groupwise_int_workspace+group_gemm_fp8_nt_groupwise_float_workspaceNr   r   r   r   )r4   r   r;   rT   r   r   r   r   r   r   r   group_gemm_fp8_nt_groupwiser   r   )rZ   r[   r5  r6  r  r  r  r  r^   r   rw  r  rl  r  	out_shapes                  r0   r  r  B  s   J *35KQX  ,57Mqx  {I	I	
A	
AQI
{k)9QXFFF18$$ 
(;AH(E(E 
;; "	
 #	
 	
 	
 	
 	
 	
 
QX	&	& 
;; "	
 #	
 	
 	
 	
 	
 	
 Jr2   r   tile_mtile_ntile_kswap_abc                    | j         t          j        t          j        fvrt	          d| j                    |j         t          j        k    rt	          d|j                    |j         t          j        k    rt	          d|j                    |j         t          j        k    rt	          d|j                    |j         t          j        k    rt	          d|j                    |dvrt	          d|           |dvrt	          d	|           |d
vrt	          d|           |dvrt	          d|           |	dvrt	          d|	           |
|t          j        }n	||
j         }|t          j        t          j        fvrt	          d|           |j	        d         dz
  }|j	        d         |k    r t	          d|j	        d          d|           |j	        d         }|j	        d         dz  }| j	        d         |k    r t	          d| j	        d          d|           d}d}||z  dk    rt	          d| d|           ||z  dk    rt	          d| d|           | j	        d         |f}|
J|
j	        |k    rt	          d| d|
j	                   |
j         |k    rt	          d | d|
j                    d!S )"Nz9a must be a float8_e4m3fn or float8_e5m2 tensor, but got z"b must be a uint8 tensor, but got z(a_scale must be a uint8 tensor, but got z(b_scale must be a uint8 tensor, but got r  r  r  )r   ztile_m must be 128, but got )@   r      r  z3tile_n must be one of [64, 128, 192, 256], but got )r   r  z*tile_k must be either 128 or 256, but got )TFz)swap_ab must be a boolean value, but got zBout_dtype must be either torch.bfloat16 or torch.float16, but got r   r   zMb.shape[0] must equal num_groups (m_indptr.shape[0] - 1), but got b.shape[0]=z, num_groups=r   z,a.shape[1] must equal k, but got a.shape[1]=z, k=r  r   zn must be a multiple of z, but got n=zk must be a multiple of z, but got k=zout.shape must be z
, but got zout.dtype must be T)
r   rT   rJ  rJ  r   rI  r  r   rq  r   )rZ   r[   r5  r6  r  r  r  r  r	  r
  r^   r   r   rl  r  align_nalign_kr  s                     r0   7_check_group_gemm_mxfp8_mxfp4_nt_groupwise_problem_sizer    st    	wu*E,=>>>QQQ
 
 	
 	w%+GagGGHHH}##SGMSSTTT}##SGMSSTTT~$$UX^UUVVVVJ&JJKKKU@@@AAA(((WvWWXXXZNfNNOOOm##NWNNOOO {I	I777\QZ\\
 
 	
 "Q&JwqzZ B\]\cde\f  B  Bu  B  B
 
 	
 	

A	
QA 	wqzQN171:NN1NN
 
 	
 GG7{aLGLLLLMMM7{aLGLLLLMMMQI
9	!!R)RRsyRRSSS9	!!R)RRsyRRSSS4r2   c                    t          dt          | j                  }t          dt          | j                  }|
|t          j        }n	||
j        }|j        d         }|j        d         dz  }| j        d         |f}|
t          j        ||| j                  }
t                      	                    ||| ||||
||||||||	           |
S )a  Perform group GEMM with MXFP4 data types using groupwise scaling. Currently only supported on NVIDIA
    Blackwell architecture.

    Parameters
    ----------
    a: torch.Tensor
        Row-major input tensor, shape ``(cum_m, k)``, data type is ``torch.float8_e4m3fn`` or ``torch.float8_e5m2``.
        ``cum_m`` is the cumulative sum of the segment lengths.

    b: torch.Tensor
        Column-major input tensor, shape ``(batch_size, n, k // 2)``, data type is ``torch.uint8``.

    a_scale: torch.Tensor
        Column-major scale tensor for a, shape ``(cum_m_padded, k // 32)``, data type is ``torch.uint8``.

    b_scale: torch.Tensor
        Row-major scale tensor for b, shape ``(batch_size, n_padded, k // 32)``, data type is ``torch.uint8``.

    m_indptr: torch.Tensor
        The indptr of the segment lengths, shape ``(batch_size + 1,)``, data type is ``torch.int32``.
        Element element in ``m_indptr`` must be a multiple of 4.

    mma_sm: int
        How many SMs to use for the MMA operation, must be 1 or 2.
        2 is faster when number of rows (M) per group is large (>= 256).

    tile_m: int
        The tile size for the M dimension, must be 128.

    tile_n: int
        The tile size for the N dimension, must be 64, 128, 192, or 256.

    tile_k: int
        The tile size for the K dimension, must be 128 or 256.

    swap_ab: bool
        Whether to swap the A and B tensors.

    out: Optional[torch.Tensor]
        The output tensor, shape ``(cum_m, n)``. If not specified, we will create an output tensor explicitly.

    out_dtype: Optional[torch.dtype]
        The data type of the output tensor, must be ``torch.bfloat16`` or ``torch.float16``.

    Returns
    -------
    out: torch.Tensor
        The output tensor, shape ``(cum_m, n)``.

    Notes
    -----
    Each value in ``m_indptr`` should be padded to a multiple of 4 before calling this function,
    to accommodate the kernel's requirement.
    +group_gemm_mxfp4_nt_groupwise_int_workspace-group_gemm_mxfp4_nt_groupwise_float_workspaceNr   r   r   r   )
r4   r   r;   rT   r   r   r   r   r   group_gemm_mxfp4_nt_groupwise)rZ   r[   r5  r6  r  r  r  r  r	  r
  r^   r   rw  r  rl  r  r  s                    r0   #group_gemm_mxfp8_mxfp4_nt_groupwiser    s    R *57Mqx  ,7	  {I	I	
A	
QAQI
{k)9QXFFF99				  " Jr2   c                    ddl m} | j        d         dz
  }| dd          | d d         z
  }|dz   |dz   dz  z
  }t          j        t          j        d|j        |j        	          |f          }|                    d|j        
          }t          j        | d         f| j        | j                  }t          j        | d         f| j        | j                  } ||f         | |||           ||fS )Nr   )compute_padding_mappingr   r   rP   r  r  )r   r   )r   r   r   )	r  r  r   rT   catr   r;   r   cumsum)r  r  r   rk  padded_m_indptrm_rankpadded_m_ranks          r0   pad_indptr_to_multiple_of_4r    s    655555"Q&Jx}$A	AQ!AiT!(!'!R!R!RTU VWWO%,,/:O,PPO[(2,xWWWFK	"x~ho  M +ZM*/6=   M))r2   c                  "    t                      } | S r,   r'   rj   s    r0   get_deepgemm_sm100_moduler    s    &((FMr2   	m_indicesc                     ddl m} |A|pt          j        }t          j        | j        d         |j        d         || j                  } || |f||f|||          S )Nr   )4_check_group_deepgemm_fp8_nt_contiguous_problem_sizer   r   )flashinfer.deep_gemmr"  rT   r   r   r   r;   )	rZ   r[   r5  r6  r   r  r^   r   r"  s	            r0   3_check_group_deepgemm_fp8_nt_groupwise_problem_sizer$    s          {/	k!'!*agaj	!(SSS??	
Gq'lC4I  r2   c                     ddl m} |A|pt          j        }t          j        | j        d         |j        d         || j                  } || |f||f|||           |S )a  Perform grouped matrix multiplication with FP8 data types using DeepGEMM backend.

    This function performs a grouped GEMM operation where each group in tensor `b` is multiplied
    with the corresponding rows in tensor `a`. The grouping is determined by the `m_indices` tensor,
    which specifies which group each row belongs to. This is particularly useful for scenarios
    like mixture of experts (MoE) where different tokens are routed to different experts.

    The operation can be conceptualized as:

    >>> for i in range(num_groups):
    >>>    row_slice = slice(i * m_per_group, (i + 1) * m_per_group)
    >>>    output[row_slice] = a[row_slice] @ b[i].T

    Currently only supported on NVIDIA Blackwell (SM100) architecture.

    Parameters
    ----------
    a : torch.Tensor
        Input tensor A of shape ``(m, k)`` with FP8 data type (``torch.float8_e4m3fn``).
        This tensor contains all rows that will be multiplied with different groups in `b`.

    b : torch.Tensor
        Input tensor B of shape ``(batch_size, n, k)`` with FP8 data type (``torch.float8_e4m3fn``).
        Each slice ``b[i]`` represents a different group/expert that will be multiplied with
        the corresponding rows in `a`.

    a_scale : torch.Tensor
        Scaling factors for tensor `a` of shape ``(m, k // block_size)`` with ``torch.float32`` dtype.
        These are typically generated from per-token quantization of the original float32 tensor.

    b_scale : torch.Tensor
        Scaling factors for tensor `b` of shape ``(batch_size, n // block_size, k // block_size)``
        with ``torch.float32`` dtype. These are typically generated from per-block quantization
        of the original float32 tensor for each group.

    m_indices : torch.Tensor
        Group assignment tensor of shape ``(m,)`` with ``torch.int32`` dtype. Each element
        specifies which group (index into `b`) the corresponding row in `a` belongs to.
        For example, if ``m_indices[i] = j``, then row ``i`` in `a` will be multiplied with
        group ``j`` in `b`.

    scale_granularity_mnk : Tuple[int, int, int], optional
        The granularity of the scaling factors as ``(m_granularity, n_granularity, k_granularity)``.
        Default is ``(1, 128, 128)`` which means per-token scaling for `a` and 128x128 block
        scaling for `b`.

    out : Optional[torch.Tensor], optional
        Pre-allocated output tensor of shape ``(m, n)``. If not provided, a new tensor will be
        created.

    out_dtype : Optional[torch.dtype], optional
        Data type of the output tensor. If `out` is provided, this parameter is ignored.
        Default is ``torch.bfloat16``.

    Returns
    -------
    torch.Tensor
        Output tensor of shape ``(m, n)`` containing the results of the grouped matrix multiplication.

    Examples
    --------
    >>> import torch
    >>> from flashinfer.gemm import group_deepgemm_fp8_nt_groupwise
    >>> from flashinfer.utils import per_token_cast_to_fp8, per_block_cast_to_fp8
    >>>
    >>> # Setup: 2 groups, 128 tokens per group, 4096 hidden size, 2048 expert size
    >>> m_per_group, n, k = 128, 2048, 4096
    >>> group_size = 2
    >>> m = m_per_group * group_size
    >>>
    >>> # Create float32 inputs
    >>> a_f32 = torch.randn(m, k, device="cuda", dtype=torch.float32)
    >>> b_f32 = torch.randn(group_size, n, k, device="cuda", dtype=torch.float32)
    >>>
    >>> # Quantize to FP8 with appropriate scaling
    >>> a_fp8, a_scale = per_token_cast_to_fp8(a_f32)
    >>> b_fp8 = torch.empty_like(b_f32, dtype=torch.float8_e4m3fn)
    >>> b_scale = torch.empty((group_size, n // 128, k // 128), device="cuda", dtype=torch.float32)
    >>> for i in range(group_size):
    ...     b_fp8[i], b_scale[i] = per_block_cast_to_fp8(b_f32[i])
    >>>
    >>> # Create group assignment
    >>> m_indices = torch.empty(m, device="cuda", dtype=torch.int32)
    >>> for i in range(group_size):
    ...     row_slice = slice(i * m_per_group, (i + 1) * m_per_group)
    ...     m_indices[row_slice] = i
    >>>
    >>> # Perform grouped GEMM
    >>> result = group_deepgemm_fp8_nt_groupwise(
    ...     a_fp8, b_fp8, a_scale, b_scale, m_indices, out_dtype=torch.bfloat16
    ... )
    >>> print(result.shape)  # torch.Size([256, 2048])

    Notes
    -----
    - This function requires NVIDIA Blackwell (SM100) architecture
    - The scaling factors should be generated using appropriate quantization functions
      like ``per_token_cast_to_fp8`` for `a` and ``per_block_cast_to_fp8`` for `b`
    - The function internally uses the DeepGEMM backend for optimized FP8 computation
    - All input tensors must be on the same CUDA device
    - The block size for scaling is determined by the ``scale_granularity_mnk`` parameter
    r   ) m_grouped_fp8_gemm_nt_contiguousNr   r   )r#  r&  rT   r   r   r   r;   )	rZ   r[   r5  r6  r   r  r^   r   r&  s	            r0   group_deepgemm_fp8_nt_groupwiser'    s    j FEEEEE
{/	k!'!*agaj	!(SSS$$	
Gq'lC4I   Jr2   masked_m
expected_mc	                     ddl m}	 |M|pt          j        }t          j        | j        d         | j        d         |j        d         || j                  } |	| |f||f||||          S )Nr   )0_check_m_grouped_fp8_gemm_nt_masked_problem_sizer   r   )r#  r+  rT   r   r   r   r;   )
rZ   r[   r5  r6  r(  r)  r  r^   r   r+  s
             r0   &_check_batch_deepgemm_fp8_nt_groupwiser,  A  s     VUUUUU
{/	kGAJ
AGAJi
 
 
 <;	
Gq'lC:?T  r2   c	                     ddl m}	 |M|pt          j        }t          j        | j        d         | j        d         |j        d         || j                  } |	| |f||f||||           |S )a  Perform batch matrix multiplication with FP8 data types using DeepGEMM backend.

    This function performs a batch GEMM operation where each group in tensor `b` is multiplied
    with the corresponding group of rows in tensor `a`. The results of each group is masked by
    the `masked_m` tensor, which specifies which group each row belongs to. This is particularly
    useful for scenarios like mixture of experts (MoE) where different tokens are routed to different experts.

    The operation can be conceptualized as:

    >>> for i in range(num_groups):
    >>>     output[i] = a[i][:masked_m[i]] @ b[i][:masked_m[i]].T

    Currently only supported on NVIDIA Blackwell (SM100) architecture.

    Parameters
    ----------
    a : torch.Tensor
        Input tensor A of shape ``(batch_size, m, k)`` with FP8 data type (``torch.float8_e4m3fn``).
        Each slice ``a[i]`` represents a group of rows that will be multiplied with
        the corresponding group/expert in `b`.

    b : torch.Tensor
        Input tensor B of shape ``(batch_size, n, k)`` with FP8 data type (``torch.float8_e4m3fn``).
        Each slice ``b[i]`` represents a different group/expert that will be multiplied with
        the corresponding rows in `a`.

    a_scale : torch.Tensor
        Scaling factors for tensor `a` of shape ``(batch_size, m, k // block_size)`` with ``torch.float32`` dtype.
        These are typically generated from per-token quantization of the original float32 tensor.

    b_scale : torch.Tensor
        Scaling factors for tensor `b` of shape ``(batch_size, n // block_size, k // block_size)``
        with ``torch.float32`` dtype. These are typically generated from per-block quantization
        of the original float32 tensor for each group.

    masked_m : torch.Tensor
        Masking tensor of shape ``(batch_size,)`` with ``torch.int32`` dtype. Each element
        specifies the effective rows to be multiplied in each group.
        For example, if ``masked_m[i] = j``, then first ``j`` rows in `a[i]` will be multiplied with
        group ``i`` in `b`.

    expected_m : int
        A value hint (which is a value on CPU) for the M expectation of each batch, correctly setting
        this value may lead to better performance.

    scale_granularity_mnk : Tuple[int, int, int], optional
        The granularity of the scaling factors as ``(m_granularity, n_granularity, k_granularity)``.
        Default is ``(1, 128, 128)`` which means per-token scaling for `a` and 128x128 block
        scaling for `b`.

    out : Optional[torch.Tensor], optional
        Pre-allocated output tensor of shape ``(batch_size, m, n)``. If not provided, a new tensor will be
        created.

    out_dtype : Optional[torch.dtype], optional
        Data type of the output tensor. If `out` is provided, this parameter is ignored.
        Default is ``torch.bfloat16``.

    Returns
    -------
    torch.Tensor
        Output tensor of shape ``(batch_size, m, n)`` containing the results of the batch matrix multiplication.

    Examples
    --------
    >>> import torch
    >>> from flashinfer.gemm import batch_deepgemm_fp8_nt_groupwise
    >>> from flashinfer.utils import per_token_cast_to_fp8, per_block_cast_to_fp8
    >>>
    >>> # Setup: 2 groups, 128 tokens per group, 4096 hidden size, 2048 expert size
    >>> m, n, k = 128, 2048, 4096
    >>> group_size = 2
    >>>
    >>> # Create float32 inputs
    >>> a = torch.rand((group_size, m, k), device="cuda", dtype=torch.float32)
    >>> b = torch.rand((group_size, n, k), device="cuda", dtype=torch.float32)
    >>> masked_m = torch.randint(0, m, (group_size,), device="cuda", dtype=torch.int32)
    >>> a_fp8 = torch.empty_like(a, device="cuda", dtype=torch.float8_e4m3fn)
    >>> a_scale = torch.empty((group_size, m, k // 128), device="cuda", dtype=torch.float32)
    >>> b_fp8 = torch.empty_like(b, device="cuda", dtype=torch.float8_e4m3fn)
    >>> b_scale = torch.empty(
    ...    (group_size, n // 128, k // 128), device="cuda", dtype=torch.float32
    >>> )
    >>> for i in range(group_size):
    ...    a_fp8[i], a_scale[i] = per_token_cast_to_fp8(a[i])
    ...    b_fp8[i], b_scale[i] = per_block_cast_to_fp8(b[i])
    >>>
    >>> expected_m = min(int(masked_m.float().mean()) + 1, m)
    >>>
    >>> # Perform batch GEMM
    >>> result = batch_deepgemm_fp8_nt_groupwise(
    ...     a_fp8, b_fp8, a_scale, b_scale, masked_m, expected_m, out_dtype=torch.bfloat16
    ... )
    >>> print(result.shape)  # torch.Size([2, 128, 2048])

    Notes
    -----
    - This function requires NVIDIA Blackwell (SM100) architecture
    - The scaling factors should be generated using appropriate quantization functions
      like ``per_token_cast_to_fp8`` for `a` and ``per_block_cast_to_fp8`` for `b`
    - The function internally uses the DeepGEMM backend for optimized FP8 computation
    - All input tensors must be on the same CUDA device
    - The block size for scaling is determined by the ``scale_granularity_mnk`` parameter
    r   )m_grouped_fp8_gemm_nt_maskedNr   r   )r#  r.  rT   r   r   r   r;   )
rZ   r[   r5  r6  r(  r)  r  r^   r   r.  s
             r0   batch_deepgemm_fp8_nt_groupwiser/  Z  s    p BAAAAA
{/	kGAJ
AGAJi
 
 
 ! 	
Gq'lC:?T   Jr2   c                      t                                                      } ddlm} t	          |j        dz  dz            }|                     |g           |                                 S )z4Get the FP8 block scale GEMM runner module for SM90.r   )envnv_internaltensorrt_llm)r*   r~   jitr1  r  FLASHINFER_CSRC_DIRset_deepgemm_jit_include_dirsinit)r`   jit_envdeepgemm_include_dirs      r0   #get_fp8_blockscale_gemm_runner_sm90r:    su     122AACCF$$$$$$#m3nD  ((*>)?@@@;;==r2   inputweightinput_scaleweight_scalec                 N   t          | j        ddg          st          d          | j        dk    rt          d| j                   |j        dk    rt          d|j                   | j        \  }}|j        \  }}	|	|k    rt          d| d|	           d	}
||
z  d
k    rt          d|
 d|           |dz  d
k    rt          d|           | j        t          j        k    }|j        t          j        k    }| j        t          j        k    }|j        t          j        k    }|r|rt          d          |rm|t          d          |j        t          j	        k    rt          d|j                   |j        | j        k    rt          d| j         d|j                   n*|st          d| j                   |t          d          |r|t          d          |||
z  f}||
z   dz
  |
z  ||
z  f}|j        |k    }|j        |k    }|s|st          d| d| d|j                   |j        t          j	        k    rt          d|j                   n*|st          d|j                   |t          d          ||j        ||fk    rt          d| d | d!|j                   |j        | j        k    rt          d"| j         d|j                   |j        t          j        t          j
        fvrt          d#|j                   |%|j        |k    rt          d$| d|j                   |j        }nW|pt          j        }|t          j        t          j
        fvrt          d#|           t          j        |||| j        %          }t                      }|                    |||          }d}|d
k    r;t          j        |t          j        | j        %          }|                    |           |                    | ||||           |S )&u;  
    Perform FP8 block-scaled GEMM with automatic swapAB optimization.
    This function automatically selects between normal and swapAB kernel based on
    the M dimension. For small M (< 32), it uses the swapAB kernel for
    better performance.

    Supported Dtype Combinations
    -----------------------------
    - **BF16 + BF16 → BF16**: Both inputs BF16, internal quantization (no scales needed)
    - **BF16 + FP8 → BF16**: BF16 input, FP8 weight
    - **FP8 + FP8 → BF16** (W8A8): Both inputs FP8 with scales required

    Parameters
    ----------
    input : torch.Tensor
        Input activation tensor of shape (M, K).
        - BF16 (torch.bfloat16) with internal quantization
    weight : torch.Tensor
        Weight tensor of shape (N, K). Can be:
        - FP8 (torch.float8_e4m3fn) with weight_scale required
        - BF16 (torch.bfloat16) for internal quantization
    input_scale : torch.Tensor, optional
    weight_scale : torch.Tensor, optional
        Scaling factors for weight. Required if weight is FP8.
    out : torch.Tensor, optional
        Output tensor of shape (M, N). If None, will be allocated.
    out_dtype : torch.dtype, optional
        Output data type. Default is torch.bfloat16.
    Returns
    -------
    torch.Tensor
        Output tensor of shape (M, N) with dtype `out_dtype`.
    Examples
    --------
    >>> import torch
    >>> from flashinfer.gemm import fp8_blockscale_gemm_sm90
    >>>
    >>> M, N, K = 16, 4096, 4096
    >>> device = "cuda"
    >>>
    >>> # BF16 inputs
    >>> input_bf16 = torch.randn(M, K, device=device, dtype=torch.bfloat16)
    >>> weight_bf16 = torch.randn(N, K, device=device, dtype=torch.bfloat16)
    >>> output = fp8_blockscale_gemm_sm90(input_bf16, weight_bf16)
    >>> print(output.shape)  # torch.Size([16, 4096])
    >>>
    >>> # Mixed: BF16 input + FP8 weight
    >>> from flashinfer.testing.utils import per_token_cast_to_fp8
    >>> input_bf16 = torch.randn(M, K, device=device, dtype=torch.bfloat16)
    >>> weight_bf16 = torch.randn(N, K, device=device, dtype=torch.bfloat16)
    >>> weight_fp8, weight_scale = per_token_cast_to_fp8(weight_bf16)
    >>> output = fp8_blockscale_gemm_sm90(input_bf16, weight_fp8, None, weight_scale)
    >>> print(output.shape)  # torch.Size([16, 4096])
    >>>
    >>> # FP8 weight with 128x128 block scales
    >>> from flashinfer.testing.utils import per_block_cast_to_fp8
    >>> weight_bf16 = torch.randn(N, K, device=device, dtype=torch.bfloat16)
    >>> weight_fp8, weight_scale = per_block_cast_to_fp8(weight_bf16)
    >>> # weight_scale has shape (N // 128, K // 128)
    >>> input_bf16 = torch.randn(M, K, device=device, dtype=torch.bfloat16)
    >>> output = fp8_blockscale_gemm_sm90(input_bf16, weight_fp8, None, weight_scale)
    >>> print(output.shape)  # torch.Size([16, 4096])
    Notes
    -----
    - This function requires NVIDIA Hopper (SM90) architecture and CUDA 12.8+
    - SwapAB kernel is automatically used when M < 32 (threshold)
    - For FP8 inputs, scaling factors must be provided
    - For BF16 inputs, quantization and scaling happen internally
    - Weight scales support two granularities:
      * Per-token (1x128 blocks): (N, K//128)
      * Per-block (128x128 blocks): (N//128, K//128)
    - Input scales only support per-token format: (M, K//128)
    - The function uses DeepGEMM backend with JIT compilation
    9090azIfp8_blockscale_gemm_sm90 is only supported on SM90 (Hopper) architecture.r   z#Input must be 2D (M, K), got shape z$Weight must be 2D (N, K), got shape z"K dimension mismatch: input has K=z, weight has K=r   r   z-K dimension must be divisible by block size (z	), got K=r  z+N dimension must be divisible by 64, got N=zJFP8 input + BF16 weight is not supported (missing kernel implementation). Nz+input_scale is required when input is FP8. z!input_scale must be float32, got z&input_scale device mismatch. Expected r   zMInput must be either FP8 (torch.float8_e4m3fn) or BF16 (torch.bfloat16), got zjinput_scale should not be provided for BF16 inputs. Use FP8 inputs if you want to provide external scales.z-weight_scale is required when weight is FP8. r   z-weight_scale shape mismatch. Expected either z (per-token, 1x128 blocks) or z" (per-block, 128x128 blocks), got z"weight_scale must be float32, got zNWeight must be either FP8 (torch.float8_e4m3fn) or BF16 (torch.bfloat16), got zmweight_scale should not be provided for BF16 weights. Use FP8 weights if you want to provide external scales.z!Output shape mismatch. Expected (z, z), got r   z:Output dtype must be torch.bfloat16 or torch.float16, got r   r   )rB   r;   r   r  r   r   rT   rJ  r   r  rq  r   r:  r,  rI  configure_workspacerun_gemm)r;  r<  r=  r>  r^   r   Mr  NK_weight
BLOCK_SIZEinput_is_fp8weight_is_fp8input_is_bf16weight_is_bf16expected_per_token_shapeexpected_per_block_shapeis_per_tokenis_per_blockr2  r2  rD  s                         r0   fp8_blockscale_gemm_sm90rP    s   h U\D%=99 
W
 
 	

 zQLu{LLMMM{aNNNOOO;DAq,KAx1}}MMM8MM
 
 	

 J:~TJTTQRTT
 
 	
 	2v{{JqJJKKK ;%"55LLE$77MK5>1M\U^3N  
 
X
 
 	

  JKKK--TARTTUUU--, , ,"), ,   .  	%{% %   "I  
  LMMM$%qJ#7 %&^a%7J$FZ#X #)-EE#)-EE 	 	I@X I I0HI I4@4FI I  
 ..V,BTVVWWW /  	&|& &   #J   9ANANNNN39NN   :%%TELTT
TT   9U^U];;;XSYXX    SY)%;%;O9OOCIOO   I		 /	U^U];;;XYXX   k!QiEEE 122F ..q!Q77NIKek%,WWW	""9---
OOE63\BBBJr2   rk  rl  r  c                     d}d}d } || |          |z  } |||          |z  } | |||          |          |z  }	|||	fS )zDCalculate block scale dimensions using indestructible block formula.r   r  c                     | |z   dz
  |z  S r   rL   )rZ   r[   s     r0   div_upz+_calculate_block_scale_dims.<locals>.div_up  s    A	ar2   rL   )
rk  rl  r  r  INDESTRUCTIBLE_128x4_BLOCK_M_NINDESTRUCTIBLE_128x4_BLOCK_KrS  block_scale_dim_mblock_scale_dim_nblock_scale_dim_ks
             r0   _calculate_block_scale_dimsrY    s     &)"#$       	q0114RR  	q0114RR  	vva$$&BCC
&	' 
 /1BBBr2   c	                    t          |           dk    rt          d|            t          |          dk    rt          d|           |t          j        j        t          j        j        fvrt          d|           |t          j        j        t          j        j        fvrt          d|           |t          j        j        t          j        j        fvrt          d|           | d         }	| d         }
| d	         }|d	         }t          |
|||          \  }}}|	||f}||z  |df}|	||f}||z  d|f}t          j        j	        }t          j                            |          }t          j        t          |                    5 \  }}|                    d
t!          |           t!          |          |          }|                    dt!          |          t!          |          |          }|                    d|||t          j        j                  }|                    d|||t          j        j                  }|                    ||d|gd          }|                    t          j        j                   |                    |||dgd          }|                    t          j        j                   |                    ||t          j        j        d          }|                    t          j        j                   |                    d                              |           |}|                    t2          j        j                   |                    t2          j        j                   |                    t2          j        j                   |                    t2          j        j                   |                    t2          j        j                   |                                  |!                                 |"                    t          j#        j$        t          j#        j%        g           |cd d d            S # 1 swxY w Y   d S )Nr  zA shape must be 3D, got zB shape must be 3D, got z)A type must be FP8_E4M3 or FP8_E5M2, got z)B type must be FP8_E4M3 or FP8_E5M2, got z&Output type must be BF16 or FP16, got r   r   r   rZ   r  r[   r  r  r  r  r  r  r  r  T)&rX  r   r7  r  r  rK  rH  rI  rY  r  rT   rU   r  r  r  r  rZ  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r	  r   r   ) r  r  r;  r  r  r<  r  r  r;   b_dimrk  r  rl  rV  rW  rX  r  r  r  r  r  r  r  r!  r  r  r  r  r  r  r  r  s                                    r0   'create_cudnn_execution_plans_mxfp8_gemmr\    s    7||q=G==>>>
7||q=G==>>>eo.0HIIIMVMMNNNeo.0HIIIMVMMNNNeo.0DEEEJ&JJKKK AJE
A
A
A 	$Aq!Z88 <(*; /1BCO--	 /1BCO--	 )JZ&&v..F	&v..	/	/ C:E1g??	 & 
 
 g??	 & 
 
 (-||"# !3< (4 (
 (
$ (-||"# !3< (4 (
 (
$ !77(:	 8 
 
 	&&u'<=== 77("A	 8 
 
 	&&u'<=== <<#o3	   
 
 	u4555 	D!!//777'tz/000tz/000$,,T-E-KLLL$,,T-E-KLLL$$TZ%5666##%%%$$eo&79J%KLLLGC C C C C C C C C C C C C C C C C Cs   KQQ!$Q!r  c                    t          | j        |                                 |j        |                                t          | j                  t          |j                  t          |          || j        	  	        }|                                 |dk    r|                    |           n|                                 |S )N)	r  r  r  r  r;  r<  r  r  r;   rP   )	r\  r   r  rL  r   r;   r#  r$  r%  )rZ   r[   r   r^   r  rQ   r  s          r0   _get_cudnn_mxfp8_gemm_graphr^  a  s     4217;;217;;29==x
 
 
E 
||!!&))))Lr2   c           
      b    d}t          | |||||          }	t          |	| ||||||           d S )Nr  )rZ   r[   r   r^   r  rQ   )r  rZ   r[   rL  rM  r  r_   rQ   )r^  r3  )
rZ   r[   rL  rM  r   r^   r_   rQ   r  r  s
             r0   _cudnn_gemm_mxfp8r`  }  sl     J (

  E #

)	 	 	 	 	 	r2   c                  <     G d dt                     }  |             S )Nc            	           e Zd Zdeej                 dedee         fdZ	 	 ddeej                 dede	dej        fd	Z
d
S )6_cudnn_gemm_mxfp8_runner.<locals>.CudnnMxfp8GemmRunnerrG   rH   rI   c                     dgS rK   rL   rM   s      r0   rO   zH_cudnn_gemm_mxfp8_runner.<locals>.CudnnMxfp8GemmRunner.get_valid_tactics  s     3Jr2   rP   FrQ   rR   c           
      P    |\  }}}}}	}
t          |||||	|	j        |
|           |	S )N)rZ   r[   rL  rM  r^   r   r_   rQ   )r`  r   rU  s              r0   ra   z>_cudnn_gemm_mxfp8_runner.<locals>.CudnnMxfp8GemmRunner.forward  sP     =C9Aq'7C)9!!)!1	 	 	 	 Jr2   Nrb   rc   rL   r2   r0   CudnnMxfp8GemmRunnerrc    s        	&	 )	 #Y		 	 	 	 #(		 	&	 	 !		 \	 	 	 	 	 	r2   rf  rl   )rf  s    r0   _cudnn_gemm_mxfp8_runnerrg    s=        }   <  !!!r2   c                     g }d|v r!|                     t                                 |s
J d            t          j                    }| |||||g}	|                    d|t
          |	          \  }
} |
|	|           d S )Nr7  r(  
mxfp8_gemmr)  )r   rg  r   r,  r-  r9  r:  s               r0   mxfp8_gemm_sm100rj    s     G,/11222/////7MOOEGWc+;<F%%%	 NFF F&((((((r2   c                 "    t                       dS r   r  r  s          r0   _cudnn_bmm_mxfp8_requirementrl    r  r2   c                 `    | t           j        t           j        fvrt          d|  d          dS )r  r  zP. Only torch.bfloat16 and torch.float16 are supported for MXFP8 GEMM operations.Nr  r  s    r0   _validate_mxfp8_output_dtypern    sJ    U^U]333^ ^ ^ ^
 
 	
 43r2   c                    | j         dk    s|j         dk    rt          d| j        d|j                  | j        d         |j        d         k    rt          d| j        d|j                  t          |           dS )	Nr  z*bmm_mxfp8 accepts 3d tensors, got A.shape=z and B.shape=r   r   z?K dimension (last dim of A) mismatch in bmm_mxfp8. got A.shape=z
, B.shape=T)r  r   r   rn  r  s          r0   _check_bmm_mxfp8_problem_sizerp    s     	v{{afkkWagWWQWWWXXXwqzQWQZ\ag\\RSRY\\
 
 	
 !'''4r2   c                 J    g }t           rd| v r|                    d           |S Nr7  )r  r   )	r   r   r   r  r  r   r^   r   r   s	            r0   _heuristic_func_bmm_mxfp8rs  	  s7      +7&777!!'***r2   c           	      H   |dk    rt          d|           t          st          d          |@t          j        | j        d         | j        d         |j        d         f| j        |          }t          d	t          | j                  }t          | |||||dg           |S )
a  BMM MXFP8

    Parameters
    ----------
    A: torch.Tensor
        Input tensor, shape (b, m, k), fp8 e4m3 or fp8 e5m2.

    B: torch.Tensor
        Mat2 tensor, shape (b, k, n), should be column major, fp8 e4m3 or fp8 e5m2.

    A_scale: torch.Tensor
        Scale tensor for A, uint8 (fp8 e8m0 format).

    B_scale: torch.Tensor
        Scale tensor for B, uint8 (fp8 e8m0 format).

    dtype: torch.dtype
        out dtype, bf16 or fp16.

    out: Optional[torch.Tensor]
        Out tensor, shape (b, m, n), bf16 or fp16, defaults to ``None``.

    backend: Literal["cudnn"]
        The backend to use for the operation. Defaults to ``"cudnn"``.

    Returns
    -------
    out: torch.Tensor
        Out tensor, shape (b, m, n), bf16 or fp16.
    r7  zInvalid backend: zcudnn is not availableNr   r   r   r   bmm_mxfp8_workspace)	r   r  rT   r   r   r;   r4   r   rj  )r   r   r  r  r   r^   r   r_   s           r0   	bmm_mxfp8rv    s    ` '6W66777 31222
{kWQZQWQZ08
 
 
 &5qx  Q7GS2BWINNNJr2   )FNr,   )rP   )Nr6  )Nr   r  NNr   )r   r   NN)r  r   r   NN)r   r   r   r   TNN)r  NNr  rr  )r  	functoolsenumr   typesr   typingr   r   r   r	   "flashinfer.trtllm_low_latency_gemmr
   rT   api_loggingr   	autotunerr   r   r   r   r   r   fused_moe.utilsr   r   utilsr   r   r   r   r   r   r   r   jit.gemmr   r   r   r    r!   r"   r#   r$   r%   r&   r(   jit.cpp_extr)   r*   r  r7  r  OSErrorr  r  lowerr-   anyis_lib_missingjit.cubin_loaderr3   r4   r5   r6   r7   r8   r9   r:   r   r  r;   r  rB   cacher   r   rg   r   ri   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r9  r$  r.  r   r;  rQ  rT  rW  rh   r\  r*  ru  r  r  r  r  r  r  r  r  r  rU   Streamr  r  r   r!  r&  r0  r3  rB  rF  rL  rO  r8  r`  rf  rm  r{  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rW   r  r  r  r   r  r  r  r  r  r  r  r  r  r  r$  r'  r,  r/  r:  rP  rY  r\  r^  r`  rg  rj  rl  rn  rp  rs  rv  rL   r2   r0   <module>r     s-               ! ! ! ! ! ! 1 1 1 1 1 1 1 1 1 1 1 1 F F F F F F  ( ( ( ( ( (                      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 , + + + + + & & & & & & , , , , , , , , , , , , 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 9 9 9 9 9 9 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 * * * * * * : : : : : : 
LLLOO 	 	 	D   AISEE_EEEEEN      2 1 1 1 1 1                  *  -T )%el %S	 % % % % Q Q Qh sCj)) #'"^#'16 || 
%,	 {	
 5<
  
 -.   *). sCj)) #'"^#'16 || 
%,	 {	
 5<
  
 -.   *)& $("&"^16 || 5<
  
	
 
%,	 { -.   < $("&"^16 Cy| | 5<
 	
 
 
%,	 { -.   , /$  -*    $("&"^16c c|c|c 5<
 c 
	c
 
%,	c {c -.c \c c c  cL sCj)) #'"^"+	 	|	|	 
%,		 {		
 Y	 	 	 *)	 #'"^"+ || 
%,	 {	
 Y   . #'"^"+ Cy| | 
%,		
 { Y    0 .+    #'"^"+E E|E|E 
%,	E {	E
 YE \E E E  EP   
   
 f f fR    $ $ $N !-2$		
 	
 	((	
 	
! ! ! & ! ! !H ".2$		
 	
 	((	
 	
" " " &)|)|) ,) 
	)
 
) l) s)) 
) ) ) )>)|)|) \) \	)
 
) l) s)) 
) ) ) )@/S /c / / / /d      FF F F F U1 1;16:1 1 1 1h 
 "&c0 c0|c0|c0 ,c0 
	c0
 
%,	c0 \c0 c0 c0 c0L < < <L .2; ;|;\; |; 	;
 ; ; U\*; ; ; ;J .2; ;|;\; |; 	;
 ; ; U\*; ; ; ;|~ ~ ~ ~ ~ ~ ~ ~B
 
 
 
 
4 
 
 

 
 
  B
 
 
 ej/    
ek 
 
 
 

u{ 
 
 
 
 a a aH   ( ( ( ( ( (h "
 "
 "
 "
 "
 "
Z  
  
  
  
  
  
F M M M`@ @ @,
8u{ 
8 
8 
8 
8|| | \	
 \ 
%,	 [   8     03 3 30A A A:  %)"^"&-Ak k|k|k EL!k {	k
 
%,	k )*k k k kf %)"^"&, ,|,|, |, |	,
 EL!, {, 
%,	, , , , , , ,h %)"^"&%) || | |	
 EL! { 
%,	   l    @G  G  G ^ %)"^"&#=C5 5|5|5 |5 |	5
 EL!5 {5 
%,	5 5 5 9:5 5 5 5 5p 77788 %)"^"&#=C9 9|9|9 |9 |	9
 EL!9 {9 
%,	9 9 9 9:9 9 9 9 989x sCj)) %)"^"&#=C || | |	
 EL! { 
%,	   9:    *). 77788 %)"^"&#=C || | |	
 EL! { 
%,	   9:    984 %)"^"&#=D$E $ECy$E|$E |$E |	$E
 |$E EL!$E {$E 
%,	$E $E $E 9:$E $E $E $E $EN" " " )L2$		
 	
 	33	
 	

 	''	
 	
   0 +l2$		
 	
 	55	
 	

 	''	
 	
   0 ,.0 
 ,)    %)"^"&#=CI I|I|I |I |	I
 EL!I {I 
%,	I I I 9:I I \I I I  IX :::;; #'=E
 
|
|
 \
 \	

 ;
 
%,	
 9:
 
 
 <;
 :::;; #'=E	 	|	|	 \	 \		
 ;	 
%,		 9:	 	 	 <;	 77788 #'=E || \ \	
 ; 
%,	 9:   98( #'=E
 
|
|
 \
 \	

 ;
 
%,	
 9:
 
 
 
( #'=E Cy| | \	
 \ ; 
%,	 9:   < +-/ 
 -*    #'=EX X|X|X \X \	X
 ;X 
%,	X 9:X \X X X  Xv 22233 6:2?"&'+,5 || \ \	
 wy12  !c3/ 
%,	 $ ()   43$ sCj)) 6:2?"&'+,5 || \ \	
 wy12  !c3/ 
%,	 $ ()   *)2 6:2?"&'+,5 || \ \	
 wy12  !c3/ 
%,	 $ ()   8 =;  ;    6:2?"&'+,5J J|J|J \J \	J
 wy12J J !c3/J 
%,	J $J ()J \J J J  JZ Q Q Qh 22233 6:"&'+$ $|$|$ \$ \	$
 wy12$ $ 
%,	$ $$ $ $ 43$N <    6:"&'+ || \ \	
 wy12  
%,	 $ \   	 
8 22233 3@+/"&'+A A|A|A \A \	A
 lA !c3/A i(A A 
%,	A $A A A 43AH @    3@+/"&'+u u|u|u \u \	u
 lu !c3/u i(u u 
%,	u $u \u u u 	 
up 77788 "&'+O O|O|O \O \	O
 lO O O O O O 
%,	O $O O O 98Od H    "&'+l l|l|l \l \	l
 ll l l l l l 
%,	l $l \l l l 	 
l` !D *l* * * *.   
 sCj)) 3@"&'+ || \ \	
 | !c3/ 
%,	 $ 
   *). D    3@"&'+z z|z|z \z \	z
 |z !c3/z 
%,	z $z z z 	 
zz sCj)) 3@"&'+ || \ \	
 l  !c3/ 
%,	 $ 
   *)0 7    3@"&'+ || \ \	
 l  !c3/ 
%,	 $   	 
D 	 	 	  +/+/"&'+V V<VLV %,'V 5<(	V
 
%,	V $V \V V V VrC
CCC(+C
3S=C C C C0 } } }F #^"& || { 
%,		
     B #^"&%)   | |  |  |	 
 {  
%,	  l         F" " "D)|)|) \) \	)
 
) l) s)) 
) ) ) )4 sCj)) #' '
 
|
|
 \
 \	

 ;
 
%,	
 W
 
 
 *)

 
 
 
 
 #' ' || \ \	
 ; 
%,	 W   : #' ' Cy| | \	
 \ ; 
%,	 W     - /,    #' ': :|:|: \: \	:
 ;: 
%,	: W: \: : :  : : :s   ,B3 3C=:C=>5C88C=