
    )`i/             %       f   d dl mZmZmZmZ d dlmc mZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmc mZ d dlmc mZ d dlZd dlZd dlmZ d dlmZmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d d	l/m0Z0 d d
l1m2Z2 ddlm3Z3m4Z4m5Z5m6Z6 d dl m7Z7m8Z8 dZ9e(dddde%de"de$de%fd            Z:e(dddde%de"de$fd            Z;e(dddde#de"de"fd            Z< G d d          Z= G d d          Z>	  G d d          Z?ej@        dejA        dejA        fd             ZBej@        dejA        fd!            ZCd" ZD G d# d$          ZEejF        d%eGd&eGd'eGd(eGd)eHd*eHd+eHd,ee	jI                 d-ee	jI                 d.ee	jI                 d/eee	jI                          d0eGd1eeGeGf         d2eeGeGf         d3eGd4eHd5eJde7f$d6            ZKe0ddd7d8eejA        ejA        f         d9eejA        ejA        f         d:ejA        d;ejA        d,eHd-eHd.eHd0eGd<eejA                 d3eeG         fd=            ZLdS )>    )OptionalTupleTypeUnionN)ir)cpasynctcgen05)from_dlpack)	Int32Int64Uint8Uint64TIntegerdsl_user_opextract_mlir_valuesnew_from_mlir_values)llvm)get_compute_capability)flashinfer_api)WorkTileInfo   )get_cutlass_dtypecutlass_to_torch_dtype
get_num_smmake_ptr)CallableList   locipobjindexvaluereturnc                x    | d|dz  z   z  } | ||dz  z  z  } t          | t                    sJ d|             | S )N      zobj=)
isinstancer   )r#   r$   r%   r!   r"   s        x/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/cute_dsl/blockscaled_gemm.py	with_byter,   C   sT    Teai !!C5UQYCc6""--IsII--"J    c                H    | |dz  z	  dz                       t                    S )Nr)   r(   )tor   )r#   r$   r!   r"   s       r+   	read_byter0   K   s$    UQY4'++E222r-   addrc                   t          t          j        t          j                    |                     ||          t          |                              ||          gddddt          j        j                            S )Nr    z-atom.add.release.gpu.global.s32 $0, [$1], $2;z=r,l,rTF)has_side_effectsis_align_stackasm_dialect)r   r   
inline_asmr   i32ir_value
AsmDialectAD_ATT)r1   r%   r!   r"   s       r+   atomic_add_release_globalr;   P   s{    EGG#"--e%%#"%55 <! .	
 	
 	
  r-   c                       e Zd Zddddej        deej                 dej        deeef         dej	        f
dZ
d	 Zd
 Zeddddedeeeef         fd            ZdS )MaskedSchedulerParamsNr    masked_mdst_signalscc_tilercluster_shape_mnkc                   |d         dk    rt          d|d                    t          j        ||          }|d         j        }	|| _        || _        || _        || _        |	| _        || _	        |d d         | _
        || _        t          j        t          j        | j        |d d         ||          ||          | _        d S )N   r   zunsupported cluster_shape_k )tiler)r   NNNr    )
ValueErrorcutezipped_divideshaper>   r?   r@   rA   problem_shape_ntile_mnl_cluster_shape_mnkcluster_shape_mn_locmake_layoutceil_divproblem_layout_ncluster_mnl)
selfr>   r?   r@   rA   rB   r!   r"   gcrK   s
             r+   __init__zMaskedSchedulerParams.__init__c   s     Q1$$R<Ma<PRRSSS111"$%<"="C &'>$"3 1"1" 5	+/+;M,.?.CQS   ,
 ,
 ,
(((r-   c                     g g c}| _         | j        | j        | j        | j        | j        fD ]=}t          |          }||z  }| j                             t          |                     >|S N)	_values_posr>   r?   r@   rA   rL   r   appendlen)rR   valuesr#   
obj_valuess       r+   __extract_mlir_values__z-MaskedSchedulerParams.__extract_mlir_values__   s{    #%r  MFL#
 		5 		5C -S11Jj F##C
OO4444r-   c           	      "   g }t          | j        | j        | j        | j        | j        g| j        d          D ]:\  }}|                    t          ||d |                              ||d          };t          t          |          d| j        iS )NTstrictr!   )zipr>   r?   r@   rA   rL   rW   rX   r   r=   tuplerN   )rR   rZ   obj_listr#   n_itemss        r+   __new_from_mlir_values__z.MaskedSchedulerParams.__new_from_mlir_values__   s     ' 

 

 

 	& 	&LC OO0fXgX6FGGHHHGHH%FF$uXGTYGGGr-   max_active_clustersr&   c                    |}g | j         |R S rV   )rM   )rR   re   r!   r"   num_persistent_clusterss        r+   get_grid_shapez$MaskedSchedulerParams.get_grid_shape   s!     #6@&@(?@@@r-   )__name__
__module____qualname__rH   Tensorr   Pointerr   intShaperT   r\   rd   r   r   r   rh    r-   r+   r=   r=   b   s          
  
  
+ 
 dl+ 
 ;	 

 sCx 
  : 
  
  
  
D  H H H" 15$A A A#(A	w(	)A A A [A A Ar-   r=   c                   z   e Zd Zdedededededej        defdZd	ee	j
                 fd
Zdee	j
                 d	d fdZeeddddedeeeef         deeeef         fd                        Zeddddeded	eeeef         fd            Zej        	 ddedee         dee         dee         d	eeee         f         f
d            Ze	 	 	 d ddddee         dee         dee         d	eeee         f         fd            Zedddd	efd            Zedddddefd            Zed	efd            ZdS )!MaskedSchedulerparamsrg   current_work_linear_idxcurrent_batch_idxaccum_tile_mcta_id_in_clusternum_tiles_executedc                 h    || _         || _        || _        || _        || _        || _        || _        d S rV   )rs   rg   _current_work_linear_idx_current_batch_idx_accum_tile_mrw   _num_tiles_executed)rR   rs   rg   rt   ru   rv   rw   rx   s           r+   rT   zMaskedScheduler.__init__   sA     '>$(?%"3)!2#5   r-   r&   c                    t          | j                  }|                    t          | j                             |                    t          | j                             |                    t          | j                             |                    t          | j                             |                    t          | j                             |S rV   )r   rg   extendrz   r{   r|   rw   r}   )rR   rZ   s     r+   r\   z'MaskedScheduler.__extract_mlir_values__   s    $T%ABB)$*GHHIII)$*ABBCCC)$*<==>>>)$*@AABBB)$*BCCDDDr-   rZ   c           	         t          |          dk    sJ t          | j        |d         g          }t          | j        |d         g          }t          | j        |d         g          }t          | j        |d         g          }t          | j        |dd                   }t          | j        |d         g          }t          | j	        ||||||          S )Nr)   r   r   rD      r      )
rY   r   rg   rz   r{   r|   rw   r}   rr   rs   )rR   rZ   new_num_persistent_clustersnew_current_work_linear_idxnew_current_batch_idxnew_accum_tile_mnew_cta_id_in_clusternew_num_tiles_executeds           r+   rd   z(MaskedScheduler.__new_from_mlir_values__   s    6{{a&:(6!9+'
 '
# ';)F1I;'
 '
# !5#fQi[!
 !
 00BVAYKPP 4"F1Q3K!
 !
 "6$vayk"
 "
 K''!!"
 
 	
r-   Nr    	block_idxgrid_dimc          	         | } t          j        |||          t          j        | j        ||          z  }|\  }}}t          |          }	t          d          }
t          d          }t          || j        d         z            t          || j        d         z            t          d          f}t          d          }t	          | ||	|
|||          S )Nr    r   r   )rH   sizerM   r   rr   )rs   r   r   r!   r"   rg   bidxbidybidzrt   ru   rv   rw   rx   s                 r+   createzMaskedScheduler.create   s      #')H#""E"E"E#J
 J
 J
 #
 %dD #(++!!HHQxx $03344$03344!HH
 #1XX##
 
 	
r-   re   c                2    |                      |||          S Nr    )rh   )rs   re   r!   r"   s       r+   rh   zMaskedScheduler.get_grid_shape  s      $$%8cb$IIIr-   dsm_pending_packeddsm_counternum_c_stagec                    | j         j        d         }| j        }| j        }|t	          j        | j         j        |         | j         j        d                   z   |z  |k    r|| j         j        j        d         k     rt          j
        |d uo| j         j        d u          rt          ||||dz
  z             }|t	          j        | j         j        |         | j         j        d                   z  }|t          d          z  }|t	          j        | j         j        |         | j         j        d                   z   |z  |k    r|| j         j        j        d         k     || _        || _        | j        | j         j        j        d         k     }|rI| j        t	          j        | j         j        | j                 | j         j        d                   z   |z  |k    }||z  | j        z
  ||z  | j        f}	t          d t          |	| j        g | j         j        t          d          R d          D                       }
t%          |
|          |fS )Nr   r   )r$   r%   c              3   |   K   | ]7\  }}}t          |          t          |          z  t          |          z   V  8d S rV   )r   ).0xyzs       r+   	<genexpr>zCMaskedScheduler._get_current_work_for_linear_idx.<locals>.<genexpr>[  sX       
 
1a !HHuQxx%((*
 
 
 
 
 
r-   Tr^   )rs   rK   r|   r{   rH   rP   r>   rA   rJ   cutlass
const_exprr?   r,   r   ra   r`   rw   rM   r   )rR   rt   r   r   r   num_tiles_nrv   	batch_idxis_validcur_cluster_coordcur_tile_coords              r+    _get_current_work_for_linear_idxz0MaskedScheduler._get_current_work_for_linear_idx  s    k9!<)+	 - 4Y ?ATUVAWXXY 	
 '' ' DK06q999!#4/ :[,D8   &/&#%q9& & &" DM$Y/1DQ1G  L q!I) - 4Y ?ATUVAWXXY 	
 '' ' DK06q999" *"+*T[-A-G-JJ 	6"-K()@AK'* 
  66H ${2T5GG#k1#
  
 
!&9$+.9a99	  
 
 
 
 
 NH557IIIr-   c                >    |                      | j        |||          S )Nr   r   r   )r   rz   )rR   r   r   r   r!   r"   s         r+   get_current_workz MaskedScheduler.get_current_workg  s0     44)1##	 5 
 
 	
r-   c                :    |                      ||          \  }}|S r   )r   )rR   r!   r"   	tile_info_s        r+   initial_work_tile_infoz&MaskedScheduler.initial_work_tile_infox  s$    ,,,<<	1r-   r   )advance_countr!   r"   r   c                    | xj         t          |          t          | j                  z  z  c_         | xj        t          d          z  c_        d S Nr   )rz   r   rg   r}   )rR   r   r!   r"   s       r+   advance_to_next_workz$MaskedScheduler.advance_to_next_work}  s[    %%})=)=(A
 A
 *
 	
%% 	  E!HH,    r-   c                     | j         S rV   )r}   rR   s    r+   rx   z"MaskedScheduler.num_tiles_executed  s    ''r-   rV   rF   )ri   rj   rk   r=   r   rH   CoordrT   listr   Valuer\   rd   r   staticmethodr   r   r   rh   jitr   r   r   rn   r   r   r   r   r   propertyrx   rp   r-   r+   rr   rr      s.       6%6 "'6 "'	6
 !6 6  :6 "6 6 6 6$bh    
tBH~ 
BS 
 
 
 
:  '
 '
 '
%'
'723'
 '12'
 '
 '
 \ ['
T 
 J J J%J"J 
w(	)J J J \J 
X &*GJ GJ!&GJ %V,GJ e_	GJ
 c]GJ 
|Xf--	.GJ GJ GJ XGJR  04'+%)	
 
 
 
$V,
 e_
 c]	
 
|Xf--	.
 
 
 [
  ,0T   l    [ ;<$4 - - -S - - - [- (E ( ( ( X( ( (r-   rr   c            -       f   e Zd ZdZdedeeef         deeef         defdZd Ze	j
        de	j        d	e	j        d
e	j        de	j        de	j        de	j        dee	j                 dee	j                 dej        dej        fd            Ze	j        de	j        de	j        de	j        de	j        de	j        de	j        de	j        de	j        de	j        de	j        dee	j                 de	j        dee	j                 d e	j        d!e	j        d"e	j        d#e	j        d$e	j        d%e	j        d&ee	j        e	j        d'f         d(e	j        d)ef,d*            Zd+e	j        d,e	j        d-ee	j        e	j        e	j        f         fd.Zd/ej        d0e	j        d1e	j        d(e	j        d2eej        ef         d-ee	j        e	j        e	j        f         fd3Z d4e	j        d5e	j        d/ej        d6e	j        d-ee	j        e	j        e	j        f         f
d7Z!d/ej        d8ee	j        e	j        f         d1e	j        d(e	j        d6e	j        d-ee	j        e	j        e	j        f         fd9Z"e#de	j        d:eeeef         d;e$ej%                 d<e&j'        d=e$ej%                 d>e&j'        d(e	j        d?e$ej%                 d@e(j)        dAe$ej%                 dedBedCed-eeeef         fdD            Z*e#de	j        dee	j                 dEe	j        dFeeeef         deeef         dej        d-eeeeeef         f         fdG            Z+e#dHe$ej%                 dAe$ej%                 ded?e$ej%                 d-ef
dI            Z,e#dHe$ej%                 d?e$ej%                 dJedKedLed-efdM            Z-e#deeef         deeef         d-efdN            Z.e#dOedPedQedRedHe$ej%                 d?e$ej%                 dJedKedLed-efdS            Z/e#dHe$ej%                 dAe$ej%                 ded?e$ej%                 deeef         deeef         dOedPedQedRedJedKedLed-efdT            Z0d'S )U)Sm100BlockScaledPersistentDenseGemmKernelak  This class implements batched matrix multiplication (C = A x SFA x B x SFB) with support for various data types
    and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.

    :param sf_vec_size: Scalefactor vector size.
    :type sf_vec_size: int
    :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N)
    :type mma_tiler_mn: Tuple[int, int]
    :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing
    :type cluster_shape_mn: Tuple[int, int]

    :note: In current version, A and B tensor must have the same data type
        - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported

    :note: Supported combinations of A/B data types, SF data typs and SF vector size:
        - MXF8: A/B: Float8E5M2/Float8E4M3FN + SF: Float8E8M0FNU + sf_vec_size: 32
        - MXF4: A/B: Float4E2M1FN + SF: Float8E8M0FNU + sf_vec_size: 32
        - NVF4: A/B: Float4E2M1FN + SF: Float8E8M0FNU/Float8E4M3FN + sf_vec_size: 16

    :note: Supported accumulator data types:
        - Float32

    :note: Supported C data types:
        - Float32
        - Float16/BFloat16
        - Float8E4M3FN/Float8E5M2
    :note: Constraints:
        - MMA tiler M must be 128 or 256 (use_2cta_instrs)
        - MMA tiler N must be 128/256
        - Cluster shape M must be multiple of 2 if Mma tiler M is 256
        - Cluster shape M/N must be positive and power of 2, total cluster size <= 16
        - Also, Cluster shape M/N must be <= 4 for scale factor multicasts due to limited size of scale factors

    Example:
        >>> gemm = Sm100BlockScaledPersistentDenseGemmKernel(
        ...     sf_vec_size=16,
        ...     mma_tiler_mn=(256, 128),
        ...     cluster_shape_mn=(2, 1)
        ... )
        >>> gemm(a_tensor, b_tensor, sfa_tensor, sfb_tensor, c_tensor, max_active_clusters, stream)
    sf_vec_sizemma_tiler_mnrM   
sm_versionc                    ddg}||v sJ | d|             t           j        | _        || _        |d         dk    | _        || _        g |dR | _        | j        rt          j        j	        nt          j        j
        | _        d| _        d| _        d| _        d	| _        d
t!          | j        | j        g| j        R           z  | _        d| _        d| _        d| _        t+          j        |          | _        d}|| _        dS )a@  Initializes the configuration for a Blackwell dense GEMM kernel.

        This configuration includes several key aspects:

        1.  MMA Instruction Settings (tcgen05):
            - acc_dtype: Data types for MMA accumulator, always set to Float32
            - sf_vec_size: Scalefactor A/B vector size.
            - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler.

        2.  Cluster Shape:
            - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster.

        :param sf_vec_size: Scalefactor vector size.
        :type sf_vec_size: int
        :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction.
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster.
        :type cluster_shape_mn: Tuple[int, int]
        sm_100sm_103zJ are the only supported SM versions for cute-dsl backend, but encountered r      r   )r   r   rD   r   r          rD   i   N)r   Float32	acc_dtyper   use_2cta_instrsrM   	mma_tilerr	   CtaGroupTWOONE	cta_group	occupancyepilog_warp_idmma_warp_idtma_warp_idrY   threads_per_ctacta_sync_bar_idepilog_sync_bar_idtmem_ptr_sync_bar_idutilsget_smem_capacity_in_bytessmem_capacitynum_tmem_alloc_cols)rR   r   r   rM   r   supported_sm_versionsSM100_TMEM_CAPACITY_COLUMNSs          r+   rT   z2Sm100BlockScaledPersistentDenseGemmKernel.__init__  s8   4 "*8 42222$||pz|| 322 !&+A#5 0+<+++ %)$8RG  g>N>R 	 
 !Ct/F$2EFF%
 %
  
  !"#$%!"=jII&)##>   r-   c                    d}| j         d         | j         d         || j        j        z  f| _        | j        d         | j        rdndz  t          j        | j        d         d          | j        d         f| _        t          j	        | j        | j
        | j        | j        | j        | j        | j        dd                   }t          j	        | j        | j
        | j        | j        | j        t
          j        j        j        j        | j        dd                   }d}| j        d         | j        d         | j        d         |z  f| _         | j        d         | j        d         | j        d         |z  f| _        | j         d         t          j        |j        j                  z  | j         d         | j         d         f| _        t          j        t          j        g | j        dR           |j        j        f          | _        t          j        t          j        g | j        dR           |j        j        f          | _        t          j        | j        j        d                   | _        t          j        | j        j        d                   | _        t          j        | j        j        d                   | _        | j        dk    | _         | j        dk    | _!        | j        dk    | _"        t          j#        | j        | j        | j$        | j%                  | _&        | '                    || j         | j        | j
        | j(        | j        | j&        | j%        | j$        | j        | j        | j)        | j*                  \  | _+        | _,        | _-        t          j.        || j         | j        | j,                  | _/        t          j0        || j         | j(        | j,                  | _1        te          j3        || j         | j        | j,                  | _4        te          j5        || j         | j        | j,                  | _6        t          j7        | j%        | j$        | j&        | j-                  | _8        dS )aJ  Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B/SFA/SFB
        - Computing epilogue subtile
        - Setting up A/B/SFA/SFB/C stage counts in shared memory
        - Computing A/B/SFA/SFB/C shared memory layout
        - Computing tensor memory allocation columns
        r   r   r   rD      Nr   )9r   a_dtypewidthmma_inst_shape_mnkr   rH   round_upmma_inst_shape_mnk_sfbsm100_utils"make_blockscaled_trivial_tiled_mmaa_major_modeb_major_modesf_dtyper   r   nvgpur	   r   r   mma_tiler_sfbr   thr_idrJ   cta_tile_shape_mnktiled_dividerO   rM   cluster_layout_vmnkcluster_layout_sfb_vmnknum_mcast_ctas_anum_mcast_ctas_bnum_mcast_ctas_sfb
is_a_mcast
is_b_mcastis_sfb_mcastcompute_epilogue_tile_shapec_layoutc_dtypeepi_tile_compute_stagesb_dtyper   r   num_acc_stagenum_ab_stager   make_smem_layout_aa_smem_layout_stagedmake_smem_layout_bb_smem_layout_stagedblockscaled_utilsmake_smem_layout_sfasfa_smem_layout_stagedmake_smem_layout_sfbsfb_smem_layout_stagedmake_smem_layout_epic_smem_layout_staged)rR   mma_inst_bits_k	tiled_mmatiled_mma_sfbmma_inst_tile_ks        r+   _setup_attributesz;Sm100BlockScaledPersistentDenseGemmKernel._setup_attributes:  s9     N1N1t|11#
 #A&0D+K11!LM$1!4c::#A&'
#  BLMN#BQB'
 
	 $FLMJ'+'+
 
 #A&#A&#A&8
 '*'*'*_<
 N19+;+A!B!BBN1N1#
 $(#48t48a8899#%$
 $
  (,'88t48a8899!')(
 (
$ !%	$*B*H*K L L $	$*B*H*K L L"&)D,H,Nq,Q"R"R/!3/!3 3a7 $?# ML	
 
 CGBVBVNLLMLMMNC
 C
?D-t/?" %0$BNL	%
 %
! %0$BNL	%
 %
! '8&LN	'
 '
# '8&LN	'
 '
# %0$DLMM	%
 %
!!!r-   a_tensorb_tensor
sfa_tensor
sfb_tensorc_tensormasked_m_tensorr?   alpha_tensorre   streamc                     |j          _        |j          _        |j          _        |j          _        t
          j                            |                                           _	        t
          j                            |                                           _
        t
          j                            |           _        t          j         j         j        k              rt          d j         d j                                                     t!          j        |j         j                  }t)          j        |j        |          }t!          j        |j         j                  }t)          j        |j        |          }t/          j         j         j	         j
         j         j         j         j        dd                   }t/          j         j         j	         j
         j         j        t(          j        j        j        j         j        dd                   }t)          j         |j!        j                  }t/          j"         j#        |j!                  }t)          j$         j%        d          }t(          j        &                    ||| j'        | j(        j                  \  }}t/          j)         j#        |j!                  }t)          j$         j*        d          }t(          j        +                    ||| j'        | j(        j                  \  }}t/          j"         j#        |j!                  }t)          j$         j,        d          }t(          j        &                    ||| j'        | j(        j        t          j-                  \  }}t/          j.         j#        |j!                  }t)          j$         j/        d          }t(          j        +                    ||| j0        | j1        j        t          j-                  \  }}t)          j2         j        |          } t)          j2         j        |          }!t)          j2         j        |          }"t)          j2         j        |          }#| |!z   |"z   |#z   |z   _3        t)          j$         j4        d          }$tk          j6        tk          j7                    ||$ j8                  \  }%}& 9                    ||| j:         j#        |	          \   _;        }'d _<        t(          j=         G  fd	d
                      }(|( _>         ?                    |||||||||||%|&| j(         j1         j%         j*         j,         j/         j4         j8         j;                  @                    |' jA        ddgg  j#        dR  j>        2                                |
           dS )a  Execute the GEMM operation in steps:
        - Setup static attributes before smem/grid/tma computation
        - Setup TMA load/store atoms and tensors
        - Compute grid size with regard to hardware constraints
        - Define shared storage for kernel
        - Launch the kernel synchronously

        :param a_tensor: Input tensor A
        :type a_tensor: cute.Tensor
        :param b_tensor: Input tensor B
        :type b_tensor: cute.Tensor
        :param sfa_tensor: Scale factor tensor A
        :type sfa_tensor: cute.Tensor
        :param sfb_tensor: Scale factor tensor B
        :type sfb_tensor: cute.Tensor
        :param c_tensor: Output tensor C
        :type c_tensor: cute.Tensor
        :param masked_m_tensor: Masked layout tensor M
        :type masked_m_tensor: cute.Tensor
        :param max_active_clusters: Maximum number of active clusters
        :type max_active_clusters: cutlass.Constexpr
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        :param alpha_tensor: Optional 1D tensor of shape (l,) containing per-batch scaling factors.
        :type alpha_tensor: cute.Tensor
        :raises TypeError: If input data types are incompatible with the MMA instruction.
        zType must match: z != NrD   NNNr   )internal_typeNNr      c                      e Zd ZU ej        j        ej         j        f         e	d<   ej        j        ej         j        f         e	d<   ej        j        ej         j
        f         e	d<   ej        j        ej         j
        f         e	d<   ej        e	d<   ej        e	d<   ej        j        ej        j         j         ej         j        j                  f          j        f         e	d<   ej        j        ej        j         j         ej         j        j                  f          j        f         e	d<   ej        j        ej        j         j         ej         j        j                  f          j        f         e	d	<   ej        j        ej        j         j         ej         j                  f          j        f         e	d
<   ej        j        ej        j         j         ej         j                  f          j        f         e	d<   dS )ISm100BlockScaledPersistentDenseGemmKernel.__call__.<locals>.SharedStorageab_full_mbar_ptrab_empty_mbar_ptracc_full_mbar_ptracc_empty_mbar_ptrtmem_dealloc_mbar_ptrtmem_holding_bufsCsAsBsSFAsSFBN)ri   rj   rk   rH   structMemRanger   r   r   __annotations__r   r   Alignr   cosizer  outerbuffer_align_bytesr   r   r   r   r   r   r  r   s   r+   SharedStorager    sS        "k27=$BS3STTTT#{3GM4CT4TUUUU#{3GM4CU4UVVVV $ 4W]DDV5V WWWW#*=000%m+++!$LDK 9 ?@@B '	)    !$L+$+d.G.M"N"NN ')    !$L+$+d.G.M"N"NN ')    +#$M;4;t/J#K#KK ')    +#$M;4;t/J#K#KK ')     r-   r+  r   )gridblockclustersmemr  )Belement_typer   r   r   r   r   
LayoutEnumfrom_tensormma_major_moder   r   r   r   r   	TypeErrorr	  r   tile_atom_to_shape_SFrJ   r   rH   make_tensoriteratorr   r   r   r   r   r	   r   r   r   r   r   cluster_shape_to_tma_atom_ArM   slice_r   make_tiled_tma_atom_Ar   r   cluster_shape_to_tma_atom_Br   make_tiled_tma_atom_Br   Int16cluster_shape_to_tma_atom_SFBr  r   r   size_in_bytesnum_tma_load_bytesr  r   make_tiled_tma_atomCopyBulkTensorTileS2GOpr   _compute_gridr   tile_sched_paramsr*  r$  shared_storagekernellaunchr   ))rR   r
  r  r  r  r  r  r?   r  re   r  
sfa_layout
sfb_layoutr  r  atom_thr_sizea_opa_smem_layout
tma_atom_atma_tensor_ab_opb_smem_layout
tma_atom_btma_tensor_bsfa_opsfa_smem_layouttma_atom_sfatma_tensor_sfasfb_opsfb_smem_layouttma_atom_sfbtma_tensor_sfba_copy_sizeb_copy_sizesfa_copy_sizesfb_copy_sizeepi_smem_layout
tma_atom_ctma_tensor_cr,  r+  s)   `                                        r+   __call__z2Sm100BlockScaledPersistentDenseGemmKernel.__call__  s   T /7.C.6.C/9/F.6.C!,88BBQQSS!,88BBQQSS(44X>> dldl:;; 	RPPP$,PPQQQ 	    '<ND,
 

 %j&9:FF
 '<ND,
 

 %j&9:FF
BLMN#BQB'
 
	 $FLMJ'+'+
 
 	)"2"899 6!9#3
 
 D$=?TUU#':#C#CN$*$
 $
 
L 6!9#3
 
 D$=?TUU#':#C#CN$*$
 $
 
L 8!9#3
 
 +')>
 
 (,z'G'GN$*!- (H (
 (
$n :!9#3
 
 +')>
 
 (,z'G'G(.!- (H (
 (
$n (}EE(}EE*4=/JJ*4=/JJ+%5E#
 +d&?QQ#*#>+--M	$
 $
 
L (,'9'9#!(
 (
$ #' 
*	 *	 *	 *	 *	 *	 *	 *	 *	 
*	X , 	$(%%''%M"-	
 	
. &'A./d+/Q//$2244  
 
 
 	r-   r  r  rM  mA_mklrQ  mB_nklrU  mSFA_mklrY  mSFB_nklr`  mC_mnlalphar   r   r   r   r   r  r  Nr   rD  c                 n7   t           j                                        }t           j                            |          }|| j        k    rdt          j        |           t          j        |           t          j        |           t          j        |	           t          j        |           t          j        |j        j	                  dk    }t           j        
                                \  }}}|t          j        |j        j	                  z  }|dk    }t           j                            t           j                                                  }|                    |          }|                    |          } t           j                                        \  }!}"}"t          j                    }#|#                    | j                  }$|$j        }%|$j        }&t)          j        t(          j        j                  }'| j        | j        z   dz
  }(t)          j        t(          j        j        |(          })t(          j                            |$j                                        | j        |'|)| j        |          }*t)          j        t(          j        j                  }+tA          | j!                  |rdndz  },t)          j        t(          j        j        |,          }-t(          j"                            |$j#                                        | j$        |+|-|          }.|rc|| j        k    rXd}/t           j        %                                5  t           j        &                    |%|/           ddd           n# 1 swxY w Y   t           j        '                                 t          j        | j(                  dk    rt           j        )                                 |$j*        +                    |j,        |j-                  }0|$j.        +                    |j,        |j-                  }1|$j/        +                    |j,        |j-                  }2|$j0        +                    |          }3|$j1        +                    |          }4d}5d}6d}7d}8te          j3        | j4        p| j5        p|          r\t          j6        ||d	          }5t          j6        ||d	          }6t          j6        ||d	          }7t          j6        || d	          }8t          j7        |t          j8        | j9        d
          d          }9t          j7        |t          j8        | j9        d          d          }:t          j7        |t          j8        | j9        d
          d          };t          j7        |
t          j8        | j9        d          d          }<t          j7        |t          j8        | j9        d          d          }=t          j        |9dg          }>|:                    |          }?|:                    |          }@|?;                    |9          }A|?<                    |:          }B|?;                    |;          }C|@<                    |<          }D|?=                    |=          }Et          j>        t          j8        |d          j	                  }Ft          j?        ||d         |Ft          j@        |1dd          t          j@        |Add                    \  }G}Ht          j>        t          j8        |d          j	                  }It          j?        ||d         |It          j@        |2dd          t          j@        |Bdd                    \  }J}K|F}Lt           jA        j        ?                    ||d         |Lt          j@        |3dd          t          j@        |Cdd                    \  }M}Nt          jB        |M          }Mt          jB        |N          }Nt          j>        t          j8        |d          j	                  }Ot           jA        j        ?                    |	| d         |Ot          j@        |4dd          t          j@        |Ddd                    \  }P}Qt          jB        |P          }Pt          jB        |Q          }Q|C                    |1          }R|D                    |2          }S|E                    | j9        dd                   }T|F                    t          jG        |T| j$                            }Ut          j        | j(                  dk    rt           j        H                                 n+t           j        I                    | jJ        | jK                   || j        k    rYt                              |t           j        
                                t           j        M                                          }V|VN                                }Wt)          jO        t(          jP        jQ        | j                  }X|WjR        r|WjS        }Y|Yd         t          j        |j        j	                  z  |Yd         |Yd         f}Z|Hd|Zd         d|Zd         f         }[|Kd|Zd         d|Zd         f         }\|Nd|Zd         d|Zd         f         }]|Qd|Zd         d|Zd         f         }^|XT                                 te          jU        d          }_|XjV        |>k     r|*W                    |X          }_te          jX        d|>dd          D ]y}`|*Y                    |X|_           t          jZ        ||[d|XjV        f         |Gd|Xj[        f         |*\                    |X          |5           t          jZ        ||\d|XjV        f         |Jd|Xj[        f         |*\                    |X          |6           t          jZ        ||]d|XjV        f         |Md|Xj[        f         |*\                    |X          |7           t          jZ        |	|^d|XjV        f         |Pd|Xj[        f         |*\                    |X          |8           |X]                                 te          jU        d          }_|XjV        |>k     r|*W                    |X          }_{|V^                                 |V_                                \  }W}"|WjR        |*`                    |X           || ja        k    rdtA          | ja        g| j!        R           z  }at           j        I                    | jb        |a           t           j        c                    | jd        d|&          }bt          je        |b|Ujf                  }ct          jg        |bt          ji        |c          z   | jj                  }dt          jl        || j9        | jm        t          j8        |d                    }et          je        |d|e          }ft          jg        |bt          ji        |c          z   t          ji        |f          z   | jj                  }gt          jn        || j9        | jm        t          j8        |d                    }ht          je        |g|h          }i| o                    |3|f          \  }j}k}l| o                    |4|i          \  }m}n}ot                              |t           j        
                                t           j        M                                          }V|VN                                }Wt)          jO        t(          jP        jp        | j                  }pt)          jO        t(          jP        jQ        | j$                  }q|WjR        r|WjS        }Y|Yd         t          j        |j        j	                  z  |Yd         |Yd         f}Z|cddd|qj[        f         }r|pT                                 te          jU        d          }s|pjV        |>k     r|r|*q                    |p          }s|r|.Y                    |q           |r                    t          js        jt        d           te          ju        |>          D ]}`|r\|*v                    |p|s           dddd|pj[        f}t|k|t         }u|n|t         }vt          jZ        |j|u|l           t          jZ        |m|v|o           t          j        |Rdg          }wte          jX        |wd          D ]}xdd|x|pj[        f}ydd|xf}z|r                    t          js        jw        |f|z         jx                   |r                    t          js        jy        |i|z         jx                   t          jz        ||r|R|y         |S|y         |r           |r                    t          js        jt        d           |*{                    |p           |p]                                 te          jU        d          }s|pjV        |>k     r|r|*q                    |p          }s|r|.|                    |q           |q]                                 |V^                                 |V_                                \  }W}"|WjR        |.`                    |q           || ja        k     	r|| j!        d         k    r't           j        }                    | j~        |&|           dtA          | ja        g| j!        R           z  }at           j        I                    | jb        |a           t           j        c                    | jd        d|&          }bt          je        |b|Ujf                  }c|!}{|                     |{|c|E||          \  }|}}}~t          j        |~j	        | j                  }|                     ||||{|0          \  }}}|                     |{||E||0          \  }}}t                              |t           j        
                                t           j        M                                          }V|VN                                }Wt)          jO        t(          jP        jp        | j$                  }t)          j        t(          j        j        dtA          | j!                  z  dtA          | j!                  z            }t(          j                            | j        |          }te          j3        |j        du          r7| j        dk     s
J d            |j        j	        d         }|d k    s
J d!            t          d          }t          d          }t          d          }|WjR        r|WjS        }Y|Yd         t          j        |j        j	                  z  |Yd         |Yd         f}Z|dddg|ZR          }|}ddddd|j[        f         }|.v                    |           t          j@        |dt          j        |                    }t          j@        |dt          j        |                    }t          j        |j	        dg          }|Vj        |z  }te          jX        |          D ]}|ddd|f         }t          jZ        ||||~           |                    |~                                          }te          j3        |du          r|||WjS        d                  z  }|                    | j                  }|                    |           ||z   | j        z  }t          jZ        |||ddd|f                    t           j                            t           j        j        j        t           j        j        j        "           dtA          | j!                  z  }t           j        I                    | j        |           || j!        d         k    rt          jZ        ||d|f         |d|f                    ||                                 te          j3        |j        du          rt|dz                       t                    }t/          ||          |k    }|r*t           j                            | j        dz
  d#           n)|Y                                 n|Y                                 t           j        I                    | j        |           te          j3        |j        du          r|!dz  }|| j!        d         k    rv|dk    rp||k     rjt/          ||          |k    rUt3          |j                                        t6          |z  z   d$           |dz  }||k     rt/          ||          |k    Ut           j        %                                5  |.{                    |           ddd           n# 1 swxY w Y   |]                                 |V^                                 |V_                    ||| j        %          \  }W}|WjR        || j!        d         k    r t           j                            |           dtA          | j!                  z  }t           j        I                    | j        |           || j!        d         k    rl|rCt           j                            |%|dz             t           j                            |%d           t           j                            |b| j~        |           te          j3        |j        du          rt           j                            dd#           |!dz  }|| j!        d         k    rN|dk    rJ||k     rFt3          |j                                        t6          |z  z   d$           |dz  }||k     @dS dS dS dS |`                                 dS dS )&zW
        GPU device kernel performing the Persistent batched GEMM computation.
        rD   r   r   )barrier_storage
num_stagesproducer_groupconsumer_grouptx_countcta_layout_vmnk)rj  rk  rl  rm  ro  r   N)swizzle)
mcast_mode)Nr   NrF   )r   NNr  r   )mode)r   r   Nr   )r   Nr   r   )
barrier_idnumber_of_threads)unroll)tma_bar_ptr
mcast_mask   )	alignmentptr_to_buffer_holding_addr)dtyper  FT)unroll_full)
is_two_cta)rk  rl  r   zmust be representable in 1 byter)   zneed to be packable into a u64)space)read)r%   r   )rH   archwarp_idxmake_warp_uniformr   r   prefetch_descriptorr   r   rJ   r   block_idx_in_clusterget_flat_coord
thread_idxr   SmemAllocatorallocaterE  r  r  pipelineCooperativeGroupAgentThreadr   r   PipelineTmaUmmar   r  data_ptrr   r@  rY   r   PipelineUmmaAsyncr  r   	elect_onembarrier_initmbarrier_init_fencerM   cluster_arrive_relaxedr  
get_tensorr)  innerr   r!  r"  r#  r   r   r   r   create_tma_multicast_mask
local_tiler9  r   	get_slicepartition_Apartition_Bpartition_CrO   tma_partitiongroup_modesr   filter_zerosmake_fragment_Amake_fragment_Bpartition_shape_Cmake_fragment_CrX   cluster_waitbarrierr   r   rr   r   r   make_pipeline_statePipelineUserTypeProduceris_valid_tiletile_idxreset_countBooleancountproducer_try_acquirerangeproducer_acquirecopyr$   producer_get_barrieradvancer   r   producer_tailr   r   retrieve_tmem_ptrr   r6  layout
recast_ptrr	   find_tmem_tensor_col_offsetr   r   make_tmem_layout_sfar   make_tmem_layout_sfbmainloop_s2t_copy_and_partitionConsumerconsumer_try_waitsetField
ACCUMULATErange_constexprconsumer_waitSFAr7  SFBgemmconsumer_releaseproducer_commit
alloc_tmemr   epilog_tmem_copy_and_partitionmake_fragmentr   epilog_smem_copy_and_partitionepilog_gmem_copy_and_partitionPipelineTmaStorer   r?   r>   r   r   r   rankrx   retileloadr/   storefence_proxy	ProxyKindasync_sharedSharedSpace
shared_ctar   r0   cp_async_bulk_wait_groupr;   toint
sizeof_i32relinquish_tmem_alloc_permitmbarrier_arrivembarrier_waitdealloc_tmem)rR   r  r  rM  rc  rQ  rd  rU  re  rY  rf  r`  rg  rh  r   r   r   r   r   r  r  r   rD  r  r   r   r   r   mma_tile_coord_vis_leader_ctacta_rank_in_clusterblock_in_cluster_coord_vmnkblock_in_cluster_coord_sfb_vmnktidxr   r/  storager  r  ab_pipeline_producer_groupnum_tma_producerab_pipeline_consumer_groupab_pipelineacc_pipeline_producer_groupnum_acc_consumer_threadsacc_pipeline_consumer_groupacc_pipelinenum_tmem_dealloc_threadsr  r   r!  r"  r#  a_full_mcast_maskb_full_mcast_masksfa_full_mcast_masksfb_full_mcast_maskgA_mklgB_nklgSFA_mklgSFB_nklgC_mnlk_block_cntthr_mmathr_mma_sfbtCgAtCgBtCgSFAtCgSFBtCgCa_cta_layouttAsAtAgAb_cta_layouttBsBtBgBsfa_cta_layouttAsSFAtAgSFAsfb_cta_layouttBsSFBtBgSFBtCrAtCrB	acc_shapetCtAcc_fake
tile_sched	work_tileab_producer_stater   mma_tile_coord_mnl
tAgA_slice
tBgB_slicetAgSFA_slicetBgSFB_slicepeek_ab_empty_statusk_blocktmem_ptr_read_threadsacc_tmem_ptrtCtAcc_basesfa_tmem_ptrtCtSFA_layouttCtSFAsfb_tmem_ptrtCtSFB_layouttCtSFBtiled_copy_s2t_sfatCsSFA_compact_s2ttCtSFA_compact_s2ttiled_copy_s2t_sfbtCsSFB_compact_s2ttCtSFB_compact_s2tab_consumer_stateacc_producer_statetCtAccpeek_ab_full_statuss2t_stage_coordtCsSFA_compact_s2t_stagedtCsSFB_compact_s2t_stagednum_kphases
kphase_idxkphase_coordsf_kphase_coordepi_tidxtiled_copy_t2rtTR_tAcc_basetTR_rAcctTR_rCtiled_copy_r2stRS_rCtRS_sCbSG_sCbSG_gC_partitionedacc_consumer_statec_producer_group
c_pipelinenum_expertsr   dsm_pending_idxr   bSG_gCtTR_tAccsubtile_cntnum_prev_subtilessubtile_idxtTR_tAcc_mnacc_vecc_bufferepilog_threadswill_write_signalslane_ids                                                                                                                                                          r+   rF  z0Sm100BlockScaledPersistentDenseGemmKernel.kernel  s   : 9%%''9..x88
 t''''
333'
333'555'555'
333)I$4$:;;q@  9..00dD$)I,<,B"C"CC(A-"i99I**,,
 
 ':&H&H'
 '
# +B*P*P+
 +
' Y))++
a
 "$$-- 344 ' ="3 &.%>x~?T%U%U"043HH1L%-%>N!#3&
 &
" .55#4==??(55,/ 6 
 
 '/&?@U&V&V##&t':#;#; 'AAa$
  '/&?N!#;'
 '
#  188#5>>@@)66/ 9 
 
  	4++++-(Y((**  I++-/G                 		%%''' 9T*++a//I,,... Z"" &0D0J # 
 
 Z"" &0D0J # 
 
 Z"" &0D0J # 
 
 |&&'=>>|&&'=>>
 ! ""doSSOTT 	 ' A#%@Q! ! ! !( A#%@Q! ! ! #*"C#%@Q# # # #*"C')HUV# # # DK@@BT
 
 DK@@BT
 
 ?dk$./BBDV
 
 ?dk$./BBDV
 
 DK@@BT
 
 iaS111
 %%&677#--.>??""6**""6**$$X..((22""6** 'K+_==C
 

 *'*RA&&T1a((
 

d 'K+_==C
 

 *'*RA&&T1a((
 

d & +99'*T1a((VQ**
 
 "6**"6** )K/AAG
 

 +99+A.T1a((VQ**
 
 "6**"6** ((,,((,,//rr0BCC	//K	4#566
 
 9T*++a//I""$$$$I/4CW     t''' )//!49#6#6#8#8$):L:L:N:N J #99;;I ( <)2D4E! ! ) X=!*!3"1%93C3I)J)JJ"1%"1%&" "-a0$8J18MN
 "-a0$8J18MN

  &-a0$8J18MN   &-a0$8J18MN 
 "--///'.q'9'9$$*[88+6+K+K), ,(  '}QQqIII * *G00)+?  
 I""D*;*A#BCd$5$;<=$/$D$DEV$W$W#4    I""D*;*A#BCd$5$;<=$/$D$DEV$W$W#4    I$$d,=,C%DE&7&=>?$/$D$DEV$W$W#6    I$$d,=,C%DE&7&=>?$/$D$DEV$W$W#6    &--///+2?1+=+=((.<</:/O/O-0 0, //111)::<<	1q ) X=z %%&7888
 t''' %'d.>-UAT-U-U)V)V$V!I4"7      966+; 7  L *<9KLLK  ?wB;OOOm  L
 .B 24IJJ	 M %lMBBF  ?5kBBC5f==> m	  L .B 24IJJ	 M %lMBBF
 44T6BB G 24F 44T6BB G 24F )//!49#6#6#8#8$):L:L:N:N J #99;;I ( <)2D4E! ! "*!=)2D4F" " ) v=!*!3"1%93C3I)J)JJ"1%"1%&" %dD$8J8P%QR "--///&-oa&8&8#$*[88]8*5*G*G)+ +' ! F 112DEEE
 gm6>>>
  '6{CC F FG$ <H#11-/B   !   -3+ 5G4W14F4W1	.5.  
 	.5.   '+iA3&?&?&?*1-QU*V*V*V J JJ $ $ * 1 7	,L 04T:.FO%MM ' 1 & 7 @   &MM ' 1 & 7 @  
 !I ) & $\ 2 $\ 2 &   &MM'-*BDIIII $445FGGG &--///*1/!*<*<'(.<<( 2=2O2O 13 3/ ! E 001CDDD"**,,,
 //111)::<<	1m ) v=v &&'9::: d&&& 4.q111	$$,$. %    %'d.>-UAT-U-U)V)V$V!I4"7      966+; 7  L *<9KLLK
 H33k4?  4NM8 'EEF-1-P-P". .*NFF 33j$"  3J 2 )//!49#6#6#8#8$):L:L:N:N J #99;;I!)!=)2D4F" "
  (8%S,---S,---   
 "299+/ :  J
 !"3"?t"KLL J'#---/P---/8>qA"a''')I'''!'#AhhO((K) V!*!3"1%93C3I)J)JJ"1%"1%&" , ,	  )4tT3E3KL **+=>>>+Ha89L9LMM)&!TYv5F5FGG
 #iaSAAA$.$AK$O!#*=#=#= \5 \5K #+D$k+J"KKInk8DDD
 -33H==BBDDG)%t*;<< I")E)2DQ2G,H"H%jj66GLL)))
 !2K ?4CSSHI&dD(;<   I))	+8"i3> *    &(#d.A*B*B%BNI%%#'#:*8 &     4#6q#999	&"D(#34"D+#67   #22444"--9E  : ,7?*>*>u*E*EK )*<o N N#.!/ /
  2 >
 !%	 B B$($4q$8). !C !" !" !" !" !+ ; ; = = = = '77999I%%#'#:*8 &   
 )*;*Gt*STT 5"&)#t':1'==='Q,,#2[#@#@ )*<o N N#.!/ !/ !:$5$A$G$G$I$I&0?&B%C*+!" !" !" !"
 !01 4 $3[#@#@ )*<o N N#.!/ !/ Y((** F F 112DEEEF F F F F F F F F F F F F F F"**,,,
 //1110:0K0K'9 + $ 0 1L 1 1-	-e ) Vv 4.q111	66/6RRR#d&9":"::NI2n     4.q111" FI---/BQ/F   I++,A1EEE	&& $": '    !"3"?t"KLL +
 	22 3    )t21555'Q,,)K771-9??AA(?:;"#   
 (1, *K7777 65,,77 ((*****k '&s*   !M55M9<M9fAf3f3Af7f:Af7sSFtSFr&   c                    t          j        |          }t          j        |          }t          j        t          j        | j                  | j                  }t          j        ||          }|                    d          }|	                    |          }t          j
        ||          }	|                    |          }
||	|
fS )a  
        Make tiledCopy for smem to tmem load for scale factor tensor, then use it to partition smem memory (source) and tensor memory (destination).

        :param sSF: The scale factor tensor in smem
        :type sSF: cute.Tensor
        :param tSF: The scale factor tensor in tmem
        :type tSF: cute.Tensor

        :return: A tuple containing (tiled_copy_s2t, tCsSF_compact_s2t, tCtSF_compact_s2t) where:
            - tiled_copy_s2t: The tiled copy operation for smem to tmem load for scale factor tensor(s2t)
            - tCsSF_compact_s2t: The partitioned scale factor tensor in smem
            - tSF_compact_s2t: The partitioned scale factor tensor in tmem
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        r   )rH   r  make_copy_atomr	   Cp4x32x128bOpr   r   make_s2t_copyr  partition_Sget_s2t_smem_desc_tensorpartition_D)rR   rL  rM  tCsSF_compacttCtSF_compactcopy_atom_s2ttiled_copy_s2tthr_copy_s2ttCsSF_compact_s2t_tCsSF_compact_s2ttCtSF_compact_s2ts              r+   r  zISm100BlockScaledPersistentDenseGemmKernel.mainloop_s2t_copy_and_partition?  s    ( )#..)#.. +!$.11M
 
 !.}mLL%//22 *55mDD#<.
 
 )44]CC02CCCr-   r  tAccr  r   c                    t          j        | j        | j        | j        | j        ||          }t          j        |d         |          }t          j	        ||d                   }|
                    |          }	|	                    |          }
t          j        |d         |          }|	                    |          }t          j        |d         j        | j                  }||
|fS )a  
        Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination).

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param tAcc: The accumulator tensor to be copied and partitioned
        :type tAcc: cute.Tensor
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param use_2cta_instrs: Whether use_2cta_instrs is enabled
        :type use_2cta_instrs: bool

        :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where:
            - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
            - tTR_tAcc: The partitioned accumulator tensor
            - tTR_rAcc: The accumulated tensor in register used to hold t2r results
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        )NNr   r   N)NNr   r   r   r_  r   r   NNN)NNNr   r   r   r   r   )r   get_tmem_load_opr   r   r   r   rH   flat_divider	   make_tmem_copyr  rR  rT  r  rJ   )rR   r  r]  r  r   r   copy_atom_t2rtAcc_epir3  thr_copy_t2rrB  
gC_mnl_epitTR_gCr5  s                 r+   r  zHSm100BlockScaledPersistentDenseGemmKernel.epilog_tmem_copy_and_partitionj  s    : $4#MLN
 
 #+,
 

 !/8$9:
 
 &//55++H55 %9:H
 

 ))*55%45;T^
 
 x11r-   r3  r6  r  c                     t          j        | j        | j        | j        |          }t          j        ||          }|                    |          }|                    |          }|	                    |          }	||	|fS )a  
        Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination).

        :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
        :type tiled_copy_t2r: cute.TiledCopy
        :param tTR_rC: The partitioned accumulator tensor
        :type tTR_rC: cute.Tensor
        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param sC: The shared memory tensor to be copied and partitioned
        :type sC: cute.Tensor
        :type sepi: cute.Tensor

        :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where:
            - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s)
            - tRS_rC: The partitioned tensor C (register source)
            - tRS_sC: The partitioned tensor C (smem destination)
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        )
r   get_smem_store_opr   r   r   rH   make_tiled_copy_Dr  rT  r  )
rR   r3  r6  r  r  copy_atom_r2sr7  thr_copy_r2sr9  r8  s
             r+   r  zHSm100BlockScaledPersistentDenseGemmKernel.epilog_smem_copy_and_partition  s    4 $5M4<
 
 /~NN%//55))"--&&v..vv--r-   atomc                     t          j        |d         |          }|}t          j        |dd          }t          j        |dd          }	t          j        |dt          j        d          ||	          \  }
}||
|fS )a  Make tiledCopy for global memory store, then use it to:
        partition shared memory (source) and global memory (destination) for TMA store version.

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param atom: The copy_atom_c to be used for TMA store version, or tiled_copy_t2r for none TMA store version
        :type atom: cute.CopyAtom or cute.TiledCopy
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param sC: The shared memory tensor to be copied and partitioned
        :type sC: cute.Tensor

        :return: A tuple containing (tma_atom_c, bSG_sC, bSG_gC) where:
            - tma_atom_c: The TMA copy atom
            - bSG_sC: The partitioned shared memory tensor C
            - bSG_gC: The partitioned global tensor C
        :rtype: Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]
        r`  r   rD   r   )rH   rb  r  r   r  rO   )rR   r  rn  r  r   r  gC_epir`  sC_for_tma_partitiongC_for_tma_partitionr:  rA  s               r+   r  zHSm100BlockScaledPersistentDenseGemmKernel.epilog_gmem_copy_and_partition  s    : !9:H
 
 
#/Aq99#/1== !.Q  
 
 66))r-   mma_tiler_mnkr   r   r   r   r   r   r   r   r   c                 R   |d         dk    rdnd}d}t          j        | ||d          }t          j        | ||d          }t          j        | ||
d          }t          j        | ||
d          }t          j        |||d          }t          j        ||          t          j        ||          z   t          j        |	|          z   t          j        |	|          z   }d}t          j        ||          }||z  }||z  ||z   z
  |z  }||||z  |z  z
  |||z   z  z
  ||z  z  z  }|||fS )a  Computes the number of stages for A/B/C operands based on heuristics.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler.
        :type mma_tiler_mnk: tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param a_major_mode: Major mode of operand A.
        :type a_major_mode: tcgen05.OperandMajorMode
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param b_major_mode: Major mode of operand B.
        :type b_major_mode: tcgen05.OperandMajorMode
        :param epi_tile: The epilogue tile shape.
        :type epi_tile: cute.Tile
        :param c_dtype: Data type of operand C (output).
        :type c_dtype: type[cutlass.Numeric]
        :param c_layout: Layout enum of operand C.
        :type c_layout: utils.LayoutEnum
        :param sf_dtype: Data type of Scale factor.
        :type sf_dtype: type[cutlass.Numeric]
        :param sf_vec_size: Scale factor vector size.
        :type sf_vec_size: int
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int

        :return: A tuple containing the computed number of stages for:
                 (ACC stages, A/B operand stages, C stages)
        :rtype: tuple[int, int, int]
        r   r   rD   r  )	r   r   r   r   r   r  r  rH   r?  )r  rs  r   r   r   r   r   r   r   r   r   r   r   r   r   a_smem_layout_stage_oneb_smem_layout_staged_onesfa_smem_layout_staged_onesfb_smem_layout_staged_onec_smem_layout_staged_oneab_bytes_per_stagembar_helpers_bytesc_bytes_per_stagec_bytesr   s                            r+   r   z9Sm100BlockScaledPersistentDenseGemmKernel._compute_stages  s   d +1-44!  #."@	#
 #
 $/#A	$
 $
  &7%K	&
 &
" &7%K	&
 &
" $/#C	$
 $
  w(?@@ *BCCD +EFFG  +EFFG 	 " .w8PQQ#k1 Y&*<w*FG  	,,|;<-789 ++	- 	- lK77r-   r@   r   c                     t          j        |d          }g |dR }t          | ||||          }t                              ||          }	||	fS )a  Use persistent tile scheduler to compute the grid size for the output tensor C.

        :param c: The output tensor C
        :type c: cute.Tensor
        :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
        :type cta_tile_shape_mnk: tuple[int, int, int]
        :param cluster_shape_mn: Shape of each cluster in M, N dimensions.
        :type cluster_shape_mn: tuple[int, int]
        :param max_active_clusters: Maximum number of active clusters.
        :type max_active_clusters: cutlass.Constexpr

        :return: A tuple containing:
            - tile_sched_params: Parameters for the persistent tile scheduler.
            - grid: Grid shape for kernel launch.
        :rtype: Tuple[MaskedSchedulerParams, tuple[int, int, int]]
        r  r   )rH   r9  r=   rr   rh   )
r  r?   r@   r   rM   re   rA   cluster_shape_mnlrD  r,  s
             r+   rC  z7Sm100BlockScaledPersistentDenseGemmKernel._compute_gridr  sj    2 +0/BB2.2221[!W6G
 
 --.?ATUU $&&r-   ab_dtypec                    d}| t           j        t           j        t           j        hvrd}|dvrd}|t           j        t           j        hvrd}|t           j        k    r|dk    rd}| t           j        t           j        hv r|dk    rd}|t           j        t           j        t           j        t           j        t           j        hvrd}|S )aO  
        Check if the dtypes and sf_vec_size are valid combinations

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param sf_dtype: The data type of the scale factor
        :type sf_dtype: Type[cutlass.Numeric]
        :param sf_vec_size: The vector size of the scale factor
        :type sf_vec_size: int
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]

        :return: True if the dtypes and sf_vec_size are valid, False otherwise
        :rtype: bool
        TF>   rx  r   r   rx  )r   Float4E2M1FN
Float8E5M2Float8E4M3FNFloat8E8M0FNUr   Float16BFloat16)r  r   r   r   r   s        r+   )is_valid_dtypes_and_scale_factor_vec_sizezSSm100BlockScaledPersistentDenseGemmKernel.is_valid_dtypes_and_scale_factor_vec_size  s    ,    
 
 

 H h&&H G173GHHHH w+++r0A0AH*G,@AAAkUWFWFWH OO 
 
 
 Hr-   a_majorb_majorc_majorc                 B    d}| t           j        u r|dk    r|dk    sd}|S )a}  
        Check if the dtypes and sf_vec_size are valid combinations

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param a_major: The major dimension of the A tensor
        :type a_major: str
        :param b_major: The major dimension of the B tensor
        :type b_major: str
        :param c_major: The major dimension of the C tensor
        :type c_major: str

        :return: True if the layouts are valid, False otherwise
        :rtype: bool
        TkF)r   r  )r  r   r  r  r  r   s         r+   is_valid_layoutsz:Sm100BlockScaledPersistentDenseGemmKernel.is_valid_layouts  s2    2 w+++W^^SVHr-   c                 P   d}| d         dvrd}| d         dvrd}|d         | d         dk    rdndz  dk    rd}d }|d         |d         z  d	k    sR|d         dk    sF|d         dk    s:|d         d
k    s.|d         d
k    s" ||d                   r ||d                   sd}|S )a  
        Check if the mma tiler and cluster shape are valid

        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]

        :return: True if the mma tiler and cluster shape are valid, False otherwise
        :rtype: bool
        Tr   )r   r   Fr   r   rD   c                 &    | dk    o| | dz
  z  dk    S )Nr   r   rp   )r   s    r+   <lambda>z`Sm100BlockScaledPersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shape.<locals>.<lambda>	  s    !a%">Q!a%[Q,> r-   rx  r   rp   )r   rM   r   is_power_of_2s       r+   $is_valid_mma_tiler_and_cluster_shapezNSm100BlockScaledPersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shape  s      ?*,,H?*,,HA|A#'='=!!1EJJH>>Q"21"55::"a''"a''  "Q&&"Q&& =!1!!455 ' =!1!!455 ' Hr-   mnr  lc	                     d}	d }
 |
||dk    | ||f          r( |
||dk    |||f          r |
||dk    | ||f          sd}	|	S )a  
        Check if the tensor alignment is valid

        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The number of columns in the C tensor
        :type l: int
        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param c_major: The major axis of the C tensor
        :type c_major: str

        :return: True if the problem shape is valid, False otherwise
        :rtype: bool
        Tc                 D    |rdnd}||         }d| j         z  }||z  dk    S )Nr   r   r   )r   )r{  is_mode0_majortensor_shapemajor_mode_idxnum_major_elementsnum_contiguous_elementss         r+   check_contigous_16B_alignmentzjSm100BlockScaledPersistentDenseGemmKernel.is_valid_tensor_alignment.<locals>.check_contigous_16B_alignment>	  s9    "07QQaN!-n!=&,&;#%(??1DDr-   r  r  Frp   )r  r  r  r  r  r   r  r  r  r   r  s              r+   is_valid_tensor_alignmentzCSm100BlockScaledPersistentDenseGemmKernel.is_valid_tensor_alignment	  s    J 	E 	E 	E .-h3Aq	RR	007c>AqRS9UU	 10'S.1aQR)TT	
 Hr-   c                 
   d}t                               | |||          sd}t                               | ||
||          sd}t                               ||          sd}t                               ||||	| ||
||	  	        sd}|S )aD  
        Check if the gemm can be implemented

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param sf_dtype: The data type of the scale factor tensor
        :type sf_dtype: Type[cutlass.Numeric]
        :param sf_vec_size: The vector size
        :type sf_vec_size: int
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]
        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The number of columns in the C tensor
        :type l: int
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param c_major: The major axis of the C tensor
        :type c_major: str

        :return: True if the gemm can be implemented, False otherwise
        :rtype: bool
        TF)r   r  r  r  r  )r  r   r   r   r   rM   r  r  r  r  r  r  r  can_implements                 r+   r  z7Sm100BlockScaledPersistentDenseGemmKernel.can_implementL	  s    b 8bbhW
 
 	" "M8IIgw
 
 	" "M8]]*
 
 	" "M8RRq!Q'7GW
 
 	" "Mr-   )1ri   rj   rk   __doc__rn   r   strrT   r	  rH   r   rl   r   rm   r   	ConstexprcudaCUstreamrb  rF  TiledMmaCopyAtomLayoutComposedLayoutr   Tiler=   	TiledCopyr  r   r  boolr  r  r  r   r   Numericr	   OperandMajorModer   r1  r   rC  r  r  r  r  r  rp   r-   r+   r   r     s=       ' 'R=?=? CHo=?  S/	=?
 =? =? =? =?~L
 L
 L
\ 
XD+D +D K	D
 KD +D D dl+D t{+D %.D D D D XDN 
[l+=l+ }l+ M	l+
 l+ Ml+ l+ ml+ +l+ ml+ +l+ T]+l+ l+ $l+ "[l+  "&!l+" #1#l+$ #1%l+& !%'l+( !%)l+* $DK1Dd$JK+l+, )-l+. 1/l+ l+ l+ [l+\)D[)D [)D 
t~t{DK7	8	)D )D )D )DV=2m=2 k=2 	=2
 )=2 w45=2 
t~t{DK7	8=2 =2 =2 =2~#.#. #. m	#.
 K#. 
t~t{DK7	8#. #. #. #.J-*m-* DM4>12-* 	-*
 )-* K-* 
t}dk4;6	7-* -* -* -*^ r8=r8S#s]+r8 go&r8 .	r8
 go&r8 .r8 )r8 go&r8 "r8 w'r8 r8 r8 r8 
sC}	r8 r8 r8 \r8h  ' 'dl+ ' ; ' "#sC-0	 '
  S/ ' %. ' 
$eCcM&::	; '  '  ' \ 'D 7w'7w'7 7 go&	7
 
7 7 7 \7r w'go&  	
  
   \< &CHo&S/& 
& & & \&P 222 2 	2
 w'2 go&2 2 2 2 
2 2 2 \2h Ew'Ew'E E go&	E
 CHoE  S/E E E E E E E E 
E E E \E E Er-   r   sf_ref_tensorsf_mma_tensorc                     t          j        |dd          }t          j        |dd          }t          j        t          j        |                     D ]'}| j                            |          }| |         ||<   (dS zdConvert scale factor tensor from MKL layout to mma specification M(32x4xrest_m)xK(4xrest_k)xL layoutr   r   r   N)rH   r  r   r  r   r  get_hier_coord)r  r  i	mkl_coords       r+   cvt_sf_MKL_to_M32x4xrm_K4xrk_Lr  	  s     $]Aq99M$]Aq99M]49]3344 < <!(77::	#0#;i  < <r-   c                 ^    t          j        | dd          } t          j        | dd          } dS r  )rH   r  )r  s    r+   'cvt_sf_MKL_to_M32x4xrm_K4xrk_L_mma_specr  	  s0     $]Aq99M$]Aq99MMMr-   c           
         d } |||          }| ||f}d}	d}
|  |||	d         |	d         z             |||
          |	d         |	d         |
f}d}d}t          j        |t          j        |t           j        j        t          j        dd	          
          }t          j        |t          j        |t           j        j        t          j        dd	          
          }t          t          |          t          |                     |	                    |d          } |
                    ddd                              d                              | |||                              | |||z            j
        | }|d d d |d d f         }|	                    |d          }t          j        ||dd          \  }}t          j        |||d          }|||fS )Nc                     | |z   dz
  |z  S r   rp   abs     r+   rP   z,create_scale_factor_tensor.<locals>.ceil_div	  s    A	ar-   r   r   r   r   r   )r   rD   r   r   r   r   r   rD   r   r   )min_valmax_val)permute_order	init_typeinit_configT)non_blockingrD   rx  )is_dynamic_layoutassumed_align)r  )cutlass_torchcreate_and_permute_torch_tensortorchfloat32TensorInitTypeRANDOMRandomInitConfigr  r
   r/   permute	unsqueezeexpandreshapecute_tensor_likeconvert_cute_tensor)r  mnr  r   r{  devicerP   sf_k	ref_shapeatom_matom_k	mma_shaperef_permute_ordermma_permute_orderref_f32_torch_tensor_cpucute_f32_torch_tensor_cpucute_f32_torch_tensorref_f32_torch_tensorcute_tensorcute_torch_tensors                       r+   create_scale_factor_tensorr  	  s\         8A{##DBIFF	VAY*++vq	q	I "*  -L'.5!2
 
 
	  	  	  !. M'.5!2
 
 
	! 	! 	! #,---..   688d8SS	 ((Aq11	2	2t[	)	)	B{*	+	+	#		%   82A2qqqA366vD6QQ &3%C!	& & &"K"  3	  K  .???r-   c                    J   e Zd Zdedededededededej        d	ej        d
ej        dej        dedeeef         deeef         dedef dZe	j
        de	j        de	j        de	j        de	j        de	j        de	j        dee	j                 de	j        dej        fd            ZdS )MaskedBatchedMatmulCuteDSLr  r  r  r  r  r  r  r  r   r   alpha_dtyper   r   rM   sm_countr   c                 N   || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        t                              ||	||
|||||||||          s6t!          d| d|	 d| d|
 d| d| d| d| d| d| d| d| d|           t"          j                                        }t)          |                    | j        d         | j        d         z            |          | _        || _        d S )Nz-MaskedBatchedMatmulCuteDSL: Unsupported with z, z,  r   r   )_m_n_k_l_a_major_b_major_c_major	_ab_dtype	_sf_dtype_c_dtype_alpha_dtype_sf_vec_size_mma_tiler_mn_cluster_shape_mnr   r  r4  r   r   HardwareInfominget_max_active_clusters_max_active_clusters_sm_version)rR   r  r  r  r  r  r  r  r  r   r   r  r   r   rM   r  r   hardware_infos                     r+   rT   z#MaskedBatchedMatmulCuteDSL.__init__
  s   & !!'')!18FF
 
 	  K  K  KH  K  KXc  K  Kgn  K  Ks  K  K  DT  K  K  XY  K  K  ]^  K  K  bc  K  K  gh  K  K  ls  K  K  w~  K  K  BI  K  K  
  2244$'11&q)D,B1,EE  	%
 %
! &r-   a_ptrb_ptrsfa_ptrsfb_ptrc_ptrmasked_m_ptrdst_signals_ptr	alpha_ptrcurrent_streamc
                 N   t          j        |t          j        | j        | j        | j        f| j        dk    rdnd                    }
t          j        |t          j        | j        | j        | j        f| j        dk    rdnd                    }t          j        |t          j        | j        | j        | j        f| j	        dk    rdnd                    }d } || j        | j
                  }d}d	}| j         || j        |d
         |d         z             |||          |d
         |d         |f}| j         || j        |d
         |d         z             |||          |d
         |d         |f}d}t          j        |t          j        ||                    }t          j        |t          j        ||                    }t          |           t          |           t          j        |t          j        | j        fd                    }t          j        |d u          r0t          j        |t          j        | j        fd                    nd } t          | j
        | j        | j        | j                  |
|||||||| j        |	
  
         d S )Nr  )r   r   rD   )r   r   rD   )order)r  r  c                     | |z   dz
  |z  S r   rp   r  s     r+   rP   z5MaskedBatchedMatmulCuteDSL.__call__.<locals>.ceil_divf
  s    EAI!##r-   r  r   r   r   r  )r   )r   r   rM   r   )rH   r6  make_ordered_layoutr  r  r  r  r  r  r  r  r  r   r   r   r  r  r  r  )rR   r   r  r  r  r  r  r  r  r  r
  r  r  rP   r  r  r  mma_shape_amma_shape_br  r  r  r  r  s                           r+   rb  z#MaskedBatchedMatmulCuteDSL.__call__B
  sJ    #+$'47+#'=C#7#7iiY  
 
 
 #+$'47+#'=C#7#7iiY  
 
 
 #+$'47+#'=C#7#7iiY  
 
 
	$ 	$ 	$ x!233GHTWfQi&)344HT6""1I1I
 GHTWfQi&)344HT6""1I1I
 /%+'  
 
 

 %+'  
 
 

 	0
;;;/
;;;*+TWJdCCC
 
 
 !)4"788	D/
$GGG   
  		
1)+!3'		
 	
 	
 %	
 	
 	
 	
 	
r-   N)ri   rj   rk   rn   r  r  r{  r   rT   rH   r   rm   r   r  r  rb  rp   r-   r+   r  r  
  s       =&=& =& 	=&
 =& =& =& =& +=& +=& =& [=& =& CHo=&  S/=&  !=&" #=& =& =& =&~ 
Xl
|l
 |l
 	l

 l
 |l
 ll
 "$,/l
 <l
 l
 l
 l
 Xl
 l
 l
r-   r  r  r  r  r  r  r  r  r  r   r   r  r   r   rM   r  r   enable_dst_signalsc                 t   	
 dt           t          t          j                          dt          t          j                 f
	fdt	          j        t          di d dd|dd|d	|d
|ddd	d
d|d|d|d|d|g d           t          j	                    R  	 	 ddt          j
        dt          j
        dt          j
        dt          j
        dt          j
        dt          j
        dt           t          j
                 dt           t          j
                 f	 fd}|S )Ninput_tensorsr&   c           	      F   | 'd t          d          D             \  }}}}}}}}sd }n| \  }	}
}}}}}}|d uk    sJ |	                                |
                                |                                |                                |                                |                                ||                                nd ||                                nd f\  }}}}}}}}t          |t          j        j        d          }t          |t          j        j        d          }t          |t          j        j        d          }t          |t          j        j        d          }t          |t          j        j        d          }t          t          j        |t          j        j        d          }|,t          t          j        |t          j        j        d          nd }|$"t          |t          j        j        d          nd }||||||||gS )Nc                     g | ]}d S )rx  rp   )r   r   s     r+   
<listcomp>zWget_cute_dsl_compiled_masked_gemm_kernel.<locals>.get_cute_pointers.<locals>.<listcomp>
  s    &&&&&&r-   r)   rx  )r  )	r  r  r   rH   AddressSpacegmemr   r   Uint32)r  
a_data_ptr
b_data_ptrsfa_data_ptrsfb_data_ptr
c_data_ptrmasked_m_data_ptrdst_signals_data_ptralpha_data_ptra_tensor_gpub_tensor_gpusfa_tensor_gpusfb_tensor_gpuc_tensor_gpumasked_m_tensor_gpudst_signals_tensor_gpualpha_tensor_gpur   r  r  r  r  r  r  r  r  r  r   r  r   s                            r+   get_cute_pointerszCget_cute_dsl_compiled_masked_gemm_kernel.<locals>.get_cute_pointers
  s      '&U1XX&&&	!$ & ,'+$ 	#&  &*@*LMMMM %%''%%''''))''))%%''#,,..)5 '//111/?/K ))+++QU	!$ "	
 
 
 "	
 
 
 "	
 
 
 "	
 
 
 "	
 
 
  M"	
 
 
 $/ $!& 	     	" )k.E !& 	     	 	
 		
r-   r  r  r  r  r  r  r  r  r   r   r  r   r   rM   r  r   r   r!  r"  r#  r%  r&  r$  r'  c                     |'t          j        ft          	          d          }t          j                    } g  
| |||||||g          |R   |S )Nr  )r{  r  )r  emptyr   r  r  )r   r!  r"  r#  r%  r&  r$  r'  r  r   r(  rF  r  r  r  s            r+   
tensor_apiz<get_cute_dsl_compiled_masked_gemm_kernel.<locals>.tensor_apiW  s      ;Aq	,W55  L '577 	 	
  "" '*$	 	
 	
 	
 	
 	
  r-   rp   r_  )r   r   r  tensorrH   rm   compiler  r  r  rl   )r  r  r  r  r  r  r  r  r   r   r  r   r   rM   r  r   r  r+  r(  rF  s   `` `   ````     ` @@r+   (get_cute_dsl_compiled_masked_gemm_kernelr.  
  s0   (x
U\ 23x
	dl	x
 x
 x
 x
 x
 x
 x
 x
 x
 x
t \" 	
 	
 	
a	
a	
 a	
 a		

 G	
 G	
 G	
 X	
 X	
 G	
 $	
 $	
 &	
 .-	
 X	
  "z!	
& 
	4	 	 '( 	$&&)  F< 0437& &l&l& & 	&
 #\& !&& u|,& #5<0& & & & & & & & & & &P r-   )r?   r  lhsrhsoutr>   r?   c          
         | \  }}|\  }}|}|j         \  }}}|j         \  }}}|dk    r|dz  }|
                    dd          }|
                    dd          }|	t          |j                  }	|
                    dd          }|
                    d	d          }t	          |
          d
k    sJ d|
             t          |j                  \  }}|dk    r|d
k    rt          d           t          di d|d|d|d|dddddddt          |          dt          |          dt          |          d	|dnt          |          d|d|d|d|	dd| | d|du||||||||          S )aL  
    Executes a masked, batched matrix multiplication (GEMM) with scale factors and optional alpha scaling at output.

    Args:
        lhs (Tuple[torch.Tensor, torch.Tensor]): Tuple containing the left-hand side input tensor (A) and its scale factor tensor (SFA).
            - A should be in (m, k, l) order, but physically (l, m, k). For fp4 tensor with 8-bit storage, we expect the shape to be (m, k/2, l).
            - SFA should be in (m32, m4, rm, k4, rk, l) order, but physically (l, rm, rk, m32, m4, k4)
        rhs (Tuple[torch.Tensor, torch.Tensor]): Tuple containing the right-hand side input tensor (B) and its scale factor tensor (SFB).
            - B should be in (n, k, l) order, but physically (l, n, k). For fp4 tensor with 8-bit storage, we expect the shape to be (n, k/2, l).
            - SFB should be in (n32, n4, rn, k4, rk, l) order, but physically (l, rn, rk, n32, n4, k4)
        out (torch.Tensor): Output tensor to store the result, with shape (l, m, n).
        masked_m (torch.Tensor): 1D tensor of shape (l,) specifying the valid row count for each batch (used for masking).
        ab_dtype (str): Data type for A and B matrices. Supported: "float4_e2m1fn", "float8_e4m3fn", "float8_e5m2".
        sf_dtype (str): Data type for scale factors. Supported: "float8_e8m0fnu", "float8_e4m3fn".
        c_dtype (str): Data type for output matrix C. Supported: "float16", "bfloat16", "float32", "float8_e4m3fn", "float8_e5m2".
        sf_vec_size (int): Vector size for scale factors. Typically 16 or 32.
        sm_count (int, optional): Number of SMs to use. Default: max available SMs under the CTA configuration.
        mma_tiler_mn (Tuple[int, int], optional): Shape of the MMA tiler (M, N). Default: (128, 128).
        cluster_shape_mn (Tuple[int, int], optional): Shape of the CTA cluster (ClusterM, ClusterN). Default: (1, 1).
        alpha_dtype (str, optional): Data type for alpha scaling factors.
        alpha (torch.Tensor, optional): Optional 1D tensor of shape (l,) containing per-batch scaling factors. Perform per-batch scaling out = alpha * out.

    Notes:
        - Legends of the input tensors:
            * `l` is the batch size, `m/n` is the number of rows, and `k` is the number of columns.
            * `m/n32`, `m/n4`, `k4` are constant values 32, 4, 4 respectively.
            * `m32 * m4 * rm` should be same as `M`, which is `m` padded up to the nearest multiple of 128.
            * `n32 * n4 * rn` should be same as `N`, which is `n` padded up to the nearest multiple of 128.
            * `k4 * rk` should be same as `K`, which is `k / sf_vec_size` padded up to the nearest multiple of 4.
        - The function applies masking per batch using masked_m.
        - If alpha is provided, each batch output is multiplied by its corresponding alpha value. out = alpha * (A @ B).
        - The result is written to c_tensor.
    float4_e2m1fnrD   r   )r   r   rM   )r   r   Nrh  r  r   zUnsupported kwargs:    z,SM110 is not supported for cute-dsl backend.r  r  r  r  r  r  r  r  r   r   r   r  r   sm_r  )r   r!  r"  r#  r$  r%  r&  r'  rp   )	rJ   popr   r  rY   r   rG   r.  r   )r/  r0  r1  r>   r  r   r   r   r?   r  kwargsa_torch	sfa_torchb_torch	sfb_torchc_torchr  r  r  r  r   r   rM   rh  r  majorminors                              r+   grouped_gemm_nt_maskedr?    sY   b GYGYGmGAq!mGAq!?"" E::nj99Lzz"4f==gn--JJw%%E**]D11Kv;;!<F<<)'.99LE5{{uzzGHHH3   
!
! ! !	
    #8,,, #8,,, "'*** "MDD/@/M/M  K "\ *)   (''''!" 'd22#&   $*5   r-   )Mtypingr   r   r   r   cuda.bindings.driverbindingsdriverr  r   cutlass.cuterH   cutlass.pipeliner  cutlass.torchr  r  cutlass.utilsr   cutlass.utils.blackwell_helpersblackwell_helpersr    cutlass.utils.blockscaled_layoutblockscaled_layoutr   	functoolscutlass._mlirr   cutlass.cute.nvgpur   r	   cutlass.cute.runtimer
   cutlass.cutlass_dslr   r   r   r   r   r   r   r   r   cutlass._mlir.dialectsr   flashinfer.utilsr   flashinfer.api_loggingr   .cutlass.utils.static_persistent_tile_schedulerr   r   r   r   r   r   r   r  r,   r0   r;   r=   rr   r   r   rl   r  r  r  r  cachern   r  r  r  r.  r?  rp   r-   r+   <module>rV     s,  : 0 / / / / / / / / / / / # # # # # # # # #        # # # # # # % % % % % %       5 5 5 5 5 5 5 5 5 < < < < < < < < <            / / / / / / / / , , , , , ,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ( ' ' ' ' ' 3 3 3 3 3 3 1 1 1 1 1 1 G G G G G G R R R R R R R R R R R R ! ! ! ! ! ! ! ! 
 >Bt   6 %  PV     04 3 3 36 3% 3% 3 3 3 3 @D   E % RW    "HA HA HA HA HA HA HA HAVY( Y( Y( Y( Y( Y( Y( Y(xEPA A A A A A A AH> <;<;< < < 
< :;: : : 
:O@ O@ O@dm
 m
 m
 m
 m
 m
 m
 m
` L
L
L L 	L
 L L L 7?#L 7?#L '/"L $w/0L L S/L CHoL L  !L" #L$ %L L L L^  +/"e e e	u|U\)	*e	u|U\)	*e 
e l	e e e e e %,'e sme e e e e er-   