
    )`itG                    f   d dl mZmZmZmZ d dlZd dlmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmc mZ d dlmc mZ d dlmZmZ d dlmZmZmZmZmZmZmZ ddeddfdZd dl m!Z! d d	l"m#Z# d d
l$m%Z%m&Z&m'Z' ej(        ddddededede)ddf
d            Z*	 ddededdfdZ+	  G d d          Z,dS )    )OptionalTupleTypeUnionN)cpasynctcgen05)PointerInt32Float16BFloat16Float32Float8E4M3FN
Float8E5M2lock_ptrreturnc                 4    t          j        | ||           dS )zE
    arrive a spin lock when the lock_ptr is a multimem address.
    locipN)distributedmultimem_red_relaxed_gpu_add1)r   r   r   s      /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/cute_dsl/gemm_allreduce_two_shot.pyspin_lock_multimem_arriver      s"     -hCBGGGGGG    )nvvm)T)MemOrderKindMemScopeKindAtomicOpKindr   expected_val	reset_valscopec                   |dk    rd}||k    rt          j        t          j                    t          j        | j        t          |                              ||          t          |                              ||          t          j
        t          j        ||	  	        }||k    dS dS |dk    rd}||k    rt          j        t          j                    t          j        | j        t          |                              ||          t          |                              ||          t          j
        t          j        ||	  	        }||k    dS dS dS )z|
    wait on a spin lock until the expected count is reached. Reset flag to reset_val if the expected count is reached.
    gpur   r   )b	mem_order	syncscoper   r   sysN)r   	atomicrmwr   i32r   CASllvm_ptrr
   ir_valuer   ACQUIREr   GPUSYS)r   r    r!   r"   r   r   results          r   spin_lock_atom_cas_acquire_waitr2   *   sc    ~~$$^ !i  ))cb)99%%..32.>>&.&*
 
 
F $$$$$$ 
%$$^ !i  ))cb)99%%..32.>>&.&*
 
 
F $$$$ 
$$r   barrier
barrier_mcc                 z   t           j                                        \  }}}t           j                                        \  }}	}
|||z  z   ||z  |	z  z   }t	          j        ||z   ||           t           j                            t           j        j        j                   t          | |z   |dd||           dS )z'
    barrier for inter-gpu sm-wise
    r   r   r(   )r    r!   r"   r   r   N)
cutearch	block_idxgrid_dimr   multimem_red_release_sys_add1fence_proxy	ProxyKindaliasr2   )r3   r4   	num_ranksr   r   bidxbidybidzbdimxbdimy_pids               r   "sm_wise_inter_gpu_multimem_barrierrF   U   s     y**,,D$i((**OE5!

uu 4
4C-j3.>CBOOOOI$)-3444 $#I%SUW     r   c            #          e Zd ZdZ	 	 dKdeej                 dedee	e	f         dee	e	f         def
d	Z
d
 Zd Zej        d dddfdej        dej        dej        dej        dej        dej        dej        dej        fd            Zej        dej        dej        dej        dej        dej        deej                 dej        dej        dej        d ej        d!eej        ej        df         d"ej        d#ej        dej        dej        dej        dej        f"d$            Zd%ej        d&ej        d'ej        d"ej        deej        ef         d(eej        ej        ej        f         fd)Zd*ej        d+ej        d%ej        d,ej        d(eej        ej        ej        f         f
d-Z d%ej        d.eej        ej        f         d'ej        d"ej        d,ej        d(eej        ej        ej        f         fd/Z!e"dej        d0ee	e	e	f         d1eej                 d2eej                 d"ej        d3eej                 d4ej#        d5e	d6e	ded(ee	e	e	f         fd7            Z$e"dej        d8ee	e	e	f         dee	e	f         dej        d(eej        ee	e	e	f         f         f
d9            Z%e"dej        d:ee	e	e	f         d;e	d(e	fd<            Z&e"	 dLd=eej                 deej                 d3eej                 d>e'd(ef
d?            Z(e"dedee	e	f         dee	e	f         d(efd@            Z)e"	 dLdAe	dBe	dCe	dDe	d=eej                 d3eej                 dEe'dFe'dGe'd>e'd(efdH            Z*e"dededAe	dBe	dee	e	f         d(efdI            Z+e"	 dLd=eej                 deej                 d3eej                 dedee	e	f         dee	e	f         dedAe	dBe	dCe	dDe	dEe'dFe'dGe'd>e'd(ef dJ            Z,dS )MPersistentDenseGemmKernelag	  This class implements batched matrix multiplication (C = A x B) with support for various data types
    and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.

    :param acc_dtype: Data type for accumulation during computation
    :type acc_dtype: type[cutlass.Numeric]
    :param use_2cta_instrs: Whether to use CTA group 2 for advanced thread cooperation
    :type use_2cta_instrs: bool
    :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N)
    :type mma_tiler_mn: Tuple[int, int]
    :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing
    :type cluster_shape_mn: Tuple[int, int]
    :param use_tma_store: Whether to use Tensor Memory Access (TMA) for storing results
    :type use_tma_store: bool
    :param all_reduce: All-reduce mode, can be "none", "two_shot"
    :type all_reduce: str

    :note: In current version, A and B tensor must have the same data type
        - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported

    :note: Supported A/B data types:
        - TFloat32
        - Float16/BFloat16
        - Int8/Uint8
        - Float8E4M3FN/Float8E5M2

    :note: Supported accumulator data types:
        - Float32 (for all floating point A/B data types)
        - Float16 (only for fp16 and fp8 A/B data types)
        - Int32 (only for uint8/int8 A/B data types)

    :note: Supported C data types:
        - Float32 (for float32 and int32 accumulator data types)
        - Int32 (for float32 and int32 accumulator data types)
        - Float16/BFloat16 (for fp16 and fp8 accumulator data types)
        - Int8/Uint8 (for uint8/int8 accumulator data types)
        - Float8E4M3FN/Float8E5M2 (for float32 accumulator data types)

    :note: Constraints:
        - MMA tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True)
        - MMA tiler N must be 32-256, step 32
        - Cluster shape M must be multiple of 2 if use_2cta_instrs=True
        - Cluster shape M/N must be positive and power of 2, total cluster size <= 16

    Example:
        >>> gemm = PersistentDenseGemmKernel(
        ...     acc_dtype=cutlass.Float32,
        ...     use_2cta_instrs=True,
        ...     mma_tiler_mn=(128, 128),
        ...     cluster_shape_mn=(2, 2)
        ... )
        >>> gemm(a_tensor, b_tensor, c_tensor, max_active_clusters, stream)
    nonesm_100	acc_dtypeuse_2cta_instrsmma_tiler_mncluster_shape_mnuse_tma_storec                    || _         || _        || _        || _        g |dR | _        || _        |rt          j        j        nt          j        j	        | _
        || _        d| _        d| _        d| _        d| _        d| _        d| _        |dk    r|| _        d| _        dt#          | j        | j        g| j        | j        R           z  | _        d	| _        d| _        d
| _        d| _        t/          j        |          | _        d| _        d	| _        |dk    rHt8          j                                        | _        t8          j                                        | _        dS dS )ar  Initializes the configuration for a Blackwell dense GEMM kernel.

        This configuration includes several key aspects:

        1.  MMA Instruction Settings (tcgen05):
            - acc_dtype: Data types for MMA accumulator.
            - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler.
            - use_2cta_instrs: Boolean indicating if the tcgen05 MMA variant
              with cta_group=2 should be used.

        2.  Cluster Shape:
            - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster.

        3. Output C tensor store mode:
            - use_tma_store: Boolean indicating whether to use Tensor Memory Access (TMA) for storing results.

        :param acc_dtype: Data type of the accumulator.
        :type acc_dtype: type[cutlass.Numeric]
        :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction.
        :type mma_tiler_mn: Tuple[int, int]
        :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant.
        :type use_2cta_instrs: bool
        :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster.
        :type cluster_shape_mn: Tuple[int, int]
        :param use_tma_store: Use Tensor Memory Access (TMA) or normal store for output C tensor.
        :type use_tma_store: bool
        :param all_reduce: All-reduce mode, can be "none", "two_shot"
        :type all_reduce: str
           )r   rQ                rI   )         	       r   rR   rS   N) rK   rL   rN   rM   	mma_tilerrO   r   CtaGroupTWOONE	cta_group
all_reduce	occupancyepilog_warp_idmma_warp_idtma_warp_idall_reduce_warp_idlenthreads_per_ctacta_sync_bar_idepilog_sync_bar_idtmem_ptr_sync_bar_idall_reduce_sync_bar_idutilsget_smem_capacity_in_bytessmem_capacityr>   rank_idtorchr   get_world_sizeget_rank)selfrK   rL   rM   rN   rO   ra   
sm_versions           r   __init__z"PersistentDenseGemmKernel.__init__   s   P 1:. 0(+<+++* %4MG  9I9M 	 %
 35 (DO&2D#!C   $ (	 %
 %
  
  !"#$%!&'#"=jII".==??DN ,5577DLLL  r   c                     | j         \  }}|| j        rdndz  dvrdS | j        d         | j        rdndz  dk    rdS | j        d         dk    r| j        d         dk    rdS dS )NrR   rQ   @      Fr   rT   T)mma_tile_shape_mnrL   rN   )rt   mma_mmma_ns      r   is_validz"PersistentDenseGemmKernel.is_validH  s    -u4/6aaQ7	II5 #D,@'GqqaHAMM5 #q((T-B1-E-J-J5tr   c                    t          j        | j        | j        | j        | j        | j        | j        dd                   }t          j	        |j
        dg          }d}| j        d         | j        d         ||z  f| _        | j        d         t          j	        |j        j                  z  | j        d         | j        d         f| _        t          j        t          j        g | j        dR           |j        j        f          | _        t          j	        | j        j        d                   | _        t          j	        | j        j        d                   | _        | j        dk    | _        | j        dk    | _        t-          j        | j                  r1t          j        | j        | j        | j        | j                  | _        n| j        dd         | _        |                     || j        | j        | j        | j        | j        | j        | j         | j!        | j        
  
        \  | _"        | _#        | _$        t          j%        || j        | j        | j#                  | _&        t          j'        || j        | j        | j#                  | _(        | j        r+t          j)        | j        | j        | j        | j$                  nd| _*        | +                    || j        | j"                  | _,        dS )a2  Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B
        - Computing epilogue subtile
        - Setting up A/B/C stage counts in shared memory
        - Computing A/B/C shared memory layout
        - Computing tensor memory allocation columns
        NrR   moderT   r   rQ   )-sm100_utilsmake_trivial_tiled_mmaa_dtypea_major_modeb_major_moderK   r`   r\   r6   size	shape_mnkthr_idshapecta_tile_shape_mnktiled_dividemake_layoutrN   cluster_layout_vmnknum_mcast_ctas_anum_mcast_ctas_b
is_a_mcast
is_b_mcastcutlass
const_exprrO   compute_epilogue_tile_shaperL   c_layoutc_dtypeepi_tile_compute_stagesb_dtypero   rb   num_acc_stagenum_ab_stagenum_c_stagemake_smem_layout_aa_smem_layout_stagedmake_smem_layout_bb_smem_layout_stagedmake_smem_layout_epic_smem_layout_staged_compute_num_tmem_alloc_colsnum_tmem_alloc_cols)rt   	tiled_mmamma_inst_shape_kmma_inst_tile_ks       r   _setup_attributesz+PersistentDenseGemmKernel._setup_attributesR  s     6LNNN2A2
 
	  9Y%8sCCCN1N1.
 N19+;+A!B!BBN1N1#
 $(#48t48a8899#%$
 $
  !%	$*B*H*K L L $	$*B*H*K L L/!3/!3 d011 	8'C'$	 DMM !3BQB7DM CGBVBVNLLMLMNC
 C
?D-t/? %0$BNL	%
 %
! %0$BNL	%
 %
! !K, 	    	! $(#D#Dt~t'9$
 $
   r   c                     | S )NrV   xs    r   <lambda>z"PersistentDenseGemmKernel.<lambda>  s    1 r   Nar%   cmax_active_clustersepilogue_opc_mcbarrier_flagbarrier_flag_mcc
                 R    |j          _        |j          _        |j          _        t          j                            |                                           _        t          j                            |                                           _	        t          j                            |           _
        t          j         j         j        k              rt          d j         d j                                                     t          j         j         j         j	         j         j         j        dd                   }
t)          j        |
j        j                  }t          j         j        |
j                  }t)          j         j        d          }t(          j                            ||| j        |
 j        j        |j         t          j        u rt          j         nd          \  }}t          j!         j        |
j                  }t)          j         j"        d          }t(          j        #                    ||| j        |
 j        j        |j         t          j        u rt          j         nd          \  }}t)          j$         j        |          }t)          j$         j        |          }||z   |z   _%        d}d}t          j         j&                  rJt)          j         j'        d          }tQ          j)        tQ          j*                    || j+                  \  }} ,                    | j-         j        |          \   _.        }d _/         j&        rt)          j0         j'        j1                  nd	t(          j2         G  fd
d                      }| _3         4                    |
||||| j&        r|n| j         j         j"         j'         j+         j.        ||||	          5                    | j6        ddgg  j        dR |           dS )a  Execute the GEMM operation in steps:
        - Setup static attributes before smem/grid/tma computation
        - Setup TMA load/store atoms and tensors
        - Compute grid size with regard to hardware constraints
        - Define shared storage for kernel
        - Launch the kernel synchronously

        :param a: Input tensor A
        :type a: cute.Tensor
        :param b: Input tensor B
        :type b: cute.Tensor
        :param c: Output tensor C
        :type c: cute.Tensor
        :param c_mc: Output symmetric tensor C_mc, any write or read to a multicast tensor will be broadcasted to all GPUs
        :type c_mc: cute.Tensor
        :param max_active_clusters: Maximum number of active clusters
        :type max_active_clusters: cutlass.Constexpr
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        :param epilogue_op: Optional elementwise lambda function to apply to the output tensor
        :type epilogue_op: cutlass.Constexpr
        :raises TypeError: If input data types are incompatible with the MMA instruction.
        :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled.
        zType must match: z != NrR   )NNNr   )internal_typeNNr      r   c                      e Zd ZU ej        j        ej        j        f         e	d<   ej        j        ej        j        f         e	d<   ej        j        ej        j
        f         e	d<   ej        j        ej        j
        f         e	d<   ej        e	d<   ej        e	d<   ej        j        ej        j        j         f         j        f         e	d<   ej        j        ej        j        j         ej        j        j                  f         j        f         e	d<   ej        j        ej        j        j         ej        j        j                  f         j        f         e	d	<   d
S )9PersistentDenseGemmKernel.__call__.<locals>.SharedStorageab_full_mbar_ptrab_empty_mbar_ptracc_full_mbar_ptracc_empty_mbar_ptrtmem_dealloc_mbar_ptrtmem_holding_bufsCsAsBN)__name__
__module____qualname__r6   structMemRanger   Int64r   __annotations__r   r
   Alignr   buffer_align_bytesr   cosizer   outerr   r   )c_smem_sizert   s   r   SharedStorager   6  s        "k27=$BS3STTTT#{3GM4CT4TUUUU#{3GM4CU4UVVVV $ 4W]DDV5V WWWW#*=000%m+++!$L! '	)    !$L+$+d.G.M"N"NN ')    !$L+$+d.G.M"N"NN ')     r   r   rQ   )gridblockclusterstream)7element_typer   r   r   rm   
LayoutEnumfrom_tensormma_major_moder   r   r   r   r   	TypeErrorr   r   r   rK   r`   r\   r6   r   r   r   cluster_shape_to_tma_atom_ArN   slice_r   nvgpumake_tiled_tma_atom_Ar   r   TFloat32cluster_shape_to_tma_atom_Br   make_tiled_tma_atom_Bsize_in_bytesnum_tma_load_bytesrO   r   r   make_tiled_tma_atomCopyBulkTensorTileS2GOpr   _compute_gridr   tile_sched_paramsr   r   r   r   shared_storagekernellaunchrh   )rt   r   r%   r   r   r   r   r   r   r   r   atom_thr_sizea_opa_smem_layout
tma_atom_atma_tensor_ab_opb_smem_layout
tma_atom_btma_tensor_ba_copy_sizeb_copy_size
tma_atom_ctma_tensor_cepi_smem_layoutr   r   r   s   `                          @r   __call__z"PersistentDenseGemmKernel.__call__  s>   L /0n./n./n!,88;;JJLL!,88;;JJLL(44Q77 dldl:;; 	RPPP$,PPQQQ 	   6LNNN2A2
 
	 	)"2"899 6!9#3
 
 D$=?TUU#':#C#CN$*$%Ngo$E$E  4 $D 
$
 
$
 
L 6!9#3
 
 D$=?TUU#':#C#CN$*$%Ngo$E$E  4 $D 
$
 
$
 
L (}EE(}EE#.#<"M 
d011 	"k$*C_UUO'.'B/11	( ($J (,'9'9t&(=?R(
 (
$ #' =A<NUDK17888TU 	
 
	 	 	 	 	 	 	 	 	 	 
	< , 	 .5LLA$%%%M"#	
 	
$ &'A./d+/Q//	  
 
 
 	r   r   r   mA_mklr   mB_nklr   mC_mnlr   r   r   r   r   r   c                 
;   t           j                                        }t           j                            |          }|| j        k    rUt          j        |           t          j        |           t          j        | j	                  rt          j        |           t          j
        |j        j                  dk    }t           j                                        \  }}}|t          j
        |j        j                  z  }|dk    }t           j                            t           j                                                  }|                    |          }t           j                                        \  }}}t#          j                    }|                    | j                  }|j        }|j        } t/          j        t.          j        j                  }!| j        | j        z   dz
  }"t/          j        t.          j        j        |"          }#t.          j                            |j                                         | j!        |!|#| j"        |          }$t/          j        t.          j        j                  }%tG          | j$                  |rdndz  }&t/          j        t.          j        j        |&          }'t.          j%                            |j&                                         | j'        |%|'|          }(|rc|| j        k    rXd})t           j        (                                5  t           j        )                    ||)           ddd           n# 1 swxY w Y   t           j        *                                 t          j
        | j+                  dk    rt           j        ,                                 | j	        r&|j-        .                    |j/        |j0                  nd}*|j1        .                    |	j/        |	j0                  }+|j2        .                    |
j/        |
j0                  },d}-d}.t          j        | j3        p| j4        p|          r.t          j5        ||d	          }-t          j5        ||d	          }.t          j6        |t          j7        | j8        d
          d          }/t          j6        |t          j7        | j8        d          d          }0t          j6        |t          j7        | j8        d          d          }1t          j
        |/dg          }2|9                    |          }3|3:                    |/          }4|3;                    |0          }5|3<                    |1          }6t          j=        t          j7        |d          j                  }7t          j>        ||d         |7t          j?        |+dd          t          j?        |4dd                    \  }8}9t          j=        t          j7        |d          j                  }:t          j>        ||d         |:t          j?        |,dd          t          j?        |5dd                    \  };}<|@                    |+          }=|A                    |,          }>|B                    | j8        dd                   }?|C                    t          jD        |?| j'                            }@t          j
        | j+                  dk    rt           j        E                                 n+t           j        F                    | jG        | jH                   || j        k    rt"          jI                            |t           j                                        t           j        J                                          }A|AK                                }Bt/          jL        t.          jM        jN        | j!                  }C|BjO        r|BjP        }D|Dd         t          j
        |j        j                  z  |Dd         |Dd         f}E|9d|Ed         d|Ed         f         }F|<d|Ed         d|Ed         f         }G|CQ                                 t          jR        d          }H|CjS        |2k     r|$T                    |C          }Ht          jU        d|2dd          D ]}I|$V                    |C|H           t          jW        ||Fd|CjS        f         |8d|CjX        f         |$Y                    |C          |-           t          jW        ||Gd|CjS        f         |;d|CjX        f         |$Y                    |C          |.           |CZ                                 t          jR        d          }H|CjS        |2k     r|$T                    |C          }H|A[                                 |A\                                }B|BjO        |$]                    |C           || j^        k    rdtG          | j^        g| j$        R           z  }Jt           j        F                    | j_        |J           t           j        `                    | ja        d|           }Kt          jb        |K|@jc                  }Lt"          jI                            |t           j                                        t           j        J                                          }A|AK                                }Bt/          jL        t.          jM        jd        | j!                  }Mt/          jL        t.          jM        jN        | j'                  }N|BjO        r<|BjP        }D|Dd         t          j
        |j        j                  z  |Dd         |Dd         f}E|Lddd|NjX        f         }O|MQ                                 t          jR        d          }P|MjS        |2k     r|r|$e                    |M          }P|r|(V                    |N           |f                    t          jh        ji        d           t          |2          D ]}I|r|$j                    |M|P           t          j
        |=dg          }Qt          jU        |Qd          D ]V}Rdd|R|MjX        f}St          jk        ||O|=|S         |>|S         |O           |f                    t          jh        ji        d           W|$l                    |M           |MZ                                 t          jR        d          }P|MjS        |2k     r|r|$e                    |M          }P|r|(m                    |N           |NZ                                 |A[                                 |A\                                }B|BjO        <|(]                    |N           || j^        k     	rD|| j$        d         k    r't           j        n                    | jo        | |           dtG          | j^        g| j$        R           z  }Jt           j        F                    | j_        |J           t           j        `                    | ja        d|           }Kt          jb        |K|@jc                  }L|}T| p                    |T|L|6||          \  }U}V}Wd}Xd}Yd}Zd}[d}\d}]d}^d}_t          j        | j	                  rYt          jq        |Wj        | jr                  }X| s                    |U|X|T|*          \  }Y}[}\| t                    |T||6||*          \  }}]}^n| t                    |T|U|6||*          \  }Z}X}_t"          jI                            |t           j                                        t           j        J                                          }A|AK                                }Bt/          jL        t.          jM        jd        | j'                  }`d}at          j        | j	                  rut/          j        t.          j        j        dtG          | j$                  z  dtG          | j$                  z            }bt.          ju                            | jv        |b          }a|BjO        r|BjP        }D|Dd         t          j
        |j        j                  z  |Dd         |Dd         f}Ed}cd}dt          j        | j	                  r|^dddg|ER          }cn|_dddddg|ER          }d|Vddddd|`jX        f         }e|(j                    |`           t          j?        |edt          jw        |e                    }et          j        | j	                  r)t          j?        |cdt          jw        |c                    }cn(t          j?        |ddt          jw        |d                    }dt          j
        |ej        dg          }f|Ajx        |fz  }gt          jU        |f          D ]D}h|eddd|hf         }it          jW        |U|i|W           t          j        | j	                  r|Yy                    |W          z                                }j ||j{                    | jr                            }j|[|                    |j           |g|hz   | jv        z  }kt          jW        |Y|[|\ddd|kf                    t           j        }                    t           j        j~        j        t           j        j        j                   dtG          | j$                  z  }lt           j        F                    | j        |l           || j$        d         k    rNt          jW        ||]d|kf         |cd|hf                    |am                                 |aV                                 t           j        F                    | j        |l           |Wz                                }j ||j{                    | jr                            }j|X|                    |j           t          jW        |Z|X|dddd|hf                    Ft           j        (                                5  |(l                    |`           ddd           n# 1 swxY w Y   |`Z                                 t          j        | j        dk              r!t	          |Aj        t          j
        | j+                  z  t           j                                        z             }m|| j$        d         k    rt           j                            dd           t           j        (                                5  |j        |mz   }nt           j                                         t          |n           t           j        }                    t           j        j~        j                   ddd           n# 1 swxY w Y   |A[                                 |A\                                }B|BjO        || j$        d         k    r t           j                            |           dtG          | j$                  z  }lt           j        F                    | j        |l           || j$        d         k    rl|rCt           j                            ||dz             t           j                            |d           t           j                            |K| jo        |           t          j        | j	                  r|a]                                 t          j        | j        dk              rz|| j        d         k    rj| j        }ot	          | j                  }pt           j                                        }qt"          jI                            |t           j                                        t           j        J                                          }A|AK                                }Bd|j        j        z  }r| j8        d         |rz  }stG          | j                  t           j        j        |sz  z  }tt          j=        |t|sf|sdf           }ut          j=        d|rf|rdf           }vt          j        t           j                                        |j                  }wt          j        |w|u|v          }x|x9                    || j        d         dz  z
            }y|BjO        r|BjP        }Dt	          |Aj        t          j
        | j+                  z  t           j                                        z             }m|Dd         t          j
        |j        j                  z  |Dd         |Dd         f}E|| j        d         k    rYt           j        (                                5  |j        |mz   }nt5          j        |n|pdd!"           ddd           n# 1 swxY w Y   t           j        F                    | j        dtG          | j                  z             t          j6        |t          j7        | j8        d          d          }z|3<                    |z          }{|{d#ddg|ER          }|| j8        d         t          j
        |j        j                  z  }}t;          |}| j        z            }~t          j        |||~| j8        d         f          }t          j7        |d#|odff          }|y                    |          }|j        \  }}}t          j        |          D ]}t          j        |          D ]h}|d||f         j        }d$\  }}}}t          j        | jr        tB          k              rt5          j        |          \  }}}}nt          j        | jr        tF          k              rt5          j        |          \  }}}}nt          j        | jr        tJ          k              rt5          j        |          \  }}}}n{t          j        | jr        tN          k              rt5          j        |          \  }}}}n=t          j        | jr        tR          k              rt5          j        |          \  }}}}t5          j        |||||           j|A[                                 |A\                                }B|BjO        t           j        F                    | j        dtG          | j                  z             t          j
        |Aj        j                  t          j
        | j+                  z  }|| j        d         k    rdt           j        (                                5  t]          |j        |z   |j        |z   | j                   ddd           dS # 1 swxY w Y   dS dS dS dS )%zW
        GPU device kernel performing the Persistent batched GEMM computation.
        rR   r   rQ   )barrier_storage
num_stagesproducer_groupconsumer_grouptx_countcta_layout_vmnk)r   r   r   r   r  r[   N)swizzle)
mcast_mode)Nr   NNNN)r   NNr   rS   r   )r   r   Nr   )r   Nr   r   )
barrier_idnumber_of_threads)unroll)tma_bar_ptr
mcast_mask   )	alignmentptr_to_buffer_holding_addrFT)unroll_full)
is_two_cta)r   r   )spacetwo_shot)readrz   )strider$   )r    r!   r"   NN)r   r   r   r   )r6   r7   warp_idxmake_warp_uniformre   r   prefetch_descriptorr   r   rO   r   r   r   r8   block_idx_in_clusterget_flat_coord
thread_idxrm   SmemAllocatorallocater   r   r   pipelineCooperativeGroupAgentThreadr   r   PipelineTmaUmmacreater   data_ptrr   r   rg   rc   PipelineUmmaAsyncr   r   	elect_onembarrier_initmbarrier_init_fencerN   cluster_arrive_relaxedr   
get_tensorr   innerr   r   r   r   create_tma_multicast_mask
local_tiler   r\   	get_slicepartition_Apartition_Bpartition_Cr   tma_partitiongroup_modesmake_fragment_Amake_fragment_Bpartition_shape_Cmake_fragment_Cappendcluster_waitr3   ri   rh   StaticPersistentTileSchedulerr9   initial_work_tile_infomake_pipeline_statePipelineUserTypeProduceris_valid_tiletile_idxreset_countBooleancountproducer_try_acquirerangeproducer_acquirecopyindexproducer_get_barrieradvanceadvance_to_next_workget_current_workproducer_tailrd   rk   retrieve_tmem_ptrrK   make_tensorlayoutConsumerconsumer_try_waitsetr   Field
ACCUMULATEconsumer_waitgemmconsumer_releaseproducer_commit
alloc_tmemr   epilog_tmem_copy_and_partitionmake_fragmentr   epilog_smem_copy_and_partitionepilog_gmem_copy_and_partitionPipelineTmaStorer   ranknum_tiles_executedretileloadtostorer;   r<   async_sharedSharedSpace
shared_ctarj   ra   r
   _current_work_linear_idxcp_async_bulk_wait_groupiteratorfence_acq_rel_gpur   r=   relinquish_tmem_alloc_permitmbarrier_arrivembarrier_waitdealloc_tmemrf   rp   r>   lane_idxr   width	WARP_SIZEmake_copy_atomr   CopyUniversalOpmake_tiled_copy_tvr   spin_lock_atom_cas_relaxed_waitrl   intzipped_dividepartition_Srange_constexprr   multimem_ld_reduce_8xf16r   multimem_ld_reduce_4xf32r   multimem_ld_reduce_8xbf16r   multimem_ld_reduce_16xe4m3r   multimem_ld_reduce_16xe5m2multimem_st_4xb32paramsproblem_layout_ncluster_mnlrF   )rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  rL   r?   r@   rA   mma_tile_coord_vis_leader_ctacta_rank_in_clusterblock_in_cluster_coord_vmnktidxrD   smemstorager   r   ab_pipeline_producer_groupnum_tma_producerab_pipeline_consumer_groupab_pipelineacc_pipeline_producer_groupnum_acc_consumer_threadsacc_pipeline_consumer_groupacc_pipelinenum_tmem_dealloc_threadsr   r   r   a_full_mcast_maskb_full_mcast_maskgA_mklgB_nklgC_mnl
k_tile_cntthr_mmatCgAtCgBtCgCa_cta_layouttAsAtAgAb_cta_layouttBsBtBgBtCrAtCrB	acc_shapetCtAcc_fake
tile_sched	work_tileab_producer_statecur_tile_coordmma_tile_coord_mnl
tAgA_slice
tBgB_slicepeek_ab_empty_statusk_tiletmem_ptr_read_threadstmem_ptrtCtAcc_baseab_consumer_stateacc_producer_statetCtAccpeek_ab_full_statusnum_kblocks
kblock_idxkblock_coordepi_tidxtiled_copy_t2rtTR_tAcc_basetTR_rAcctTR_rCtiled_copy_r2s	simt_atomtRS_rCtRS_sCbSG_sCbSG_gC_partitionedtTR_gC_partitionedacc_consumer_state
c_pipelinec_producer_groupbSG_gCtTR_gCtTR_tAccsubtile_cntnum_prev_subtilessubtile_idxtTR_tAcc_mnacc_vecc_bufferepilog_threadstile_idflagrp   r>   lane_idatom_val
atom_thr_n
atom_thr_m
thr_layout
val_layoutcopy_atom_loadtiled_copy_fakethr_copy_fakegC_mctCgC_mctCgC_mc_slicecta_mma_tile_mm_local_ranktCgC_mc_slice_partitionedtCgC_mc_local_rankfrgC_mcatomloop_mloop_nijmc_ptrr   yzwlast_flag_idxs                                                                                                                                                r   r   z PersistentDenseGemmKernel.kernels  sB   0 9%%''9..x88
 t''''
333'
333!$"455 8+J777)I$4$:;;q@  9..00dD$)I,<,B"C"CC(A-"i99I**,,
 
 ':&H&H'
 '
# Y))++
a
 "$$-- 344 ' ="3 &.%>x~?T%U%U"043HH1L%-%>N!#3&
 &
" .55#4==??(55,/ 6 
 
 '/&?@U&V&V##&t':#;#; 'AAa$
  '/&?N!#;'
 '
#  188#5>>@@)66/ 9 
 
  	4++++-(Y((**  I++-/G                 		%%''' 9T*++a//I,,... !GJ!!$*4H4N "     	 Z"" &0D0J # 
 
 Z"" &0D0J # 
 
 ! doSSOTT 	 ' A#%@Q! ! ! !( A#%@Q! ! ! DK@@BT
 
 DK@@BT
 
 DK@@BT
 
 YvQC000

 %%&677""6**""6**""6** 'K+_==C
 

 *'*RA&&T1a((
 

d 'K+_==C
 

 *'*RA&&T1a((
 

d ((,,((,,//rr0BCC	//K	4#566
 
 9T*++a//I""$$$$I/4CW     t''' <CC!49#6#6#8#8$):L:L:N:N J #99;;I ( <)2D4E! ! ) A:!*!3"1%93C3I)J)JJ"1%"1%&" "-a0$8J18MN
 "-a0$8J18MN

 "--///'.q'9'9$$*Z77+6+K+K), ,( &mAz1QGGG  F00)+?  
 I""D*;*A#BCd$5$;<=$/$D$DEV$W$W#4    I""D*;*A#BCd$5$;<=$/$D$DEV$W$W#4    &--///+2?1+=+=((.;;/:/O/O-0 0, //111&7799	C ) A:L %%&7888
 t''' %'d.>-UAT-U-U)V)V$V!I4"7     y22+; 3  H *8[5GHHK
 <CC!49#6#6#8#8$):L:L:N:N J #99;;I ( <)2D4E! ! "*!=)2D4F" " ) U:!*!3"1%93C3I)J)JJ"1%"1%&" %dD$8J8P%QR "--///&-oa&8&8#$*Z77M7*5*G*G)+ +' ! F 112DEEE
 gm6>>>
 $J// % %F$ H#11-/B  
 '+iA3&?&?&?*1-QU*V*V*V J JJ $ $ * 1 7	,L !I ) & $\ 2 $\ 2 &   &MM'-*BDIIII $445FGGG &--///*1/!*<*<'(.;;( 2=2O2O 13 3/ ! E 001CDDD"**,,,
 //111&7799	k ) U:t &&'9::: d&&& 4.q111	$$,$. %    %'d.>-UAT-U-U)V)V$V!I4"7     y22+; 3  H *8[5GHHK
 H
 33+tX 	
 F!NIFFF!%!%!$"455 +HNDLII151T1T"FHb2 2. 77j$" 	&& 77ndHb 	& <CC!49#6#6#8#8$):L:L:N:N J #99;;I!)!=)2D4F" " J!$"455 
#+#<N)T0111T0111$ $ 
 &6==#/#3 >  

 ) X:!*!3"1%93C3I)J)JJ"1%"1%&" %d&899 /    0	 FF 0      0 	F )4tT3E3KL **+=>>>+Ha89L9LMM%d&899 L!-fa69J9JKKFF!-fa69J9JKKF
 #iaSAAA$.$AK$O!#*=#=#= @ @K #+D$k+J"KKInk8DDD)$*<== 9 #1"7"7"A"A"F"F"H"H"-+gjj.F.F"G"GW---
 %6$CtGW#W	*""D$h#?@   	-- I/<"&)"7"B .    *,c$2E.F.F)F	))'+'>.< *    $t':1'=== I * &h'7 8 &k': ;   '66888&77999	))'+'>.< *     #+--//"-+gjj.F.F"G"GW---
 	%vvtT46U/V    Y((** F F 112DEEEF F F F F F F F F F F F F F F"**,,, %do&CDD M#";)D$9::;)88::; G
  4#6q#999	::15:III!Y0022 M M#2#;g#ED I779995d;;; I11$)2E2KLLL	M M M M M M M M M M M M M M M //111&7799	q ) X:z 4.q111	66/6RRR#d&9":"::NI2n     4.q111" FI---/BQ/F   I++,A1EEE	&&d6? '    !$"455 +((***
 do;<< A	421555
 ,!$.11	),,.."@GG%ty':':'<'<di>P>P>R>R 
 '==??	 $"3"99!^A.(:
 !899I':5
 "-,j!_  
 "-q(mXqMRRR
!%!4J..00$2C" " #'"9"J
# # !0 9 94215::! !  - M>%.%7N#";)D$9::;)88::; G 'q)TYy7G7M-N-NN&q)&q)*&  4#:1#===!Y0022  #/#87#BD'G $9QV                  I%%#'#>*,s43J/K/K*K &   
 !ODNODD* E
 &11%88G$+\1a,UBT,U,U$VM &*^A%6$)!(.; ; &N $'~'F#G#GL040B%dnQ6G'H1 1- *.1L7A,3O* *&
 ,778JKKG+2=(D&&$4V<< N N!(!8!@!@ N NA%,T1aZ%8%AF)3JAq!Q&1$,'2IJJ "-8-Q$*." ."
1a ")!3DLG4K!L!L "-8-Q$*." ."
1a ")!3DLH4L!M!M "-8-R$*." ."
1a ")!3DLL4P!Q!Q "-8-S$*." ."
1a ")!3DLJ4N!O!O "-8-S$*." ."
1a (9&!Q1MMMM/N2 33555 * ; ; = =I[  - M>^ 	!!#:&(3t/F+G+G&G "   
 !%	%A! !Id344!5 t6q999,,..  :(1MA+4}D N                   yA	 A	55t :9ss   $!MMMPAQQAQQAQTA,AVVAVVAVe.$AffAf"f%Af"t=(Au2u2Au6u9Au6r  tAccr  r   c                    t          j        | j        | j        | j        | j        ||          }t          j        |d         |          }t          j	        ||d                   }|
                    |          }	|	                    |          }
t          j        |d         |          }|	                    |          }t          j        |d         j        | j                  }||
|fS )a  
        Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination).

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param tAcc: The accumulator tensor to be copied and partitioned
        :type tAcc: cute.Tensor
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param use_2cta_instrs: Whether use_2cta_instrs is enabled
        :type use_2cta_instrs: bool

        :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where:
            - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
            - tTR_tAcc: The partitioned accumulator tensor
            - tTR_rAcc: The accumulated tensor in register used to hold t2r results
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        )r  r   r   N)NNr   r   r   r  r   r   NNNNNNr   r   r   r   r   )r   get_tmem_load_opr   r   r   rK   r6   flat_divider   make_tmem_copyr-  ry  partition_Dr[  r   )rt   r  r  r  r   rL   copy_atom_t2rtAcc_epir  thr_copy_t2rr  
gC_mnl_epir  r  s                 r   rZ  z8PersistentDenseGemmKernel.epilog_tmem_copy_and_partition  s    : $4#MLN
 
 #+,
 

 !/8$9:
 
 &//55++H55 %9:H
 

 ))*55%45;T^
 
 x11r   r  r  r   c                     t          j        | j        | j        | j        |          }t          j        ||          }|                    |          }|                    |          }|	                    |          }	||	|fS )a  
        Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination).

        :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
        :type tiled_copy_t2r: cute.TiledCopy
        :param tTR_rC: The partitioned accumulator tensor
        :type tTR_rC: cute.Tensor
        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param sC: The shared memory tensor to be copied and partitioned
        :type sC: cute.Tensor
        :type sepi: cute.Tensor

        :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where:
            - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s)
            - tRS_rC: The partitioned tensor C (register source)
            - tRS_sC: The partitioned tensor C (smem destination)
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        )
r   get_smem_store_opr   r   rK   r6   make_tiled_copy_Dr-  r  ra  )
rt   r  r  r  r   copy_atom_r2sr  thr_copy_r2sr  r  s
             r   r\  z8PersistentDenseGemmKernel.epilog_smem_copy_and_partition  s    4 $5M4<
 
 /~NN%//55))"--&&v..vv--r   r  c                 B   t          j        |d         |          }t          j        | j                  r`|}t          j        |dd          }t          j        |dd          }	t          j        |dt          j        d          ||	          \  }
}||
|fS |}|	                    |          }|
                    |          }t          j        |d         j        | j                  }t          j        t           j                                        | j                  }|||fS )aK  Make tiledCopy for global memory store, then use it to:
        - partition register array (source) and global memory (destination) for none TMA store version;
        - partition shared memory (source) and global memory (destination) for TMA store version.

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param atom: The copy_atom_c to be used for TMA store version, or tiled_copy_t2r for none TMA store version
        :type atom: cute.CopyAtom or cute.TiledCopy
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param sC: The shared memory tensor to be copied and partitioned
        :type sC: cute.Tensor

        :return: A tuple containing either:
            - For TMA store: (tma_atom_c, bSG_sC, bSG_gC) where:
                - tma_atom_c: The TMA copy atom
                - bSG_sC: The partitioned shared memory tensor C
                - bSG_gC: The partitioned global tensor C
            - For non-TMA store: (simt_atom, tTR_rC, tTR_gC) where:
                - simt_atom: The SIMT copy atom
                - tTR_rC: The register tensor C
                - tTR_gC: The partitioned global tensor C
        :rtype: Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]
        r  r   rR   rQ   r  )r6   r  r   r   rO   r2  r   r1  r   r-  r  r[  r   r   rs  r   rt  )rt   r  r  r  r   r   gC_epir   sC_for_tma_partitiongC_for_tma_partitionr  r  r  r  r  r  r  s                    r   r]  z8PersistentDenseGemmKernel.epilog_gmem_copy_and_partition-  s(   F !9:H
 
 d011 	-J#'#3B1#=#= #'#3FAq#A#A  %2 ##$$ NFF vv--!N)33D99L!--f55F'89? F +DJ,F,F,H,H$,WWIff,,r   mma_tiler_mnkr   r   r   r   ro   rb   c
                    d}
|	rdnd}t          j        | ||d          }t          j        | ||d          }|	rt          j        |||d          nd}t	          j        ||          t	          j        ||          z   }d}|	rt	          j        ||          nd}||z  }||z  ||z   z
  |z  }|	r||||z  |z  z
  |||z   z  z
  ||z  z  z  }|
||fS )a  Computes the number of stages for A/B/C operands based on heuristics.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler.
        :type mma_tiler_mnk: tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param epi_tile: The epilogue tile shape.
        :type epi_tile: cute.Tile
        :param c_dtype: Data type of operand C (output).
        :type c_dtype: type[cutlass.Numeric]
        :param c_layout: Layout enum of operand C.
        :type c_layout: utils.LayoutEnum
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int
        :param use_tma_store: Whether TMA store is enabled.
        :type use_tma_store: bool

        :return: A tuple containing the computed number of stages for:
                 (ACC stages, A/B operand stages, C stages)
        :rtype: tuple[int, int, int]
        rR   r   rQ   Nr   )r   r   r   r   r6   r   )r   r  r   r   r   r   r   ro   rb   rO   r   r   a_smem_layout_stage_oneb_smem_layout_staged_onec_smem_layout_staged_oneab_bytes_per_stagembar_helpers_bytesc_bytes_per_stagec_bytesr   s                       r   r   z)PersistentDenseGemmKernel._compute_stagesm  sw   R  )/aaa #."@	#
 #
 $/#A	$
 $
  K,	    	! "/,
 
w(@AAB " Dw(@AAA 	
 $k1 Y&*<w*FG   	100<?@1G;<= //	1 1K
 lK77r   r   c                     t          j        |d          }t          j        | |          }|d         j        }g |dR }t	          j        ||          }t          j                            ||          }	||	fS )a
  Use persistent tile scheduler to compute the grid size for the output tensor C.

        :param c: The output tensor C
        :type c: cute.Tensor
        :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
        :type cta_tile_shape_mnk: tuple[int, int, int]
        :param cluster_shape_mn: Shape of each cluster in M, N dimensions.
        :type cluster_shape_mn: tuple[int, int]
        :param max_active_clusters: Maximum number of active clusters.
        :type max_active_clusters: cutlass.Constexpr

        :return: A tuple containing:
            - tile_sched_params: Parameters for the persistent tile scheduler.
            - grid: Grid shape for kernel launch.
        :rtype: Tuple[utils.PersistentTileSchedulerParams, tuple[int, int, int]]
        r   )tiler)r   r  rQ   )r6   r   rx  r   rm   PersistentTileSchedulerParamsr9  get_grid_shape)
r   r   rN   r   c_shapegcnum_ctas_mnlcluster_shape_mnlr   r   s
             r   r   z'PersistentDenseGemmKernel._compute_grid  s    . +0/BB1111282.222!?+
 
 2AA2
 
 !$&&r   r\   r   c                     |                      |dd                   }|                     t          j        ||                    }t	          j        |          }|S )a  
        Compute the number of tensor memory allocation columns.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler: The shape (M, N, K) of the MMA tile.
        :type mma_tiler: tuple[int, int, int]
        :param num_acc_stage: The stage of the accumulator tensor.
        :type num_acc_stage: int

        :return: The number of tensor memory allocation columns.
        :rtype: int
        NrR   )r5  r6  r6   r7  rm   get_num_tmem_alloc_cols)r   r\   r   r  r  r   s         r   r   z6PersistentDenseGemmKernel._compute_num_tmem_alloc_cols  sU    & //	"1">>	//I}0U0UVV#;KHH""r   ab_dtypera   c           	      B   d}| t           j        t           j        t           j        t           j        t           j        t           j        t           j        hvrd}|t           j        t           j        t           j	        hvs_|t           j        k    r%| t           j        t           j        t           j        hvs*|t           j	        k    r| t           j        t           j        hvrd}|t           j        k    r\|t           j        t           j        t           j        t           j        t           j        t           j	        t           j        t           j        hvs|t           j        k    r|t           j        t           j        hvsV|t           j	        k    rH|t           j        t           j        t           j        t           j	        t           j        t           j        hvrd}t          j
        |dk    o:|t           j        t           j        t           j        t           j        t           j        hv          rd}|S )a  
        Check if the dtypes are valid

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param acc_dtype: The data type of the accumulator
        :type acc_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]

        :return: True if the dtypes are valid, False otherwise
        :rtype: bool
        TFrI   )r   r   r   r   Uint8Int8r   r   r   r
   r   )r  rK   r   ra   r~   s        r   is_valid_dtypesz)PersistentDenseGemmKernel.is_valid_dtypes  s   ( OML 
 
 
 HgowNNNGO++OW%97;MNO OGM)) ===H(( $"	
 
 GO++  
 GM))   H&   $"

 

 	 Hr   c                 F   d}| s
|d         dv s| r
|d         dv sd}|d         t          ddd          vrd}|d         | rd	ndz  dk    rd}d
 }|d         |d         z  dk    s:|d         dk    s.|d         dk    s" ||d                   r ||d                   sd}|S )a	  
        Check if the mma tiler and cluster shape are valid

        :param use_2cta_instrs: Whether to use 2 CTA groups
        :type use_2cta_instrs: bool
        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]

        :return: True if the mma tiler and cluster shape are valid, False otherwise
        :rtype: bool
        Tr   rx   )rz      FrQ   r[   i  rR   c                 &    | dk    o| | dz
  z  dk    S )Nr   rQ   rV   r   s    r   r   zPPersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shape.<locals>.<lambda>  s    !a%">Q!a%[Q,> r   r  )rD  )rL   rM   rN   r~   is_power_of_2s        r   $is_valid_mma_tiler_and_cluster_shapez>PersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shapee  s   &  !	%1!_	%A%A &B$0Oz$A$AH?%C"4"444HA"=!!A>!CCH>>Q"21"55::"a''"a'' =!1!!455 ( =!1!!455 ( Hr   mnkla_majorb_majorc_majorc
                     d}
d } |||dk    | ||f          r( |||dk    |||f          r |||dk    | ||f          sd}
|	dk    r| dz  dk    r|dz  dk    rd}
|
S )	a  
        Check if the tensor alignment is valid

        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The number of columns in the C tensor
        :type l: int
        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param c_major: The major axis of the C tensor
        :type c_major: str

        :return: True if the problem shape is valid, False otherwise
        :rtype: bool
        Tc                 D    |rdnd}||         }d| j         z  }||z  dk    S )Nr   rQ   rz   )rq  )dtypeis_mode0_majortensor_shapemajor_mode_idxnum_major_elementsnum_contiguous_elementss         r   check_contigous_16B_alignmentzZPersistentDenseGemmKernel.is_valid_tensor_alignment.<locals>.check_contigous_16B_alignment  s9    "07QQaN!-n!=&,&;#%(??1DDr   r"  r#  FrI   rz   r   rV   )r"  r#  r$  r%  r  r   r&  r'  r(  ra   r~   r1  s               r   is_valid_tensor_alignmentz3PersistentDenseGemmKernel.is_valid_tensor_alignment  s    L 	E 	E 	E .-h3Aq	RR	007c>AqRS9UU	 10'S.1aQR)TT	
 HAGqLLQW\\Hr   c                 |    d}|d         | rdndz  |d         f}|s ||d         z  dk    r||d         z  dk    sd}|S )ah  
        Check if the epilogue store option is valid

        :param use_2cta_instrs: Whether to use 2 CTA groups
        :type use_2cta_instrs: bool
        :param use_tma_store: Whether to use TMA store
        :type use_tma_store: bool
        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]

        :return: True if the epilogue store option is valid, False otherwise
        :rtype: bool
        Tr   rR   rQ   FrV   )rL   rO   r"  r#  rM   r~   cta_tile_shape_mns          r   is_valid_epilog_store_optionz6PersistentDenseGemmKernel.is_valid_epilog_store_option  so    4  O_ ;!<O
  	!)!,,11a:KA:N6NRS6S6S r   c                 F   d}t                               | ||          sd}t                               |||          sd}t                               |||	|
| |||||
  
        sd}t                               |||||          sd}t          j                    dvr|dk    rd}|S )a  
        Check if the gemm can be implemented

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param acc_dtype: The data type of the accumulator
        :type acc_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param use_2cta_instrs: Whether to use 2 CTA groups
        :type use_2cta_instrs: bool
        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]
        :param use_tma_store: Whether to use TMA store
        :type use_tma_store: bool
        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The number of columns in the C tensor
        :type l: int
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param c_major: The major axis of the C tensor
        :type c_major: str

        :return: True if the gemm can be implemented, False otherwise
        :rtype: bool
        TF)rR   rT   rY   rI   )rH   r  r!  r2  r5  distrr   )r  rK   r   rL   rM   rN   rO   r"  r#  r$  r%  r&  r'  r(  ra   can_implements                   r   r8  z'PersistentDenseGemmKernel.can_implement  s    j (889gVV 	"!M(MM\+;
 
 	" "M(BBq!Q'7GWj
 
 	" "M(EE]Aq,
 
 	" "M  	11jF6J6J!Mr   )rI   rJ   )rI   )-r   r   r   __doc__r   r   Numericboolr   rw  rv   r~   r   r6   jitTensor	Constexprr   r   TiledMmaCopyAtomr   LayoutComposedLayoutr   Tilerm   r  r
   rA  	TiledCopyrZ  r\  r]  staticmethodr   r   r   r   strr  r!  r2  r5  r8  rV   r   r   rH   rH      s-       3 3x X8 X8(X8 X8 CHo	X8
  S/X8 X8 X8 X8 X8t  f
 f
 f
P 
X *5 $('+u u;u ;u ;	u
 %.u &u ku ku u u u Xup 
[S=S MS 	S
 MS S T]+S S "[S #1S #1S $DK1Dd$JKS )S !>S &S  k!S" k#S$ %S S S [Sj=2m=2 k=2 	=2
 )=2 w45=2 
t~t{DK7	8=2 =2 =2 =2~#.#. #. m	#.
 K#. 
t~t{DK7	8#. #. #. #.J>-m>- DM4>12>- 	>-
 )>- K>- 
t}dk4;6	7>- >- >- >-@ `8=`8S#s]+`8 go&`8 go&	`8
 )`8 go&`8 "`8 `8 `8 `8 
sC}	`8 `8 `8 \`8D "';"'!#sC-0"'  S/"' %.	"'
 
u2E#sC-4HH	I"' "' "' \"'H #=#c3'# # 
	# # # \#0 
 !	T Tw'T(T go&T 	T
 
T T T \Tl ((CHo(  S/( 
	( ( ( \(T  !6 666 6 	6
 w'6 go&6 6 6 6 6 
6 6 6 \6p """ " 	"
 CHo" 
" " " \"H   !J Jw'J(J go&J 	J
 CHoJ  S/J J J J J J J J J J  
!J J J \J J Jr   rH   r  )-typingr   r   r   r   rq   torch.distributedr   r7  r   cutlass.cuter6   cutlass.utilsrm   cutlass.pipeliner  cutlass.utils.blackwell_helpersblackwell_helpersr   cutlass.utils.distributedcutlass.cute.nvgpur   r   cutlass.cute.typingr	   r
   r   r   r   r   r   r   cutlass._mlir.dialectsr   cutlass.cutlass_dslr   cutlass._mlir.dialects.nvvmr   r   r   r<  rF  r2   rF   rH   rV   r   r   <module>rT     s   / / / / / / / / / / / /                           # # # # # # 5 5 5 5 5 5 5 5 5 / / / / / / / / / / / / / / / / /                 H H Ht H H H H ( ' ' ' ' ' ! ! ! ! ! !           	' ' '' ' 	'
 ' 
' ' ' 
'V DH ")	   &M`A A A A A A A A A Ar   