
    `iG                        d dl mZmZmZmZ d dlmc mZ d dl	Z	d dl
mZ d dlmZmZ d dlmZ d dlmZ d dlmc mZ d dlmc mZ d dlmZ d dlmc mZ d dlZdZ	  G d d          Z G d d	          ZdS )
    )OptionalTypeTupleUnionN)cpasynctcgen05)from_dlpackg+eG?c            '          e Zd ZdZdeej                 dedee	e	f         dee	e	f         fdZ
d Zej        d fd	ej        d
ej        dej        dej        dej        dej        dej        dej        fd            Zej        dej        dej        dej        dej        dej        deej                 deej                 dej        dej        dej        dej        dej        deej        ej        df         d eej        ej        df         d!ej        d"ej        d#ej        dej        dej        f&d$            Zd%ej        d&ej        d'ej        d(ej        d!ej        d"ej        deej        ef         d)eej         ej        ej        ej        f         fd*Z!d+ej         d,ej        d-ej        d.ej        d%ej        d/ej        d0ej        d)eej         ej        ej        ej        ej        ej        f         fd1Z"d%ej        d2eej        ej         f         d3eej        ej         f         d'ej        d(ej        d!ej        d"ej        d/ej        d0ej        d)eej        ej        ej        ej        ej        ej        f         fd4Z#e$dej        d5ee	e	e	f         d6eej                 d7eej                 d!ej        d"ej        d8eej                 d9ej%        d:eej                 d;ej%        d<e	d=e	d)ee	e	e	f         fd>            Z&e$dej        d?ee	e	e	f         dee	e	f         dej        d)eej        ee	e	e	f         f         f
d@            Z'e$dej        dAee	e	e	f         dBe	d)e	fdC            Z(dS )DPersistentDenseGemmKernela  This class implements batched matrix multiplication (C = A x B) with support for various data types
    and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.

    :param acc_dtype: Data type for accumulation during computation
    :type acc_dtype: type[cutlass.Numeric]
    :param use_2cta_instrs: Whether to use CTA group 2 for advanced thread cooperation
    :type use_2cta_instrs: bool
    :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N)
    :type mma_tiler_mn: Tuple[int, int]
    :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing
    :type cluster_shape_mn: Tuple[int, int]
    :note: This kernel always uses Tensor Memory Access (TMA) for storing results.

    :note: In current version, A and B tensor must have the same data type
        - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported

    :note: Supported A/B data types:
        - TFloat32
        - Float16/BFloat16
        - Int8/Uint8
        - Float8E4M3FN/Float8E5M2

    :note: Supported accumulator data types:
        - Float32 (for all floating point A/B data types)
        - Float16 (only for fp16 and fp8 A/B data types)
        - Int32 (only for uint8/int8 A/B data types)

    :note: Supported C data types:
        - Float32 (for float32 and int32 accumulator data types)
        - Int32 (for float32 and int32 accumulator data types)
        - Float16/BFloat16 (for fp16 and fp8 accumulator data types)
        - Int8/Uint8 (for uint8/int8 accumulator data types)
        - Float8E4M3FN/Float8E5M2 (for float32 accumulator data types)

    :note: Constraints:
        - MMA tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True)
        - MMA tiler N must be 32-256, step 32
        - Cluster shape M must be multiple of 2 if use_2cta_instrs=True
        - Cluster shape M/N must be positive and power of 2, total cluster size <= 16

    Example:
        >>> gemm = PersistentDenseGemmKernel(
        ...     acc_dtype=cutlass.Float32,
        ...     use_2cta_instrs=True,
        ...     mma_tiler_mn=(128, 128),
        ...     cluster_shape_mn=(2, 2)
        ... )
        >>> gemm(a_tensor, b_tensor, c_tensor, max_active_clusters, stream)
    	acc_dtypeuse_2cta_instrsmma_tiler_mncluster_shape_mnc                    || _         |d         dk    | _        || _        g |dR | _        |rt          j        j        nt          j        j        | _        d| _	        d| _
        d| _        d| _        dt          | j        | j        g| j
        R           z  | _        d| _        d| _        d| _        t%          j        d	          | _        d
S )aY  Initializes the configuration for a Blackwell dense GEMM kernel.

        This configuration includes several key aspects:

        1.  MMA Instruction Settings (tcgen05):
            - acc_dtype: Data types for MMA accumulator.
            - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler.
            - use_2cta_instrs: Boolean indicating if the tcgen05 MMA variant
              with cta_group=2 should be used.

        2.  Cluster Shape:
            - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster.

        3. Output C tensor store mode:
            - TMA store is always enabled for output tensors.

        :param acc_dtype: Data type of the accumulator.
        :type acc_dtype: type[cutlass.Numeric]
        :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction.
        :type mma_tiler_mn: Tuple[int, int]
        :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant.
        :type use_2cta_instrs: bool
        :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster.
        :type cluster_shape_mn: Tuple[int, int]
        r         )r   r                   r   sm_100N)r   r   r   	mma_tilerr   CtaGroupTWOONE	cta_group	occupancyepilog_warp_idmma_warp_idtma_warp_idlenthreads_per_ctacta_sync_bar_idepilog_sync_bar_idtmem_ptr_sync_bar_idutilsget_smem_capacity_in_bytessmem_capacityselfr   r   r   r   s        /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cudnn/gemm_swiglu/dense_gemm_persistent_swiglu.py__init__z"PersistentDenseGemmKernel.__init__   s    B 1:+A#5 0+<+++1@Z)--gFVFZ
 !C)94;K(bdNa(b(b$c$cc "#$%!"=hGG    c                 H   t          j        | j        | j        | j        | j        | j        | j        dd                   }t          j	        |j
        dg          }d}| j        d         | j        d         ||z  f| _        | j        d         | j        d         dz  | j        d         f| _        | j        d         t          j	        |j        j                  z  | j        d         | j        d         f| _        | j        d         t          j	        |j        j                  z  | j        d         | j        d         f| _        t          j        t          j        g | j        dR           |j        j        f          | _        t          j	        | j        j        d                   | _        t          j	        | j        j        d                   | _        | j        dk    | _        | j        dk    | _        t          j        | j        | j        | j        | j                  | _        t          j        | j        | j        | j        | j                  | _        |                      || j        | j        | j!        | j        | j        | j        | j        | j        | j        | j"        | j#                  \  | _$        | _%        | _&        | _'        t          j(        || j        | j        | j%                  | _)        t          j*        || j        | j!        | j%                  | _+        t          j,        | j        | j        | j        | j&                  | _-        t          j,        | j        | j        | j        | j'                  | _.        | /                    || j        | j$                  | _0        dS )a2  Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B
        - Computing epilogue subtile
        - Setting up A/B/C stage counts in shared memory
        - Computing A/B/C shared memory layout
        - Computing tensor memory allocation columns
        Nr   moder   r   r   )1sm100_utilsmake_trivial_tiled_mmaa_dtypea_major_modeb_major_moder   r   r   cutesize	shape_mnkmma_tiler_cthr_idshapecta_tile_shape_mnkcta_tile_shape_mnk_ctiled_dividemake_layoutr   cluster_layout_vmnknum_mcast_ctas_anum_mcast_ctas_b
is_a_mcast
is_b_mcastcompute_epilogue_tile_shaper   ab12_layout
ab12_dtypeepi_tilec_layoutc_dtype
epi_tile_c_compute_stagesb_dtyper)   r   num_acc_stagenum_ab_stagenum_ab12_stagenum_c_stagemake_smem_layout_aa_smem_layout_stagedmake_smem_layout_bb_smem_layout_stagedmake_smem_layout_epiab12_smem_layout_stagedc_smem_layout_staged_compute_num_tmem_alloc_colsnum_tmem_alloc_cols)r+   	tiled_mmamma_inst_shape_kmma_inst_tile_ks       r,   _setup_attributesz+PersistentDenseGemmKernel._setup_attributes   sm     6LNNN2A2
 
	  9Y%8sCCCN1N1.
 N1N1"N1
 N19+;+A!B!BBN1N1#
 Q49Y-=-C#D#DDQQ%
! $(#48t48a8899#%$
 $
  !%	$*B*H*K L L $	$*B*H*K L L/!3/!3 $?# O	
 
 &A% ML	
 
 X\WkWkNLLMOOLMNX
 X
TD-t/BDDT  %0$BNL	%
 %
! %0$BNL	%
 %
! (3'GOM	(
 (
$ %0$DLMO	%
 %
! $(#D#DYPTP^`d`r#s#s   r.   c                 :    | dt          j        |  d          z   z  S Nr   Tmathexpxs    r,   <lambda>z"PersistentDenseGemmKernel.<lambda>L      1DHaR<N<N8N3O r.   abab12calphamax_active_clustersstreamepilogue_opc	                 8
     |j          _        |j          _        |j          _        |j          _        t
          j                            |                                           _	        t
          j                            |                                           _
        t
          j                            |           _        |j          _        t
          j                            |           _        t          j         j         j        k              rt          d j         d j                                                     t#          j         j         j	         j
         j         j         j        dd                   }	t-          j        |	j        j                  }
t#          j         j        |	j                  }t-          j         j        d          }t,          j                            ||| j        |	 j         j        |j         t          j!        u rt          j"        nd          \  }}t#          j#         j        |	j                  }t-          j         j$        d          }t,          j        %                    ||| j        |	 j         j        |j         t          j!        u rt          j"        nd          \  }}t-          j&         j        |          }t-          j&         j        |          }||z   |
z   _'        t-          j(        t-          j)        |j                   j*                  }t-          j(        t-          j)        |j                   j+                  }t-          j         j,        d          }t-          j         j-        d          }t]          j/        t]          j0                    |||          \  }}t]          j/        t]          j0                    |||          \  }} 1                    | j2         j        |          \   _3        }d _4        t-          j5         j,        j6                  t-          j5         j-        j6                   t,          j7         G   fd	d
                      }| _8         9                    |	|||||||| j          j         j$         j,         j-         j*         j+         j3        ||          :                    | j;        ddgg  j        dR  j8        &                                |           dS )a  Execute the GEMM operation in steps:
        - Setup static attributes before smem/grid/tma computation
        - Setup TMA load/store atoms and tensors
        - Compute grid size with regard to hardware constraints
        - Define shared storage for kernel
        - Launch the kernel asynchronously

        :param a: Input tensor A
        :type a: cute.Tensor
        :param b: Input tensor B
        :type b: cute.Tensor
        :param ab12: Output tensor AB12 (full GEMM result)
        :type ab12: cute.Tensor
        :param c: Output tensor C (SwiGLU result)
        :type c: cute.Tensor
        :param max_active_clusters: Maximum number of active clusters
        :type max_active_clusters: cutlass.Constexpr
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        :param epilogue_op: Optional elementwise lambda function to apply to the output tensor
        :type epilogue_op: cutlass.Constexpr
        :raises TypeError: If input data types are incompatible with the MMA instruction.
        :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled.
        zType must match: z != Nr   )NNNr   )internal_typeNNr      c                   :   e Zd ZU ej        j        ej        j        f         e	d<   ej        j        ej        j        f         e	d<   ej        j        ej        j
        f         e	d<   ej        j        ej        j
        f         e	d<   ej        e	d<   ej        e	d<   ej        j        ej        j        j         f         j        f         e	d<   ej        j        ej        j        j        f         j        f         e	d<   ej        j        ej        j        j         ej        j        j                  f         j        f         e	d	<   ej        j        ej        j        j         ej        j        j                  f         j        f         e	d
<   dS )9PersistentDenseGemmKernel.__call__.<locals>.SharedStorageab_full_mbar_ptrab_empty_mbar_ptracc_full_mbar_ptracc_empty_mbar_ptrtmem_dealloc_mbar_ptrtmem_holding_bufsAB12sCsAsBN)__name__
__module____qualname__r7   structMemRangecutlassInt64rP   __annotations__rO   Int32AlignrH   buffer_align_bytesrK   r4   cosizerT   outerrN   rV   )ab12_smem_sizec_smem_sizer+   s   r,   SharedStoragerv     s        "k27=$BS3STTTT#{3GM4CT4TUUUU#{3GM4CU4UVVVV $ 4W]DDV5V WWWW#*=000%m+++;$$O"$ '	)    !$L! '	)    !$T\;4;t?X?^3_3_%_`')   
 !$T\;4;t?X?^3_3_%_`')     r.   r   r   )gridblockclustersmemro   )<element_typer4   rN   rH   rK   r'   
LayoutEnumfrom_tensormma_major_moder5   r6   rG   rJ   r   
const_expr	TypeErrorr_   r2   r3   r   r   r   r7   r8   r;   r<   cluster_shape_to_tma_atom_Ar   slice_rT   nvgpumake_tiled_tma_atom_ArA   Float32TFloat32cluster_shape_to_tma_atom_BrV   make_tiled_tma_atom_Bsize_in_bytesnum_tma_load_bytescompositionmake_identity_layoutrI   rL   rX   rY   r   make_tiled_tma_atomCopyBulkTensorTileS2GOp_compute_gridr=   tile_sched_paramsr   r   r   r   shared_storagekernellaunchr#   )!r+   ri   rj   rk   rl   rm   rn   ro   rp   r\   atom_thr_sizea_opa_smem_layout
tma_atom_atma_tensor_ab_opb_smem_layout
tma_atom_btma_tensor_ba_copy_sizeb_copy_sizeab12_cta_v_layoutc_cta_v_layoutepi_smem_layoutepi_smem_layout_ctma_atom_ab12tma_tensor_ab12
tma_atom_ctma_tensor_cr   r   r   r   s!   `                              @@r,   __call__z"PersistentDenseGemmKernel.__call__B  s   J /0n./n151B./n!,88;;JJLL!,88;;JJLL +77==./n(44Q77 dldl:;; 	RPPP$,PPQQQ 	   6LNNN2A2
 
	 	)"2"899 6t7LiN^__D$=?TUU#':#C#CN$*/0~/P/P7++VZ $D $
 $
 
L 6t7LiN^__D$=?TUU#':#C#CN$*/0~/P/P7++VZ $D $
 $
 
L (}EE(}EE#.#<"M !,T-Ftz-R-RTXTabb)$*CAG*L*Ldo^^+d&BOTT K(A?SS)0)D+--	*
 *
& $+#>+--	$
 $
 
L (,'9'9$@WY]Yn  qD  (E  (E$"&T%A%GHHk$";"ABB 
#	 #	 #	 #	 #	 #	 #	 #	 #	 #	 #	 
#	J , 	$%%(%MO"'	
 	
( &'A./d+/Q//$2244  
 
 
 	r.   r\   r   mA_mklr   mB_nklr   r   	mAB12_mnlmC_mnlrA   rT   rV   rX   NrY   rI   rL   r   c                 &/   t           j                                        }t           j                            |          }|| j        k    rPt          j        |           t          j        |           t          j        |           t          j        |           t          j        |j        j	                  dk    }t           j        
                                \  }}}|t          j        |j        j	                  z  }|dk    }t           j                            t           j                                                  }|
                    |          }t           j                                        \  }}}t          j                    }|                    | j                  } | j        }!| j        }"t)          j        t(          j        j                  }#| j        | j        z   dz
  }$t)          j        t(          j        j        |$          }%t(          j                            | j                                        | j        |#|%| j        |
          }&t)          j        t(          j        j                  }'tA          | j!                  |rdndz  }(t)          j        t(          j        j        |(          })t(          j"                            | j#                                        | j$        |'|)|
          }*|rc|| j        k    rXd}+t           j        %                                5  t           j        &                    |!|+           ddd           n# 1 swxY w Y   t           j        '                                 t          j        | j(                  dk    rt           j        )                                 | j*        +                    |j,        |j-                  },| j.        +                    |j,        |j-                  }-| j/        +                    |j,        |j-                  }.| j0        +                    |j,        |j-                  }/d}0d}1tc          j2        | j3        p| j4        p|          r.t          j5        |
|d	          }0t          j5        |
|d	          }1t          j6        |t          j7        | j8        d
          d          }2t          j6        |t          j7        | j8        d          d          }3t          j6        |t          j7        | j8        d          d          }4t          j6        |	t          j7        | j9        d          d          }5t          j        |2dg          }6|:                    |          }7|7;                    |2          }8|7<                    |3          }9|7=                    |4          }:|7=                    |5          };t           j                                        \  }}}t          j>        t          j7        |
d          j	                  }<t          j?        ||d         |<t          j@        |.dd          t          j@        |8dd                    \  }=}>t          j>        t          j7        |
d          j	                  }?t          j?        ||d         |?t          j@        |/dd          t          j@        |9dd                    \  }@}A|A                    |.          }B|B                    |/          }C|C                    | j8        dd                   }D|D                    t          jE        |D| j$                            }Et          j        | j(                  dk    rt           j        F                                 n+t           j        G                    | jH        | jI                   || j        k    rt          jJ                            |t           j        
                                t           j        K                                          }F|FL                                }Gt)          jM        t(          jN        jO        | j                  }H|GjP        r|GjQ        }I|Id         t          j        |j        j	                  z  |Id         |Id         f}J|>d|Jd         d|Jd         f         }K|Ad|Jd         d|Jd         f         }L|HR                                 tc          jS        d          }M|HjT        |6k     r|&U                    |H          }Mtc          jV        d|6dd          D ]}N|&W                    |H|M           t          jX        ||Kd|HjT        f         |=d|HjY        f         |&Z                    |H          |0           t          jX        ||Ld|HjT        f         |@d|HjY        f         |&Z                    |H          |1           |H[                                 tc          jS        d          }M|HjT        |6k     r|&U                    |H          }M|F\                                 |F]                                }G|GjP        |&^                    |H           || j_        k    rdtA          | j_        g| j!        R           z  }Ot           j        G                    | j`        |O           t           j        a                    | jb        d|"          }Pt          jc        |P|Ejd                  }Qt          jJ                            |t           j        
                                t           j        K                                          }F|FL                                }Gt)          jM        t(          jN        je        | j                  }Rt)          jM        t(          jN        jO        | j$                  }S|GjP        rE|GjQ        }I|Id         t          j        |j        j	                  z  |Id         |Id         f}J|Qddd|SjY        f         }T|RR                                 tc          jS        d          }U|RjT        |6k     r|r|&f                    |R          }U|r|*W                    |S           |g                    t          ji        jj        d           tc          jV        d|6dd          D ]}N|r|&k                    |R|U           t          j        |Bdg          }Vtc          jV        |Vd          D ]V}Wdd|W|RjY        f}Xt          jl        ||T|B|X         |C|X         |T           |g                    t          ji        jj        d           W|&m                    |R           |R[                                 tc          jS        d          }U|RjT        |6k     r|r|&f                    |R          }U|r|*n                    |S           |S[                                 |F\                                 |F]                                }G|GjP        E|*^                    |S           || j_        k     	rh|| j!        d         k    r't           j        o                    | jp        |"|           dtA          | j_        g| j!        R           z  }Ot           j        G                    | j`        |O           t           j        a                    | jb        d|"          }Pt          jc        |P|Ejd                  }Q|}Y| q                    |Y|Q|:|;|||          \  }Z}[}\}]d}^d}_d}`d}ad}bd}cd}dd}ed}fd}gd}ht          jr        |\j	        | js                  }^t          jr        |\j	        | js                  }it          jr        |\j	        | jt                  }_| u                    |Z|^|i|_|Y|,|-          \  }`}a}j}b}c}d| v                    |Y|||:|;|||,|-	  	        \  }}}e}f}g}ht          jJ                            |t           j        
                                t           j        K                                          }F|FL                                }Gt)          jM        t(          jN        je        | j$                  }kt)          j        t(          j        j        dtA          | j!                  z            }lt(          jw                            | jx        |l          }m|GjP        r|GjQ        }I|Id         t          j        |j        j	                  z  |Id         |Id         f}J|gdddg|JR          }n|hdddg|JR          }o|[ddddd|kjY        f         }p|*k                    |k           t          j@        |pdt          jy        |p                    }pt          j@        |ndt          jy        |n                    }nt          j@        |odt          jy        |o                    }ot          j        |pj	        dg          }q|Fjz        |qz  }rtc          jV        d|qd          D ]
}s|pddd|sf         }t|pddd|sdz   f         }ut          jX        |Z|u|]           t          jX        |Z|t|\           |`{                    |\          |                                }v|`{                    |]          |                                }w|v|z  }v|w|z  }wdt           j}        ~                    d|wz  t          z  d          z                       | jb                  }xt          jr        |xj	        tb          j                  }y|y                    |x           tc          j        t          j        |yj	                            D ]*}zt           j                            |y|z                   |y|z<   +|y|                                }{|{|wz  }{|v|{z                      | jt                  }||v                    | js                  }v|w                    | js                  }w|a                    |v           |j                    |w           |b                    ||           |r|sz   | jx        z  }}|r|sz   dz   | jx        z  }~|r|sdz  z   | j        z  }t          jX        |`|a|cddd|}f                    t          jX        |`|j|cddd|~f                    t          jX        |`|b|dddd|f                    t           j                            t           j        j        j        t           j        j        j                   dtA          | j!                  z  }t           j        G                    | j        |           || j!        d         k    rt          jX        ||ed|}f         |nd|sf                    t          jX        ||ed|~f         |nd|sdz   f                    t          jX        ||fd|f         |od|sdz  f                    |mn                                 |mW                                 t           j        G                    | j        |           t           j        %                                5  |*m                    |k           ddd           n# 1 swxY w Y   |k[                                 |F\                                 |F]                                }G|GjP        || j!        d         k    r t           j                            |           dtA          | j!                  z  }t           j        G                    | j        |           || j!        d         k    rl|rCt           j                            |!|dz             t           j                            |!d           t           j                            |P| jp        |           |m^                                 dS dS )zW
        GPU device kernel performing the Persistent batched GEMM computation.
        r   r   r   )barrier_storage
num_stagesproducer_groupconsumer_grouptx_countcta_layout_vmnk)r   r   r   r   r   r   N)swizzle)
mcast_mode)Nr   NNNN)r   NNrs   r   r0   )r   r   Nr   )r   Nr   r   )
barrier_idnumber_of_threads)unroll)tma_bar_ptr
mcast_mask   )	alignmentptr_to_buffer_holding_addrFT)unroll_full)
is_two_cta)r   r   )space)r7   archwarp_idxmake_warp_uniformr!   r   prefetch_descriptorr8   r;   r<   	block_idxblock_idx_in_clusterget_flat_coord
thread_idxr'   SmemAllocatorallocater   r{   r|   pipelineCooperativeGroupAgentThreadrB   rC   PipelineTmaUmmacreaterw   data_ptrrP   r   r"   r   PipelineUmmaAsyncry   rO   	elect_onembarrier_initmbarrier_init_fencer   cluster_arrive_relaxedr}   
get_tensorr   innerr~   r   r   r   r   rD   rE   create_tma_multicast_mask
local_tiler   r   r:   	get_slicepartition_Apartition_Bpartition_Cr@   tma_partitiongroup_modesmake_fragment_Amake_fragment_Bpartition_shape_Cmake_fragment_Cappendcluster_waitbarrierr$   r#   StaticPersistentTileSchedulergrid_diminitial_work_tile_infomake_pipeline_statePipelineUserTypeProduceris_valid_tiletile_idxreset_countBooleancountproducer_try_acquirerangeproducer_acquirecopyindexproducer_get_barrieradvanceadvance_to_next_workget_current_workproducer_tailr    r&   retrieve_tmem_ptrr   make_tensorlayoutConsumerconsumer_try_waitsetr   Field
ACCUMULATEconsumer_waitgemmconsumer_releaseproducer_commit
alloc_tmemr[   epilog_tmem_copy_and_partitionmake_rmem_tensorrH   rK   epilog_smem_copy_and_partitionepilog_gmem_copy_and_partitionPipelineTmaStorerQ   ranknum_tiles_executedretileloadrc   exp2LOG2_Etor   storerange_constexpr
rcp_approxrR   fence_proxy	ProxyKindasync_sharedSharedSpace
shared_ctar%   relinquish_tmem_alloc_permitmbarrier_arrivembarrier_waitdealloc_tmem)r+   r\   r   r   r   r   r   r   r   r   rA   rT   rV   rX   rY   rI   rL   r   rp   rm   r   r   bidxbidybidzmma_tile_coord_vis_leader_ctacta_rank_in_clusterblock_in_cluster_coord_vmnktidx_r   storager{   r|   ab_pipeline_producer_groupnum_tma_producerab_pipeline_consumer_groupab_pipelineacc_pipeline_producer_groupnum_acc_consumer_threadsacc_pipeline_consumer_groupacc_pipelinenum_tmem_dealloc_threadsr}   r~   r   r   a_full_mcast_maskb_full_mcast_maskgA_mklgB_nkl	gAB12_mnlgC_mnlk_block_cntthr_mmatCgAtCgBtCgAB12tCgCa_cta_layouttAsAtAgAb_cta_layouttBsBtBgBtCrAtCrB	acc_shapetCtAcc_fake
tile_sched	work_tileab_producer_statecur_tile_coordmma_tile_coord_mnl
tAgA_slice
tBgB_slicepeek_ab_empty_statusk_blocktmem_ptr_read_threadstmem_ptrtCtAcc_baseab_consumer_stateacc_producer_statetCtAccpeek_ab_full_statusnum_kphases
kphase_idxkphase_coordepi_tidxtiled_copy_t2rtTR_tAcc_basetTR_rAcc	tTR_rAcc1	tTR_rAB12tTR_rCtiled_copy_r2s	tRS_rAB12tRS_rC	tRS_sAB12tRS_sC	bSG_sAB12bSG_sCbSG_gAB12_partitionedbSG_gC_partitionedtTR_rAB12_1tRS_rAB12_1acc_consumer_statec_producer_group
c_pipeline	bSG_gAB12bSG_gCtTR_tAccsubtile_cntnum_prev_subtilessubtile_idxtTR_tAcc_mntTR_tAcc_mn1acc_vec0acc_vec1gate_rcpresigate	acc_vec_cab12_buffer0ab12_buffer1c_bufferepilog_threadss                                                                                                                                    r,   r   z PersistentDenseGemmKernel.kernel  sZ   4 9%%''9..x88
 t''''
333'
333'666'
333)I$4$:;;q@  9..00dD$)I,<,B"C"CC(A-"i99$):X:X:Z:Z[[&9&H&HI\&]&]#Y))++
a
 "$$-- 344 ' ="3 &.%>x~?T%U%U"043HH1L%-%>x~?TVf%g%g".55#4==??(55,/ 6 
 
 '/&?@U&V&V##&t':#;#;O?ZqqYZ#[ &.&?@UWo&p&p#188#5>>@@)66/ 9 
 
  	]4++++-(Y((** ] ]I++,AC[\\\] ] ] ] ] ] ] ] ] ] ] ] ] ] ]	%%''' 9T*++a//I,,... (()@)FPgPm(nnZ""#7#=G[Ga"bbZ""#7#=G[Ga"bbZ""#7#=G[Ga"bb
 ! doSSOTT 	B ' ABUWr  @A  !B  !B  !B ' ABUWr  @A  !B  !B  !B T^_)U)UWijjK88
 
 OIt{4>?/[/[]opp	K(/::
 

 iaS111
 %%&677""6**""6**%%i00""6**Y))++
a '4G(Y(Y(_`` *'*RA&&T1a((
 

d '4G(Y(Y(_`` *'*RA&&T1a((
 

d ((,,((,,//rr0BCC	//ItGY0Z0Z[[
 9T*++a//I""$$$$I)=QUQefff t''' <CCDUW[W`WjWjWlWlnrnw  oA  oA  oC  oC  D  DJ"99;;I ( <X=V=_aear s s) 7:!*!3"1%93C3I)J)JJ"1%"1%&" "4);A)>FXYZF["\]
!4);A)>FXYZF["\]
 "--///'.q'9'9$$*[88+6+K+KL]+^+^(  '}QQqIII c cG001BDXYYY I""D*;*A#BCd$5$;<=$/$D$DEV$W$W#4    I""D*;*A#BCd$5$;<=$/$D$DEV$W$W#4    &--///+2?1+=+=((.<</:/O/OPa/b/b,
 //111&7799	o ) 7:x %%&7888
 t''' %'d.>-UAT-U-U)V)V$V!I4"7     y22+; 3  H *8[5GHHK
 <CCDUW[W`WjWjWlWlnrnw  oA  oA  oC  oC  D  DJ"99;;I ( <X=V=_aear s s!)!=h>W>`bfbt!u!u) O:!*!3"1%93C3I)J)JJ"1%"1%&" %dD$8J8P%QR "--///&-oa&8&8#$*[88]8*5*G*GHY*Z*Z'
 ! F 112DEEE
 gm6>>>
  '}QQqIII !c !cG$ H#112CEXYYY '+iA3&?&?&?*1-QU*V*V*V J JJ $ $ * 1 7	,L !I ) & $\ 2 $\ 2 &   &MM'-*BDIIII $445FGGG &--///*1/!*<*<'(.<<( c2=2O2OPa2b2b/
 ! E 001CDDD"**,,,
 //111&7799	_ ) O:h &&'9::: d&&& 4.q111	$$,$. %    %'d.>-UAT-U-U)V)V$V!I4"7     y22+; 3  H *8[5GHHK
 H 33  IF!NIFIFIF$(!!%-hndoNNI/PPK*8>4<HHFPTPsPs	;%QSQ QMNI{FIv 33
 
%"  <CCDUW[W`WjWjWlWlnrnw  oA  oA  oC  oC  D  DJ"99;;I!)!=h>W>`bfbt!u!u  (8%S,---    "299./ :  J
 ) V:!*!3"1%93C3I)J)JJ"1%"1%&" 2 ,	 	 , ,	  )$dD$HZH`)ab **+=>>>+Ha89L9LMM ,Y49Y;O;OPP	)&!TYv5F5FGG #iaSAAA$.$AK$O!#*=K#C#C \ \K #+D$k+J"KK#+T4{Q,O#PLInlIFFFInk8DDD  .44X>>CCEEH-44Y??DDFFH'%/H'%/H !"DINN2=63I4$P$P PTTUYUcddH/PPCIIh'''$4TYsy5I5IJJ > >!%!5!5c!f!=!=A88::D(?D!)D 4 4T\ B BI (}}T_==H (}}T_==HOOH---%%h///LL+++ %6$CtGZ#ZL$5$Ca$G4K^#^L 1K14D DHXXHI&!!4t\"BC   I&#!4t\"BC   I&dD(;<   I))	+8"i3> *    &(#d.A*B*B%BNI%%#'#:*8 &     4#6q#999	)%t\&:;%t[&9:  
 	)%t\&:;%t[1_&=>   	&"D(#34"D+*:#;<   #22444"33555I%%#'#:*8 &     Y((** F F 112DEEEF F F F F F F F F F F F F F F"**,,,
 //111&7799	m ) V:v 4.q111	66/6RRR#d&9":"::NI)@Tbccc4.q111" FI--.CEX[\E\]]]I++,A1EEE	&&x1IVe&fff $$&&&&&a '&s*   !MMMXAYYAYYAYrD  tAccrT  rU  returnc                    t          j        | j        | j        | j        | j        ||          }t          j        |d         |          }	t          j	        ||	d                   }
|

                    |          }|                    |	          }t          j        |d         |          }|                    |          }t          j        |d         j        | j                  }t          j        |d         j        | j                  }|
|||fS )az  
        Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination).

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param tAcc: The accumulator tensor to be copied and partitioned
        :type tAcc: cute.Tensor
        :param gAB12_mnl: The global tensor AB12
        :type gAB12_mnl: cute.Tensor
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param epi_tile_c: The epilogue tiler for C
        :type epi_tile_c: cute.Tile
        :param use_2cta_instrs: Whether use_2cta_instrs is enabled
        :type use_2cta_instrs: bool

        :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where:
            - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
            - tTR_tAcc: The partitioned accumulator tensor
            - tTR_rAcc: The accumulated tensor in register used to hold t2r results
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        )NNr   r   N)NNr   r   r   r  r   r   NNN)NNNr   r   r   r   r   )r2   get_tmem_load_opr=   rG   rH   r   r7   flat_divider   make_tmem_copyr   partition_Spartition_Dr&  r<   )r+   rD  r  rT  rU  rI   rL   r   copy_atom_t2rtAcc_epirz  thr_copy_t2rr  gAB12_mnl_epi	tTR_gAB12r|  r}  s                    r,   r%  z8PersistentDenseGemmKernel.epilog_tmem_copy_and_partition  s   F $4#ON
 
 #+,
 

 !/xH]?^__%//55++H55 (3Y)Z\dee ,,];;	(3T)U)[]a]kll))4U*V*\^b^lmm	x9<<r.   rz  r~  r  r  r}   r~   c                    t          j        | j        | j        | j        |          }t          j        ||          }	|	                    |          }
|
                    |          }|
                    |          }|		                    |          }|		                    |          }|		                    |          }|	|||||fS )a  
        Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination).

        :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
        :type tiled_copy_t2r: cute.TiledCopy
        :param tTR_rAB12: The partitioned accumulator tensor for AB12
        :type tTR_rAB12: cute.Tensor
        :param tTR_rAB12_1: The partitioned accumulator tensor for AB12 (second tile)
        :type tTR_rAB12_1: cute.Tensor
        :param tTR_rC: The partitioned accumulator tensor for C
        :type tTR_rC: cute.Tensor
        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param sAB12: The shared memory tensor for AB12
        :type sAB12: cute.Tensor
        :param sC: The shared memory tensor for C
        :type sC: cute.Tensor

        :return: A tuple containing (tiled_copy_r2s, tRS_rAB12, tRS_rAB12_1, tRS_rC, tRS_sAB12, tRS_sC) where:
            - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s)
            - tRS_rAB12: The partitioned tensor AB12 (register source)
            - tRS_sAB12: The partitioned tensor AB12 (smem destination)
            - tRS_rC: The partitioned tensor C (register source)
            - tRS_sC: The partitioned tensor C (smem destination)
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor, cute.Tensor, cute.Tensor, cute.Tensor]
        )
r2   get_smem_store_oprG   rH   r   r7   make_tiled_copy_Dr   r  r,  )r+   rz  r~  r  r  rD  r}   r~   copy_atom_r2sr  thr_copy_r2sr  r  r  r  r  s                   r,   r'  z8PersistentDenseGemmKernel.epilog_smem_copy_and_partition  s    H $5d6FY]Ygiwxx/~NN%//55 ,,U33	))"--")))44	$++K88&&v..y+vy&PPr.   atom1atom2c
                    t          j        |d         |          }
t          j        |d         |          }|}|}t          j        |dd          }t          j        |	dd          }t          j        |
dd          }t          j        |dd          }t          j        |dt          j        d          ||          \  }}t          j        |dt          j        d          ||          \  }}||||||fS )a%  Make tiledCopy for global memory store, then use it to:
        - partition register array (source) and global memory (destination) for none TMA store version;
        - partition shared memory (source) and global memory (destination) for TMA store version.

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param atom1: The copy_atom for AB12 TMA store
        :type atom1: cute.CopyAtom or cute.TiledCopy
        :param atom2: The copy_atom for C TMA store
        :type atom2: cute.CopyAtom or cute.TiledCopy
        :param gAB12_mnl: The global tensor AB12
        :type gAB12_mnl: cute.Tensor
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler for AB12
        :type epi_tile: cute.Tile
        :param epi_tile_c: The epilogue tiler for C
        :type epi_tile_c: cute.Tile
        :param sAB12: The shared memory tensor for AB12
        :type sAB12: cute.Tensor
        :param sC: The shared memory tensor for C
        :type sC: cute.Tensor

        :return: A tuple containing:
            - tma_atom_ab12: The TMA copy atom for AB12
            - tma_atom_c: The TMA copy atom for C
            - bSG_sAB12: The partitioned shared memory tensor AB12
            - bSG_sC: The partitioned shared memory tensor C
            - bSG_gAB12: The partitioned global tensor AB12
            - bSG_gC: The partitioned global tensor C
        :rtype: Tuple[cute.CopyAtom, cute.CopyAtom, cute.Tensor, cute.Tensor, cute.Tensor, cute.Tensor]
        r  r   r   r   )r7   r  r   r   r   r@   )r+   rD  r  r  rT  rU  rI   rL   r}   r~   	gAB12_epigC_epir   r   sAB12_for_tma_partitionsC_for_tma_partitiongAB12_for_tma_partitiongC_for_tma_partitionr  r  r  r  s                         r,   r(  z8PersistentDenseGemmKernel.epilog_gmem_copy_and_partition  s   Z $Y/U%VX`aa	!&)O"PR\]]
"&"25!Q"?"?#/Aq99"&"29a"C"C#/1==  '4Q## 
  
	9 !.Q  
 
 j)VYNNr.   mma_tiler_mnkr4   rN   rH   rG   rK   rJ   r)   r   c                    d}d}d}t          j        | ||d          }t          j        | ||d          }t          j        |||d          }t          j        ||	|d          }t	          j        ||          t	          j        ||          z   }d}t	          j        ||          }||z  }t	          j        ||          }||z  }|
|z  ||z   |z   z
  |z  }||z  |z  }|||z  ||z  z   z  }||z   ||z  z   }||||fS )a  Computes the number of stages for A/B/AB12/C operands based on heuristics.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler.
        :type mma_tiler_mnk: tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param epi_tile: The epilogue tile shape for AB12.
        :type epi_tile: cute.Tile
        :param epi_tile_c: The epilogue tile shape for C.
        :type epi_tile_c: cute.Tile
        :param ab12_dtype: Data type of operand AB12 (full GEMM output).
        :type ab12_dtype: type[cutlass.Numeric]
        :param ab12_layout: Layout enum of operand AB12.
        :type ab12_layout: utils.LayoutEnum
        :param c_dtype: Data type of operand C (SwiGLU output).
        :type c_dtype: type[cutlass.Numeric]
        :param c_layout: Layout enum of operand C.
        :type c_layout: utils.LayoutEnum
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int

        :return: A tuple containing the computed number of stages for:
                 (ACC stages, A/B operand stages, AB12 stages, C stages)
        :rtype: tuple[int, int, int, int]
        r   r   r   rt   )r2   rS   rU   rW   r7   r   )r\   r  r4   rN   rI   rL   rH   rG   rK   rJ   r)   r   rO   rQ   rR   a_smem_layout_stage_oneb_smem_layout_staged_oneab12_smem_layout_staged_onec_smem_layout_staged_oneab_bytes_per_stagembar_helpers_bytesab12_bytes_per_stage
ab12_bytesc_bytes_per_stagec_bytesrP   total_ab_smemtotal_output_smemtotal_smem_useds                                r,   rM   z)PersistentDenseGemmKernel._compute_stagesV  s   ^   #."@	#
 #
 $/#A	$
 $
  '2&F	'
 '
# $/#C	$
 $
  "/9PQQTXTfgn  qI  UJ  UJ  J!#1*>YZZ)N:
 .w8PQQ#k1 &26H:6UX_6_`eww "$66E%)=)NQbepQp)pq'*;;iJ\>\\lNKGGr.   r=   c                     t          j        |d          }t          j        | |          }|d         j        }g |dR }t	          j        ||          }t          j                            ||          }	||	fS )a  Use persistent tile scheduler to compute the grid size for the output tensor AB12.

        :param ab12: The output tensor AB12
        :type ab12: cute.Tensor
        :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
        :type cta_tile_shape_mnk: tuple[int, int, int]
        :param cluster_shape_mn: Shape of each cluster in M, N dimensions.
        :type cluster_shape_mn: tuple[int, int]
        :param max_active_clusters: Maximum number of active clusters.
        :type max_active_clusters: cutlass.Constexpr

        :return: A tuple containing:
            - tile_sched_params: Parameters for the persistent tile scheduler.
            - grid: Grid shape for kernel launch.
        :rtype: Tuple[utils.PersistentTileSchedulerParams, tuple[int, int, int]]
        rs   )tiler)r   r   r   )r7   r   zipped_divider<   r'   PersistentTileSchedulerParamsr  get_grid_shape)
rk   r=   r   rn   
ab12_shapegab12num_ctas_mnlcluster_shape_mnlr   r   s
             r,   r   z'PersistentDenseGemmKernel._compute_grid  s    . [!3_EE
"4z:::45;2.222!?N_``2AABSUhii $&&r.   r   rO   c                     |                      |dd                   }|                     t          j        ||                    }t	          j        |          }|S )a  
        Compute the number of tensor memory allocation columns.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler: The shape (M, N, K) of the MMA tile.
        :type mma_tiler: tuple[int, int, int]
        :param num_acc_stage: The stage of the accumulator tensor.
        :type num_acc_stage: int

        :return: The number of tensor memory allocation columns.
        :rtype: int
        Nr   )r   r   r7   r   r'   get_num_tmem_alloc_cols)r\   r   rO   rd  re  r[   s         r,   rZ   z6PersistentDenseGemmKernel._compute_num_tmem_alloc_cols  sU    & //	"1">>	//I}0U0UVV#;KHH""r.   ))r   r   r   __doc__r   r   Numericboolr   intr-   r_   r7   jitTensorr   	ConstexprcudaCUstreamr   r   TiledMmaCopyAtomr   LayoutComposedLayoutr   Tiler'   r  r   r  	TiledCopyr%  r'  r(  staticmethodr   rM   r   rZ    r.   r,   r   r   ^   sO       0 0d8H(8H 8H CHo	8H
  S/8H 8H 8H 8Htut ut utn 
X *P)O} };} ;} k	}
 ;} } %.} } &} } } X}@ 
[Y
'=Y
' MY
' 	Y
'
 MY
' Y
'  .Y
' T]+Y
' ;Y
' Y
' "[Y
' #1Y
' #1Y
' "'t{D4G'M!NY
' $DK1Dd$JKY
'  )!Y
'" I#Y
'$ !>%Y
'& &'Y
'( )Y
' Y
' Y
' [Y
'v>=m>= k>= ;	>=
 >= )>= I>= w45>= 
t~t{DKD	E>= >= >= >=@.Q.Q ;.Q [	.Q
 .Q m.Q {.Q K.Q 
t~t{DKdkSWS^^	_.Q .Q .Q .Q`EOmEO T]DN23EO T]DN23	EO
 ;EO EO )EO IEO {EO KEO 
t}dmT[$+t{TXT__	`EO EO EO EON hH=hHS#s]+hH go&hH go&	hH
 )hH IhH )hH %hH go&hH "hH hH hH 
sC}	hH hH hH \hHT 'k'!#sC-0'  S/' %.	'
 
u2E#sC-4HH	I' ' ' \'@ #=#c3'# # 
	# # # \# # #r.   r   c                       e Zd ZdZdeej                 dedee	e	f         dee	e	f         fdZ
ej        d fdej        d	ej        ee	e	e	f                  d
ej        ee	e	e	f                  dej        dej        ee	e	e	f                  dej        ee	e	e	f                  dej        dej        ee	e	e	f                  dej        ee	e	e	f                  dej        dej        dej        dej        dej        fd            ZdS )!PersistentDenseGemmKernelNoDlpackzWrapper around PersistentDenseGemmKernel that avoids DLPack.

    This wrapper constructs cute.Tensors directly from cute.Pointer, shapes, and
    explicit layout orders for operands A, B, AB12 and C.
    r   r   r   r   c                 6    t          ||||          | _        d S )N)r   r   r   r   )r   r   r*   s        r,   r-   z*PersistentDenseGemmKernelNoDlpack.__init__  s*     0+%-	
 
 
r.   c                 :    | dt          j        |  d          z   z  S ra   rb   re   s    r,   rg   z*PersistentDenseGemmKernelNoDlpack.<lambda>!  rh   r.   a_ptra_shapea_orderb_ptrb_shapeb_orderab12_ptrr  
ab12_orderc_cuterm   rn   ro   rp   c           
      :   t          j        |t          j        ||                    }t          j        |t          j        ||                    }t          j        |t          j        ||	                    }|                     ||||
||||           d S )N)order)r  )r7   r  make_ordered_layoutr   )r+   r  r  r  r  r  r  r  r  r  r  rm   rn   ro   rp   a_cuteb_cute	ab12_cutes                     r,   r   z*PersistentDenseGemmKernelNoDlpack.__call__  s    $ !%0HX_0`0`0`aaa!%0HX_0`0`0`aaa$Xd6Nzak6l6l6lmmm			
 		
 		
 		
 		
r.   N)r   r   r   r  r   r   r  r  r   r  r-   r7   r  Pointerr  r  r   r  r  r   r  r.   r,   r  r    s        
(
 
 CHo	

  S/
 
 
 
 
X  *P)O
 
|
 "5c3#78
 "5c3#78	

 |
 "5c3#78
 "5c3#78
 ,
 %eCcM&:;
 %eCcM&:;
 
 
 %.
 
 &
 
 
 X
 
 
r.   r  ) typingr   r   r   r   cuda.bindings.driverbindingsdriverr  r   cutlass.cuter7   cutlass.cute.nvgpur   r   cutlass.utilsr'   cutlass.pipeliner   cutlass.cute.testingtestingcutlass.utils.blackwell_helpersblackwell_helpersr2   cutlass.cute.runtimer	   cutlass.cute.mathrc   inspectr/  r   r  r  r.   r,   <module>r     sy  : 0 / / / / / / / / / / / # # # # # # # # #        / / / / / / / /       # # # # # # & & & & & & & & & 5 5 5 5 5 5 5 5 5 , , , , , ,                    
,^[# [# [# [# [# [# [# [#|,3
 3
 3
 3
 3
 3
 3
 3
 3
 3
r.   