
    `io              -       ^   d Z ddlmZmZ ddlmZ ddlZddlm	Z	m
Z
 ddlZddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZ  G d de          ZddlZ ej        e          Zi Zdddej         ej!        ej!        ddddddddfdej"        dej"        dej"        dej"        dej"        dej"        dej"        de
ej"                 de
ej"                 de
ej"                 dej#        dej#        dej#        de$d e	e%e%f         d!e
e	e%e%f                  d"e%d#e&d$e%d%e&d&e
ej'                 d'ef,d(Z(dS ))z
API for Grouped GEMM SwiGLU Forward Kernel (SM100+)

This module provides the API class for contiguous grouped block-scaled GEMM
with SwiGLU activation for MoE (Mixture of Experts) workloads.
   )&BlockScaledContiguousGroupedGemmKernel.BlockScaledContiguousGroupedGemmKernelNoDlpack    )driverN)TupleOptional)from_dlpackmake_ptr)version)_convert_to_cutlass_data_type)APIBase	TupleDictceil_divis_power_of_2c            /       j    e Zd ZdZddddddej        ddddddfdej        dej        d	ej        d
ej        dej        dej        dej        dej        dej        dej        deej                 deej                 deej                 deej                 deej                 deej                 dej        de	e
e
f         dee	e
e
f                  de
dede
def. fdZdefd Zd5d!eej                 ddfd"Z	 	 	 	 	 	 	 	 	 d6d#ej        d$ej        d%ej        d&ej        d'ej        d(ej        d)ej        d*ej        d+ej        d,eej                 d-eej                 d.eej                 d/eej                 d0eej                 d1eej                 d2eej                 d!eej                 d3eddf&d4Z xZS )7GroupedGemmSwigluSm100am  API class for Grouped GEMM SwiGLU forward operation on SM100+ GPUs.

    This kernel performs contiguous grouped block-scaled GEMM with SwiGLU activation,
    designed for MoE (Mixture of Experts) workloads.

    Key features:
    - Supports variable M per group (aligned to cta_tile_m)
    - Contiguous memory layout for A and D tensors
    - Block-scaled quantization support (MXF8, MXF4, NVF4)

    Example:
        >>> api = GroupedGemmSwigluSm100(
        ...     sample_a=a_tensor,
        ...     ...
        ... )
        >>> api.check_support()
        >>> api.compile()
        >>> api.execute(..., stream)
    N   r      Fr   sample_asample_bsample_csample_d
sample_sfa
sample_sfbsample_tile_idx_to_expert_idxsample_num_non_exiting_tilessample_alphasample_d_colsample_sfd_rowsample_sfd_colsample_amaxsample_norm_constsample_probsample_m_split_cumsum	acc_dtypemma_tiler_mncluster_shape_mnsf_vec_size
vector_f32	m_aligneddiscrete_col_sfdc                    t                                                       | j                            d           | j                            d           || _        || _        || _        || _        || _	        || _
        || _        || _        |	| _        |
| _        || _        || _        || _        |                     |dd          | _        || _        || _        || _        || _        |d         dk    | _        || j        rdnd	| _        n|| _        || _        || _        || _        || _        | j        j        }| j        j        }| j        j        }t?          j         tB          j"                  }| #                    |          }| #                    |          }| #                    |          }| $                    |          }| $                    |          } | $                    |          }!t?          j         |j%                  t?          j         d
          k    }"|p|p
|p|s| s|!o|" }#|#r'| j                            d           tL          | _'        ntP          | _'        d| _)        | j                            d           dS )a  Initialize the GroupedGemmSwigluSm100 API.

        :param sample_a: Sample A tensor (valid_m, k, 1)
        :param sample_b: Sample B tensor (n, k, l) where l = num_groups
        :param sample_c: Sample C tensor for intermediate storage
        :param sample_d: Sample D output tensor (valid_m, n/2, 1) after SwiGLU
        :param sample_sfa: Sample scale factor A tensor
        :param sample_sfb: Sample scale factor B tensor
        :param sample_tile_idx_to_expert_idx: Mapping from tile index to expert/group index
        :param sample_num_non_exiting_tiles: Number of valid tiles
        :param sample_alpha: Per-group alpha scaling factors
        :param sample_d_col: Column-quantized D tensor (required for quant kernel)
        :param sample_sfd_row: Optional row scale factor for D
        :param sample_sfd_col: Optional column scale factor for D
        :param sample_amax: Optional amax tensor for quantization
        :param sample_norm_const: Optional normalization constant
        :param sample_prob: Optional probability tensor for gating
        :param sample_m_split_cumsum: Optional m split cumulative sum tensor. Required when discrete_col_sfd is True.
        :param acc_dtype: Accumulator data type
        :param mma_tiler_mn: MMA tiler shape (M, N)
        :param cluster_shape_mn: Cluster shape (M, N)
        :param sf_vec_size: Scale factor vector size
        :param vector_f32: Use vectorized f32 operations
        :param m_aligned: Alignment for group M dimension
        :param discrete_col_sfd: Boolean, True to generate discrete col-major scale factor tensor. Only applies when already output scale factor tensors are provided.
        z-GroupedGemmSwigluSm100 is an experimental APIzEntering __init__r   
norm_constr   r   N)   r   )r   r   z2.10.0zQUsing NoDlpack kernel due to FP4 dtype or FP8 dtype on incompatible torch versionTz__init__ completed)*super__init___loggerwarningdebugr   r   r   r   r   r   r   r   r   r   r    r!   r"   _unpad_tensor_to_ndimr#   r$   r%   r&   r'   use_2cta_instrsr(   r)   r*   r+   r,   dtyper   parsetorch__version__	_is_fp4x2_is_fp8base_versionr   _kernelr   _interpret_uint8_as_fp4x2)%selfr   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   ab_dtypec_dtyped_dtypetorch_version	is_ab_fp4is_c_fp4is_d_fp4	is_ab_fp8is_c_fp8is_d_fp8_fp8_dlpack_supporteduse_no_dlpack_kernel	__class__s%                                       ~/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cudnn/grouped_gemm/grouped_gemm_swiglu/api.pyr1   zGroupedGemmSwigluSm100.__init__J   s   n 	LMMM./// !   $$-J*,H)( ),,&!%!;!;<MqR^!_!_&%:" #(+A#5#.2.B$NFFD!!$4D!&$" 0 =&-%-%e&788NN8,,	>>'**>>'**LL**	<<((<<(( 'm.H I IW][cMdMd d(   HH   H   HiFgS[Fg_g  FG  rG  nG 	BLrsssIDLLADL)-&011111    returnc           	      D     j                             d           t          d  j         j         j        fD                       }t          d  j         j         j        fD                       }                     |p| d           | _         j        r( j        s! j         	                    d           d _                              j        o j
        du d            j                             d	                                 j        d
          \  }}}                      j        d          \  }}}                      j        d          \  }}}                      j        d          \  }}	}                      j        ||dfd                                 j        |||fd                                 j        ||dfd                                 j        ||dz  dfd                                 j        ||dz  dfd           t%          t%          | j                  d          }
                      j        ddt%          |d          d|
dfd                                 j        ddt%          |d          d|
|fd           t%          t%          |dz   j                  d          }                      j        ddt%          |d          d|dfd           t%          t%          | j                  d          }                      j        ddt%          |dz  d          d|dfd                                 j        |fd                                 j        |ddfd                                 j        |dfd                                 j        d d!                                 j        d d"                                 j
        |dz   fd#                                 j        |d||z  fgd$%          \  } _                              j        |d||z  fgd&%          \  } _                              j        |d||z  fgd'%          \  } _                              j        |	d||	z  fgd(%          \  } _                              j        |	d||	z  fgd)%          \  } _         j         _          j                             d*            !                     j        tD          j#        tD          j$        tD          j%        tD          j&        gd+,           _'         !                     j         j'        dd-.            !                     j        tD          j(        tD          j&        gd/,           _)         !                     j         j)        dd0.            !                     j         j)        dd1.            !                     j         j)        dd2.                                 j        d3vd4 j                                          j)        tD          j&        fv o
 j        dk    d5 j)         d6 j         d7                                 *                     j'                  o
 j        d8k    d9 j'         d6 j         d7            !                     j+        tD          j,        d:d;.            !                     j        tD          j,        tD          j-        tD          j.        tD          j&        tD          j%        tD          j#        gdd<.           _/         0                     j'                  r: !                     j        tD          j.        tD          j,        gdd=.           _1        nY !                     j        tD          j-        tD          j.        tD          j&        tD          j%        tD          j#        gd,           _1         !                     j         j1        dd>.            2                     0                     j'                  o j        d8k    o j1        tD          j,        k    d?            j                             d@                                 j3         o j4        dA         dBvdC j4        dA                                           j3        o j4        dA         dDvdE j4        dA                                           j4        d         dDvdF j4        d                                           j5        dA          j3        rdndz  dAk    dG j5        dA                                           j5        dA          j5        d         z  d8k    ow j5        dA         dAk    of j5        d         dAk    oU j5        dA         dk    oD j5        d         dk    o3tm           j5        dA                   otm           j5        d                    dH j5        dA          dI j5        d                      j5        dA          j3        rdndz   j4        dA         z  }                     |dDvdJ|                                  j7         j4        dA         z  dAk    dK j7         dL j4        dA          dM            j                             dN            fdO}                      | j'         j        |||f          o3 | j'         j        |||f          o | j1         j         |||f           dP            2                     *                     j'                  o* j4        d         dk    o *                     j1                  dQdRz               2                     0                     j'                  o j/        tD          j-        tD          j.        fvdS j/                    tD          j8        9                                stu          dT          tD          j8        ;                                }tD          j8        <                    |          \  }}|dUz  |z   }|dVk     rtu          dW| dX|           dY _=         j                             dZ           dYS )[zxCheck if the kernel configuration is supported.

        :return: True if supported, raises exception otherwise
        zEntering check_supportc              3      K   | ]}|d u V  	d S N .0xs     rN   	<genexpr>z7GroupedGemmSwigluSm100.check_support.<locals>.<genexpr>   s&      mmQqDymmmmmmrO   c              3      K   | ]}|d uV  	d S rS   rT   rU   s     rN   rX   z7GroupedGemmSwigluSm100.check_support.<locals>.<genexpr>   s&      rr!rrrrrrrO   zOsample_sfd_row, sample_sfd_col, and norm_const must be all None or all not NonezTdiscrete_col_sfd is True but generate_sfd is False, discrete_col_sfd will be ignoredFNz?sample_m_split_cumsum is required when discrete_col_sfd is Truez"Checking tensor shapes and stridesr   namer   r   r   r   ABCr/   DD_col          SFASFBSFD_rowSFD_colalphaprobamax)r   num_non_exiting_tilesr.   m_split_cumsumzA must have k-major layout)strideextra_error_msgzB must have k-major layoutzC must have n-major layoutzD must have n-major layoutzD_col must have n-major layoutzChecking data typeszA/B)r7   r[   zB must have the same dtype as A)r7   r[   rn   zSFA/SFB/SFD_row/SFD_colz#SFB must have the same dtype as SFAz'SFD_row must have the same dtype as SFAz'SFD_col must have the same dtype as SFA)r   rb   z"sf_vec_size must be 16 or 32, got z	sf_dtype z and sf_vec_size z combination is not supportedr   z	ab_dtype AccumulatorzAccumulator must be float32zC must have the same dtype as Az.D must be bf16 or float32 when ab_dtype is fp4z#D_col must have the same dtype as DzInvalid configuration: fp4 ab_dtype, sf_vec_size 16, d_dtype float32 is not supported. Please use sf_vec_size 32 or d_dtype bf16 insteadz)Checking MMA tile shape and cluster shaper   )@   rc   z>MMA tiler M must be 64 or 128 when use_2cta_instrs=False, got )rc   r   z>MMA tiler M must be 128 or 256 when use_2cta_instrs=True, got z$MMA tiler N must be 128 or 256, got zJcluster_shape_mn[0] must be divisible by 2 when use_2cta_instrs=True, got zrInvalid cluster shape: expected values to be powers of 2 and cluster_shape_mn[0] * cluster_shape_mn[1] <= 16, got ,zIInvalid cluster tiler shape: expected cluster_tiler_m in {128, 256}, got zNInvalid m_aligned: expected m_aligned to be divisible by mma_tiler_mn[0], got z % z != 0zChecking tensor alignmentc                 z    |dk    }|rdnd}||         }dt          | j                  j        z  }||z  dk    S )N)r   r   r/   r   r   rc   )interpret_uint8_as_fp4x2)r   r?   width)r7   stride_ordertensor_shapeis_mode0_majormajor_mode_idxnum_major_elementsnum_contiguous_elementsr@   s          rN   check_contigous_16B_alignmentzKGroupedGemmSwigluSm100.check_support.<locals>.check_contigous_16B_alignmentb  sj    )Y6N"07QQaN!-n!=&,1Nuos  pN  2O  2O  2O  2U  'V#%(??1DDrO   z5Invalid tensor alignment: tensors must be 16B alignedzsInvalid configuration: fp8 ab_dtype and sf_vec_size 32 with mma_tiler_mn[1] == 128 and fp8 d_dtype is not supportedz)Please use mma_tiler_mn[1] == 256 insteadzRInvalid configuration: for fp4 ab_dtype, c_dtype must be float16 or bfloat16, got zCUDA is not available
   d   zBGroupedGemmSwiglu requires SM100+ compute capability, but found SMz on device Tz$check_support completed successfully)>r2   r4   allr    r!   r#   _value_error_ifgenerate_sfdr,   r3   r%   _tensor_shaper   r   r   r   _check_tensor_shaper   r   r)   r   r   r   r$   r"   r   _check_tensor_stridea_stride_orderb_stride_orderc_stride_orderd_stride_orderd_col_stride_ordercd_stride_order_check_dtyper9   float4_e2m1fn_x2uint8float8_e5m2float8_e4m3fnrA   float8_e8m0fnusf_dtyper<   r&   float32float16bfloat16rB   r;   rC   _not_implemented_error_ifr6   r'   r(   r   r+   cudais_availableRuntimeErrorcurrent_deviceget_device_capability_is_supported)r@   all_none	none_nonetensor_mk_onen_ln_2rest_krest_n2rest_mcluster_tiler_mr{   devicemajorminorcompute_capabilitys   `                  rN   check_supportz$GroupedGemmSwigluSm100.check_support   s   
 	3444mm4+>@SUYUk*lmmmmmrr0CTEXZ^Zp/qrrrrr	&Y']	
 	
 	
 &  	*): 	*L  !wxxx$)D!T2Yt7QUY7Y  \]  	^  	^  	^?@@@ ..t}:.NN!T$$T]$DD1a''J'GG
1d))$-j)II3  1a0@#FFF  Aq	3???  1a0@#FFF  1610EsKKK  !2XqAvq4I7SSS(1d&677;;  2q(8S:Q:QSTV\^_2`bghhh  2q(1c:J:JAvWX2Y[`aaa8AFD,<==qAA  !4r1hxQT>U>UWXZacd6egpqqq(8T-=>>BB  !4r1hqAvs>S>SUVX^`a6bdmnnn  !2QD'BBB  !1Ha3CVLLL  !1Aq66BBB  !BDJabbb  !7|LLL  !;a!eXGWXXX!%!:!:4=STVWYadeYeRfQg  zV!:  "W  "W4!%!:!:4=STVWYZ]^Y^R_Q`  sO!:  "P  "P4!%!:!:4=STVWYadeYeRfQg  zV!:  "W  "W4!%!:!:4=SVXY[cfi[iRjQk  ~Z!:  "[  "[4%)%>%>Q3'?&@Rr &? &
 &
"4"  $20111))M&!#	  * 	
 	
 	$-t}3Xyzzz))O')<=* * 
 

 	$/U  ]B  	C  	C  	C$-T]  eN  	O  	O  	O$-T]  eN  	O  	O  	OT-X=?vdhdt?v?vwwwMe122Mt7G27Mggg8Hggg	
 	
 	
 	LL''BD,<,B  E`PTP]  E`  E`pt  qA  E`  E`  E`	
 	
 	
 	$.M  dA  	B  	B  	B((M=%-ATV[Vgini  A=	 ) 
 
 >>$-(( 	,,enem%D3  aQ -  DLL  ,,MN'%*  - 
 
DL 	$+4<g  `E  	F  	F  	F&&NN4=))fd.>".DfY^YfIf X	
 	
 	

 	FGGG$$N):1)=Y)NcTM^_`Macc	
 	
 	
 	 KT%6q%9%KcTM^_`Macc	
 	
 	
 	T.q1C  FClpl}~  mA  FC  FC  	D  	D  	D!!$T-A(HqIQNsY]YnopYqss	
 	
 	
 	%a(4+@+CCrI <)!,q0<)!,q0< )!,1< )!,1	<
 "$"7":;;< "$"7":;; w  BF  BW  XY  BZ  w  w  ]a  ]r  st  ]u  w  w	
 	
 	
  03T=Q8XWXY]a]nop]qq
 	_J>  A`  O^  A`  A`  	a  	a  	a
 	NT.q11Q6 L]a]k  L  Lpt  qB  CD  qE  L  L  L	
 	
 	

 	6777	E 	E 	E 	E 	E 	--dmT=PS[]^`aRbcc h11$-ATWXZ[]^V_``h11$,@TW_abdeVfgg
 D	
 	
 	
 	&&\\$-((lt/@/Cs/JlQUQ]Q]^b^jQkQk C:;	
 	
 	

 	&&NN4=))ct|EMSXSaCb/boaeamoo	
 	
 	
 z&&(( 	86777**,,z77??u"RZ%/##   Phz   P   P  HN   P   P  Q  Q  Q!ABBBtrO   current_streamc           
         | j                             d           |                     |          }|                                  |                     | j        t          | j                  | j        | j	        | j
        | j        | j        | j                  }t          j                                        }|                    | j
        d         | j
        d         z            }| j        t$          u r| j                             d           t'          j        |fi dt+          | j        d          d	t+          | j        d          d
t+          | j        d          dt+          | j        d          d| j        t+          | j        d          nddt+          | j        d          dt+          | j        d          d| j        t+          | j        d          ndd| j        t+          | j        d          ndd| j        t+          | j        d          ndd| j         t+          | j                   nddt+          | j!        d          dt+          | j"        d          d| j#        t+          | j#        d          nddt+          | j$        d          d| j%        t+          | j%        d          ndd|d|| _&        n| j        tN          u r| j                             d           | (                    | j        d          \  }}}| (                    | j        d          \  }}	}
| (                    | j        d          \  }}}| (                    | j        d           \  }}}| (                    | j        d!          \  }}}| (                    | j        d"          \  }}}| (                    | j        d#          \  }}}| (                    | j        d$          \  }}}| (                    | j        d%          \  }}}| (                    | j        d&          \  } }!}"| (                    | j         d'          \  }#}$}%| (                    | j!        d(          \  }&}'}(| (                    | j"        d)          \  })}*}+| (                    | j#        d          \  },}-}.| (                    | j$        d          \  }/}0}1| (                    | j%        d          \  }2}3}4t'          j        |fi d*|d+|d,|d-|d.|	d/|
d0|d1|d2|d3|d4|d5|d6|d7|d8|d9|d:|d;|d<|d=|d>|d?|d@|dA|dB|dC|dD|dE| dF|!dG|"dH|#dI|$dJ|%dK|&dL|'dM|(dN|)dO|*dP|+dQ|,dR|-dS|.dT|/dU|0dV|1dW|2dX|3dY|4d|d|| _&        ntS          dZ| j                   | j                             d[           dS )\zOCompile the kernel.

        :param current_stream: CUDA stream to use
        zEntering compiler)   r&   r6   r'   r(   r*   r   r,   r   r   z-Compiling grouped_gemm_swiglu kernel (dlpack)ar   assumed_alignbcdd_colNsfasfbsfd_row_tensorsfd_col_tensoramax_tensornorm_const_tensortile_idx_to_expert_idxrk   rl   rh   ri   max_active_clustersstreamz0Compiling grouped_gemm_swiglu kernel (no_dlpack)r\   rZ   r]   r^   r_   r`   rd   re   rf   rg   rj   r.   tile_idx	num_tilesa_ptra_shapea_orderb_ptrb_shapeb_orderc_ptrc_shapec_orderd_ptrd_shaped_order	d_col_ptrd_col_shaped_col_ordersfa_ptr	sfa_shape	sfa_ordersfb_ptr	sfb_shape	sfb_ordersfd_row_ptrsfd_row_shapesfd_row_ordersfd_col_ptrsfd_col_shapesfd_col_orderamax_ptr
amax_shape
amax_ordernorm_const_ptrnorm_const_shapenorm_const_ordertile_idx_to_expert_idx_ptrtile_idx_to_expert_idx_shapetile_idx_to_expert_idx_ordernum_non_exiting_tiles_ptrnum_non_exiting_tiles_shapenum_non_exiting_tiles_orderm_split_cumsum_ptrm_split_cumsum_shapem_split_cumsum_order	alpha_ptralpha_shapealpha_orderprob_ptr
prob_shape
prob_order!Unreachable: invalid kernel type zKernel compiled successfully)*r2   r4   _get_default_stream_ensure_support_checkedr>   r)   r   r&   r6   r'   r(   r*   r   r,   cutlassutilsHardwareInfoget_max_active_clustersr   cutecompiler	   r   r   r   r   r   r   r   r    r!   r"   r#   r   r   r%   r   r$   _compiled_kernelr   _make_cute_tensor_descriptorNotImplementedError)5r@   r   gemm_swigluhardware_infor   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   tile_idx_ptrtile_idx_shapetile_idx_ordernum_tiles_ptrnum_tiles_shapenum_tiles_orderr   r   r   r   r   r   r   r   r   s5                                                        rN   r   zGroupedGemmSwigluSm100.compile  s   
 	-...11.AA$$&&&ll(3DNCC 0*!2*!2 # 	
 	
  2244+CCDDYZ[D\_c_tuv_wDwxx<AAALNOOO$(L% % %dm2>>>>% dm2>>>>% dm2>>>>	%
 dm2>>>>% KOJ[Jgk$"32FFFFmq%  rBBBB%  rBBBB% VZUhUt{4+>bQQQQz~% VZUhUt{4+>bQQQQz~% PTO_OkK(8KKKKqu% JNI_Ik+d.D"E"E"Equ% (343Ueg'h'h'h'h% '2$2Sce&f&f&f&f% ]a\v  ]C{4+EUWXXXX  IM%  "$"32FFFF!%" IMHXHd[!1DDDDjn#%$ %8$7%%& &~'%D!!* \KKKLQRRR&*&G&G\_&G&`&`#E7G&*&G&G\_&G&`&`#E7G&*&G&G\_&G&`&`#E7G&*&G&G\_&G&`&`#E7G262S2STXTels2S2t2t/I{K,0,M,Mdodi,M,j,j)GY	,0,M,Mdodi,M,j,j)GY	8<8Y8YZ^Zmt}8Y8~8~5K8<8Y8YZ^Zmt}8Y8~8~5K/3/P/PQUQahn/P/o/o,Hj*AEAbAbcgcy  AMAb  BN  BN>N,.>;?;\;\]a]  GQ;\  <R  <R8L..>B>_>_`d  aB  IT>_  ?U  ?U;M?OMQMnMn*1A No N NJ 46J 372S2STXTels2S2t2t/I{K/3/P/PQUQahn/P/o/o,Hj*$(L4% 4% 4%e4%  4%  	4%
 e4%  4%  4% e4%  4%  4% e4%  4%  4% $)4% (K4%  (K!4%"  #4%$ $)%4%& $)'4%(  )4%* $)+4%, $)-4%. (K/4%0 ,m14%2 ,m34%4 (K54%6 ,m74%8 ,m94%: ";4%< &:=4%> &:?4%@  .~A4%B "2!1C4%D "2!1E4%F ,8<G4%H .<^I4%J .<^K4%L +8-M4%N -<OO4%P -<OQ4%R $6#5S4%T &:%9U4%V &:%9W4%X $)Y4%Z (K[4%\ (K]4%^ "_4%` &:a4%b &:c4%d %8$7e4%f &~g4%D!!l &&X$,&X&XYYY9:::::rO   a_tensorb_tensorc_tensord_tensor
sfa_tensor
sfb_tensorr   rk   alpha_tensord_col_tensorr   r   r   r   prob_tensorrl   skip_compilec                    | j                             d           |                     |          }|                     |dd          }|sR|                     | j        du d           | j        t          u rN| j                             d           |                     t          |d          t          |d          t          |d          t          |d          |
t          |
d          ndt          |d          t          |d          |t          |d          nd|t          |d          nd|t          |d          nd|t          |d          ndt          |d          t          |d          |t          |d          ndt          |	d          |t          |d          nd|	           n| j        t          u r| j                             d
           | 
                    |d          }| 
                    |d          }| 
                    |d          }| 
                    |d          }| 
                    |
d          }| 
                    |d          }| 
                    |d          }| 
                    |d          }| 
                    |d          }| 
                    |d          }| 
                    |d          }| 
                    |d          }| 
                    |d          }| 
                    |d          } | 
                    |	d          }!| 
                    |d          }"|                     |||||||||||||| |!|"|           nt          d| j                   | j                             d           |duo|duo|du}#| j        o|#}$|                     | j        t          | j                  | j        | j        | j        | j        |#|$          }%t(          j                                        }&|&                    | j        d         | j        d         z            }'| j        t          u rg| j                             d            |%dbi dt          |d          dt          |d          dt          |d          dt          |d          d|
t          |
d          nddt          |d          dt          |d          d|t          |d          ndd|t          |d          ndd|t          |d          ndd|t          |          nddt          |d          dt          |d          d| j        t          |d          nddt          |	d          d |t          |d          ndd!|'d"| n| j        t          u rg| j                             d#           |                     |d$%          \  }}(})|                     |d&%          \  }}*}+|                     |d'%          \  }},}-|                     |d(%          \  }}.}/|                     |
d)%          \  }}0}1|                     |d*%          \  }}2}3|                     |d+%          \  }}4}5|                     |d,%          \  }}6}7|                     |d-%          \  }}8}9|                     |d.%          \  }}:};|                     |d%          \  }}<}=|                     |d/%          \  }}>}?|                     |d0%          \  }}@}A|                     |	d%          \  }!}B}C|                     |d %          \  }"}D}E|                     |d%          \  } }F}G |%dbi d1|d2|(d3|)d4|d5|*d6|+d7|d8|,d9|-d:|d;|.d<|/d=|d>|0d?|1d@|dA|2dB|3dC|dD|4dE|5dF|dG|6dH|7dI|dJ|8dK|9dL|dM|:dN|;dO|dP|<dQ|=dR|dS|>dT|?dU|dV|@dW|AdX| dY|FdZ|Gd[|!d\|Bd]|Cd^|"d_|Dd`|Ed!|'d"| nt          d| j                   | j                             da           dS )ca  Execute the compiled kernel.

        :param a_tensor: Input A tensor
        :param b_tensor: Input B tensor (weights)
        :param c_tensor: Intermediate C tensor
        :param d_tensor: Output D tensor
        :param sfa_tensor: Scale factor A
        :param sfb_tensor: Scale factor B
        :param tile_idx_to_expert_idx: Tile to expert mapping
        :param num_non_exiting_tiles: Number of valid tiles
        :param alpha_tensor: Per-group scaling factors
        :param d_col_tensor: Optional column-quantized output
        :param sfd_row_tensor: Optional row scale factor D
        :param sfd_col_tensor: Optional column scale factor D
        :param amax_tensor: Optional amax tensor
        :param norm_const_tensor: Optional normalization constant
        :param prob_tensor: Optional probability tensor
        :param m_split_cumsum: Optional m split cumulative sum tensor
        :param current_stream: CUDA stream
        :param skip_compile: If True, use JIT execution without prior compilation
        zEntering executer   r.   NzBKernel not compiled; call compile() first or use skip_compile=Truez-Executing grouped_gemm_swiglu kernel (dlpack)r   r   )r   r   r   r   r   r   r   r   r   r   r   r   rk   rl   rh   ri   r   z0Executing grouped_gemm_swiglu kernel (no_dlpack))r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   z'Executing without compiled kernel (JIT)r   r   z1JIT executing grouped_gemm_swiglu kernel (dlpack)r   r   r   r   r   r   r   r   r   r   r   r   rk   rl   rh   ri   r   r   z4JIT executing grouped_gemm_swiglu kernel (no_dlpack)r\   rZ   r]   r^   r_   r`   rd   re   rf   rg   rj   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   zExecute completedrT   )r2   r4   r   r5   _runtime_error_ifr   r>   r   r	   r   _make_cute_pointerr   r,   r)   r   r&   r6   r'   r(   r*   r   r   r   r   rl   r   )Hr@   r  r  r  r	  r
  r  r   rk   r  r  r   r   r   r   r  rl   r   r  r   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   r,   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r   r   r   r   r   r   sH                                                                           rN   executezGroupedGemmSwigluSm100.execute  s   T 	-...11.AA 667H!\ZZ v	^""%-T  
 |EEE""#RSSS%%!("===!("===!("===!("===IUIa+l"EEEEgk#JbAAA#JbAAATbTn;~R#P#P#P#PtxTbTn;~R#P#P#P#PtxNYNeKr J J J JkoZkZwk2CSU&V&V&V&V  ~B+67M]_+`+`+`*56K[]*^*^*^TbTn;~R#P#P#P#Ptx%l"EEEGRG^[CCCCdh)# &    & !OOO""#UVVV///KK///KK///KK///KK 33LPR3SS	11*B1OO11*B1OO"55nTV5WW"55nTV5WW22;b2QQ!%!8!89JZ\!8!]!]#667M]_6`` $ 7 78M]_ 7 ` `%)%<%<^[]%<%^%^" 33LPR3SS	22;b2QQ%%'## + +%#1/;.;'9'%)# &    ( **\dl*\*\]]]LHIII)5v.PT:TvYjrvYvL#4E,, ,7GG $ 4!.!%!6?)!1 ' 	 	K $M6688M"/"G"GH]^_H`cgcxyzc{H{"|"||EEE""#VWWW   !("====!("==== "("==== "("====	
 JVIa+l"EEEEgk $JbAAAA $JbAAAA UcTn;~R#P#P#P#Ptx UcTn;~R#P#P#P#Ptx OZNeKr J J J Jko IZHek2C&D&D&Dko ,77M]_+`+`+`+` +66K[]*^*^*^*^ UYTgTs;~R#P#P#P#Py} &l"EEEE  HSG^[CCCCdh!" )<(;#$ *>%  ( !OOO""#YZZZ*.*K*KH[^*K*_*_'w*.*K*KH[^*K*_*_'w*.*K*KH[^*K*_*_'w*.*K*KH[^*K*_*_'w6:6W6WXdkr6W6s6s3	;040Q0QR\ch0Q0i0i-I040Q0QR\ch0Q0i0i-I<@<]<]^ls|<]<}<}9]M<@<]<]^ls|<]<}<}9]M373T3TU`gm3T3n3n0*jEIEfEfgx  @LEf  FM  FMB 02B?C?`?`aw  I?`  @J  @J<nnBFBcBcdy  ALBc  CM  CM?6:6W6WXdkr6W6s6s3	;373T3TU`gm3T3n3n0*jQUQrQr  tB  IYQr  RZ  RZN"$8:N 3 3 3%3#G3 $G3  %	3
 $G3 $G3  %3 $G3 $G3  %3 $G3 $G3 (i3 !,3 !,3  $G!3" (i#3$ (i%3& $G'3( (i)3* (i+3, !,-3. #0-/30 #0-132 !,334 #0-536 #0-738 &X93:  *z;3<  *z=3> $2>?3@ &6%5A3B &6%5C3D 0<|E3F 2@G3H 2@I3J /<mK3L 1@M3N 1@O3P (:'9Q3R *>)=S3T *>)=U3V (iW3X !,Y3Z !,[3\ &X]3^  *z_3`  *za3b )<(;c3d *>e3 3 3j **\dl*\*\]]]./////rO   rS   )	NNNNNNNNF)__name__
__module____qualname____doc__r9   r   Tensorr   r7   r   intboolr1   r   r   CUstreamr   r  __classcell__)rM   s   @rN   r   r   5   s        D 2615.248.28<!&(26: !&7u2 u2,u2 ,u2 ,	u2
 ,u2 Lu2 Lu2 (-|u2 ',lu2 lu2 lu2 !.u2 !.u2  el+!u2" $EL1#u2$ el+%u2&  (5'u2* ;+u2, CHo-u2. #5c?3/u20 1u22 3u24 5u26 7u2 u2 u2 u2 u2 u2nGt G G G GR|; |;ht}&= |; |; |; |; |;R 041515.248.21526"'g0 g0,g0 ,g0 ,	g0
 ,g0 Lg0 Lg0 !&g0  %|g0 lg0 u|,g0 !.g0 !.g0 el+g0 $EL1g0  el+!g0" !.#g0$ !/%g0& 'g0( 
)g0 g0 g0 g0 g0 g0 g0 g0rO   r   r   r   r   Fr   r  r  r
  r  r   rk   r  r   r  rl   r&   rB   rC   cd_majorr'   r(   r)   r*   r+   r,   r   rP   c                    | j         \  }}}|j         \  }}}|dz  }t                              d           |dk    rst          j        ||df|d||z  f|| j                  }t          j        ||df|d||z  f|| j                  }t          j        ||df|d||z  f|| j                  }nt          d|           d}d}d} | j        t          j        t          j	        fv r|j        t          j
        t          j        fv rt                              d           |j        }!d	}"t          ||          }#dt          |d
          t          |#d          dddf}$t          j        |$|!| j                                      |"          }t          ||          }%dt          |d
          t          |%d          dddf}&t          j        |&|!| j                                      |"          }|t          j        t          j        fv rPt                              d           t          j        |dft#          d          t          j        | j                  } | j         |j         | j        |j        |                                 |                                |j         |j         |                                |                                |j        |j        ||j         nd||                                nd||j        nd|	|	j         nd|	|	                                nd|	|	j        nd|
|||||||||f}'|'t(          v rNt                              d           t(          |'         }(|(                    | |||||||||||| |||	|           nt                              d           t-          d,i d| d|d|d|d|d|d|d|d|d| d|d|d|d|d |d!|	d"|
d#|d$|d%|d&|d'|d(|}(|(                                s
J d)            |(                    |*           |(                    | |||||||||||| |||	|           |(t(          |'<   t3          |||| ||+          S )-a
  Convenience wrapper for grouped GEMM SwiGLU forward operation.

    This function creates the API, compiles, and executes in one call.
    Compiled kernels are cached for reuse when called with the same configuration.

    Args:
        a_tensor: Input A tensor (valid_m, k, 1)
        b_tensor: Weight B tensor (n, k, l)
        sfa_tensor: Scale factor A
        sfb_tensor: Scale factor B
        tile_idx_to_expert_idx: Tile to expert mapping
        num_non_exiting_tiles: Number of valid tiles
        alpha_tensor: Per-group scaling
        norm_const_tensor: Optional normalization constant. Required when using FP8
            input configurations (i.e., when a_tensor.dtype is FP8 and sfa_tensor.dtype is FP8).
            Should be None for FP4/BF16 input configurations.
        prob_tensor: Optional probability tensor for gating
        m_split_cumsum: Optional m split cumulative sum tensor. Required when discrete_col_sfd is True.
        acc_dtype: Accumulator data type
        c_dtype: Intermediate C tensor data type (always bfloat16)
        d_dtype: Output D tensor data type (fp8 when ab is fp8, bf16 when ab is fp4)
        cd_major: CD major dimension (note: only "n"-major layout is supported)
        mma_tiler_mn: MMA tiler shape
        cluster_shape_mn: Cluster shape
        sf_vec_size: Scale factor vector size
        vector_f32: Use vectorized f32
        m_aligned: M alignment
        discrete_col_sfd: Boolean, True to generate discrete col-major scale factor tensor. Only applies when already output scale factor tensors are provided.
        current_stream: CUDA stream

    Returns:
        TupleDict: A dictionary-like object containing output tensors that can also be unpacked as a tuple.
            Dictionary keys (also the unpacking order):
            - **c_tensor** (torch.Tensor): Intermediate result tensor
            - **d_tensor** (torch.Tensor): Final output tensor after SwiGLU
            - **d_col_tensor** (torch.Tensor): Column-wise output tensor
            - **amax_tensor** (torch.Tensor or None): Absolute maximum values (for quantization)
            - **sfd_row_tensor** (torch.Tensor or None): Row-wise scale factors for D (FP8 only)
            - **sfd_col_tensor** (torch.Tensor or None): Column-wise scale factors for D (FP8 only)

            Example usage::

                # Dictionary-style access
                result = grouped_gemm_swiglu_wrapper_sm100(...)
                c = result["c_tensor"]
                d = result["d_tensor"]

                # Tuple unpacking
                c, d, d_col, amax, sfd_row, sfd_col = grouped_gemm_swiglu_wrapper_sm100(...)

                # Integer indexing
                c = result[0]  # c_tensor
    r/   z[grouped_gemm_swiglu_wrapper_sm100: Creating output tensors c_tensor, d_tensor, d_col_tensorr   r   )r7   r   zcd_major must be 'n', got Nzugrouped_gemm_swiglu_wrapper_sm100: Detected fp8 a_dtype and sfa_dtype, constructing sfd_row_tensor and sfd_col_tensor)   ra   r      r/   r   rc   ra   rb   zZgrouped_gemm_swiglu_wrapper_sm100: Detected bf16/float16 d_dtype, constructing amax_tensorz-infzVgroup_gemm_swiglu_wrapper_sm100: Using previously cached GroupedGemmSwigluSm100 object)r  r  r  r	  r
  r  r   rk   r  r  r   r   r   r   r  rl   r   zgroup_gemm_swiglu_wrapper_sm100: No previously cached GroupedGemmSwigluSm100 object found, creating new GroupedGemmSwigluSm100 objectr   r   r   r   r   r   r   r   r   r"   r   r    r!   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   zUnsupported configuration)r   )r  r	  r  r   r   r   rT   )shaper2   r4   r9   empty_stridedr   
ValueErrorr7   r   r   r   r   emptypermuter   r   fullfloatr   rm   '_cache_of_GroupedGemmSwigluSm100Objectsr  r   r   r   r   ))r  r  r
  r  r   rk   r  r   r  rl   r&   rB   rC   r  r'   r(   r)   r*   r+   r,   r   valid_mr   r   r   r   n_outr  r	  r  r   r   r   r   mma_permute_ordersf_k_rowmma_shape_rowsf_k_colmma_shape_col	cache_keygrouped_gemm_swiglus)                                            rN   !grouped_gemm_swiglu_wrapper_sm100r2    s$   X NMGQnGAq!FEMMpqqq3&AAw{8KSZckcrsss&':UAwQV<W_fowo~*GUA+>7UZ?@[cjs{  tC  D  D  D@h@@AAANNK~%-u/@AAAjFV[`[oqv  rE  [F  GF  GF  N  	O  	O  	O#. E;//Wc""Xq!!
 ](8?[[[ccduvv G[11UC  Xq!!
 ](8?[[[ccduvv5>5=111rsssj!QvemT\Tcddd 	#4#@d&7&C  """#4#@d . :#1#=4 . :9I> ;;;noooEiP##!!#9"7%%))#/#))# 	$ 	
 	
 	
 	
( 	  ^  	_  	_  	_4 
 
 
X
X
 X
 X	

 "z
 "z
 +A*@
 *?)>
 &
 $
 &
 *>
 *>
 0/
 $
  #1.!
"  i#
$ &%
& .-'
( $)
* "z+
,  i-
. .-/
4 #0022OO4OOO2##>#BBB##!!#9"7%%))#/#))# 	$ 	
 	
 	
& >Q/	:!%%   rO   ))r  grouped_gemm_swiglu_quantr   r   cuda.bindingsr   r   r9   typingr   r   r   cutlass.cuter   cutlass.cute.runtimer	   r
   	packagingr   cudnn.datatypesr   cudnn.api_baser   r   r   r   r   logging	getLoggerr  r2   r(  r   r   r  r7   strr  r  r  r2  rT   rO   rN   <module>r>     s  :         ) ( ( ( ( (  " " " " " " " "        6 6 6 6 6 6 6 6       9 9 9 9 9 9 F F F F F F F F F F F Fz
0 z
0 z
0 z
0 z
0W z
0 z
0 z
0z 
'
H
%
%*, ' 15*.-1"] > >$.26".2+o ololo o 	o
 "Lo !<o ,o  -o %,'o U\*o {o [o [o o S/o  uS#X/!o" #o$ %o& 'o( )o* T]++o, -o o o o o orO   