
    `ik              %       L   d dl mZmZ d dlmZmZ ddlmZ ddl	Z	ddl
mZmZ ddlZddlmZ ddlmZmZ ddlmZ ddlmc mZ dd	lmZ dd
lmZmZmZ  G d de          ZddlZ ej        e           Z!i Z"dde	j#        e	j$        e	j#        dddddddddfde	j%        de	j%        de&de'de	j(        de	j(        de	j(        dee)e)f         deee)e)f                  dee	j%                 dee	j%                 dee	j%                 de)d e*d!e)d"eej+                 d#ee	j%        d$f         f"d%Z,dS )&   )PersistentDenseGemmKernel!PersistentDenseGemmKernelNoDlpack))Sm100BlockScaledPersistentDenseGemmKernel1Sm100BlockScaledPersistentDenseGemmKernelNoDlpack    )driverN)TupleOptional)from_dlpackmake_ptr)version)_convert_to_cutlass_data_type)APIBaseceil_divis_power_of_2c            !       ~    e Zd Zdej        ddddddddddfdej        dej        d	ej        d
ej        dedej        dee	e	f         de
ee	e	f                  de
ej                 de
ej                 de
ej                 de
ej                 de
ej                 de	dede	f  fdZdefdZd'de
ej                 ddfdZ	 	 	 	 	 	 	 	 d(dej        dej        dej        dej        d e
ej                 d!e
ej                 d"e
ej                 d#e
ej                 d$e
ej                 dede
ej                 d%eddfd&Z xZS ))GemmSwigluSm100      ?   r   N   F   sample_asample_bsample_ab12sample_calpha	acc_dtypemma_tiler_mncluster_shape_mn
sample_sfa
sample_sfbsample_amax
sample_sfcsample_norm_constsf_vec_size
vector_f32ab12_stagesc                    t                                                       | j                            d           | j                            d           || _        || _        || _        || _        || _	        || _
        || _        || j        d         dk    sdnd| _        n|| _        |	| _        |
| _        || _        |                     |dd          | _        |                     |dd	          | _        || _        || _        || _        | j        C| j        <| j        5| j        .| j        '| j                            d
           t,          | _        n&| j                            d           t0          | _        | j                            d                    g d|j         d|j         d|j         d|j         d| d| d| d| d|	|	j        nd  d|
|
j        nd  d||j        nd  d||j        nd  d||j        nd  d| d| d|                      d| _        d S )Nz&GemmSwigluSm100 is an experimental APIzEntering __init__r      r   r   )   r,   r   amax
norm_constzDNo quantization arguments provided, using regular GEMM swiglu kernelzCQuantization arguments provided, using quantized GEMM swiglu kernel z'__init__ completed with args: sample_a z, sample_b z, sample_ab12 z, sample_c z, alpha z, acc_dtype z, mma_tiler_mn z, cluster_shape_mn z, sample_sfa z, sample_sfb z, sample_amax z, sample_sfc z, sample_norm_const z, sf_vec_size z, vector_f32 z, ab12_stages T)super__init___loggerwarningdebugr   r   r   r   r   r   r   r    r!   r"   r$   _unpad_tensor_to_ndimr#   r%   r&   r'   r(   r   _kernelr   joinshape_interpret_uint8_as_fp4x2)selfr   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   	__class__s                    i/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/cudnn/gemm_swiglu/api.pyr1   zGemmSwigluSm100.__init__5   sx   ( 	EFFF.///  & 
"(#262CA2F#2M2MFFSYD!!$4D! %$$55k1fMM!%!;!;<MqR^!_!_&$& ?"t'>4CSC[`d`o`w  }A  }S  }[Lefff4DLLLdeeeDDL |
  |
  |
  |
  |
  |
hn  |
  |
  |
  |
QYQ_  |
  |
  |
  |
oz  pA  |
  |
  |
  |
  NV  N\  |
  |
  |
  |
  fk  |
  |
  |
  |
  yB  |
  |
  |
  |
  S_  |
  |
  |
  |
  tD  |
  |
  |
  |
  gq  g}  S]  Sc  Sc  CG  |
  |
  |
  |
  jt  j@  V`  Vf  Vf  FJ  |
  |
  |
  |
  oz  oF  Ze  Zk  Zk  LP  |
  |
  |
  |
  s}  sI  _i  _o  _o  OS  |
  |
  |
  |
  D	U	  D	a	  iz  i@	  i@	  g	k	  |
  |
  |
  |
  {	F
  |
  |
  |
  |
  U
_
  |
  |
  |
  |
  o
z
  |
  |
  |
  |
	
 	
 	
 *.&&&    returnc           	      t     j                             d            j                             d                                 j        d          \  }}}                      j        d          \  }}}                      j        d          \  }}}                      j        d          \  }}}                      j        |||fd                                 j        |||fd	                                 j        |||fd
                                 j        ||dz  |fd            j        t          t          hv rt          t          | j                  d          }                      j        ddt          |d          d||fd                                 j        ddt          |d          d||fd                                 j        dd           t          t          |dz   j                  d          }                      j        ddt          |d          d||fd                                 j        dd                                 j        d|||z  f|d||z  fg          \  } _                              j        d|||z  f|d||z  fg          \  } _                              j        d|||z  f|d||z  fg          \  } _                              j        d|||z  f|d||z  fg          \  } _                              j         j        k    d j         d j                     j                             d            j        t0          t2          hv rЉ                      j        t6          j        t6          j        t6          j        t6          j        t6          j         gd           _!         j"        xt6          j        k    r                       j        t6          j        t6          j        t6          j        t6          j        t6          j         gd           _#         $                     %                     j#                  d           nt6          j        k    rw                      j        t6          j        t6          j        gd           _#                              j!        t6          j        t6          j        t6          j         gd           n	 tM          d  j"                                         j        t6          j        t6          j        gd           _'        n j        t          t          hv r                      j        d u p j        d u d!                                 j        t6          j(        t6          j)        t6          j         t6          j        gd"           _!                              j"        t6          j        d#           _"                              j        t6          j        t6          j        t6          j        t6          j        t6          j         gd$           _#                              j        t6          j        t6          j        t6          j        t6          j        t6          j         gd%           _'                              *                     j!                  o %                     j'                  d&                                 %                     j'                  o j        d u p j        d u d'                                 *                     j!                  o j'        t6          j        k    o j        d u d(            $                     j'        t6          j        k    o j#        t6          j        k    d)                                 j        d*vd+ j                                          j        t6          j+        t6          j        gd           _,                              j         j,        dd,-                                 j         j,        dd.-            %                     j!                  r6                      j,        t6          j+        k    o
 j        dk     d/           nN *                     j!                  r4                      j,        t6          j        k    o
 j        dk    d0            *                     j!                  rI                      j        d1k    p
 j        d1k    d2                                 j        d1k    d3                                 j         j!        d	d4-            j                             d5                                 j-        d6         d7vd8 j-        d6                      j        t0          t2          hv rA                      j-        d         t]          dd9d          vd: j-        d                     n j        t          t          hv r҉ *                     j!                  rA                      j-        d         t]          d;d9d;          vd< j-        d                     nw %                     j!                  r]                      %                     j'                  p. %                     j#                  p j#        t6          j        k    d=                                 j/        d6          j-        d6         d>k    rdndz  d6k    d?                                 j/        d6          j/        d         z  d@k    oU j/        d6         d6k    oD j/        d         d6k    o3ta           j/        d6                   ota           j/        d                    dA j/        d6          dB j/        d                      j        t0          t2          hv rn j-        d6         d>k    }	                     |	 o
 j/        dCk    dD            j/        dCk    r0 j-        d6         dk    r                      j-        dEk    dF            j                             dG            fdH}
                      |
 j!         j        |||f          o3 |
 j!         j        |||f          o |
 j#         j        |||f           dI            j        t          t          hv r<                     | j-        d6         z  d6k    p| j-        d         z  d6k    dJ            j                             dK           t6          j1        2                                stg          dL          t6          j1        4                                }t6          j1        5                    |          \  }}|dMz  |z   }|dNk     rtg          dO| dP|           |dQk    rtg          dR          dS _6         j                             dT           dSS )UNzEntering check_supportz+Checking tensor shapes, strides, and dtypesr   namer   r   r   ABAB12r,   Cr       r   SFASFB)r   r-   SFCr.   r   )stridez0AB12 and C tensor stride orders must match, got z and zChecking data types)dtyperA   zAB12 (for float32 acc_dtype)zIab12_dtype {torch.float8_e5m2, torch.float8_e4m3fn} is currently disabledzAB12 (for float16 acc_dtype)zA/B (for float16 acc_dtype)zKUnsupported acc_dtype: expected one of {torch.float32, torch.float16}, got z=sfa and sfb must be provided for quantized GEMM swiglu kernelz$A (for quantized GEMM swiglu kernel)z.Accumulator (for quantized GEMM swiglu kernel)z'AB12 (for quantized GEMM swiglu kernel)z$C (for quantized GEMM swiglu kernel)z]Invalid dtype combination: fp4 ab_dtype is not compatible with fp8 c_dtype (recommended bf16)z7sfc and norm_const must be provided when c_dtype is fp8z>amax must be provided when ab_dtype is fp4 and c_dtype is bf16zKfloat32 c_dtype and float32 ab12_dtype currently disabled due to kernel bug>   r   rF   z\sf_vec_size must be 16 or 32 when ab_dtype is {torch.float8_e5m2, torch.float8_e4m3fn}, got z#SFB must have the same dtype as SFA)rK   rA   extra_error_msgz#SFC must have the same dtype as SFAzwInvalid ab_dtype and sf_dtype/sf_vec_size combination: fp8 ab_dtype requires float8_e8m0fnu sf_dtype and 32 sf_vec_sizezInvalid ab_dtype and sf_dtype/sf_vec_size combination: fp4 ab_dtype not supported with float8_e4m3fn sf_dtype and 32 sf_vec_size)r   r   r,   z?Invalid A or B tensor stride: fp4 dtype requires k-major layoutz=Invalid AB12 tensor stride: fp4 dtype requires n-major layoutz A and B must have the same dtypez)Checking MMA tile shape and cluster shaper   )r   r*   zDInvalid MMA tile shape: expected mma_tiler_mn[0] in {128, 256}, got i  zQInvalid MMA tile shape: expected mma_tiler_mn[1] in {32, 64, ..., 224, 256}, got @   zMInvalid MMA tile shape: expected mma_tiler_mn[1] in {64, 128, 192, 256}, got zFor MXFP8 inputs for blockscaled quantized GEMM swiglu kernel, ab12_dtype and c_dtype cannot be FP8. ab12_dtype also cannot be float32r*   z[Invalid cluster shape: cluster_shape_mn[0] must be divisible by 2 if mma_tiler_mn[0] == 256r   zrInvalid cluster shape: expected values to be powers of 2 and cluster_shape_mn[0] * cluster_shape_mn[1] <= 16, got ,r+   zNInvalid cluster shape: cluster_shape must be (1, 1) when use_2cta_instrs=Falser   ziInvalid MMA tile shape: for non-1x1 cluster shape and 128xmma tile shape, mma_tiler_mn must be (128, 128)zChecking tensor alignmentc                 z    |dk    }|rdnd}||         }dt          | j                  j        z  }||z  dk    S )N)r   r   r,   r   r   r   )interpret_uint8_as_fp4x2)r   r9   width)rK   stride_ordertensor_shapeis_mode0_majormajor_mode_idxnum_major_elementsnum_contiguous_elementsr:   s          r<   check_contigous_16B_alignmentzDGemmSwigluSm100.check_support.<locals>.check_contigous_16B_alignmentu  sj    )Y6N"07QQaN!-n!=&,1Nuos  pN  2O  2O  2O  2U  'V#%(??1DDr=   z5Invalid tensor alignment: tensors must be 16B alignedzAInvalid tensor alignment: m and n must be aligned to mma_tiler_mnzChecking environmentzCUDA is not available
   d   z;GemmSwiglu requires SM100+ compute capability, but found SMz on device g   z,cuteDSL GemmSwiglu is not supported on SM103Tz$check_support completed successfully)7r2   r4   _tensor_shaper   r   r   r   _check_tensor_shaper6   r   r   r   r&   r!   r"   r#   r$   r%   _check_tensor_stridea_stride_orderb_stride_orderab12_stride_orderc_stride_order_value_error_ifr   r   _check_dtypetorchfloat16bfloat16float32float8_e4m3fnfloat8_e5m2ab_dtyper   
ab12_dtype_not_implemented_error_if_is_fp8
ValueErrorc_dtypefloat4_e2m1fn_x2uint8	_is_fp4x2float8_e8m0fnusf_dtyper   ranger    r   cudais_availableRuntimeErrorcurrent_deviceget_device_capability_is_supported)r:   mklnn_2rest_krest_n2_use_2cta_instrsrX   devicemajorminorcompute_capabilitys   `              r<   check_supportzGemmSwigluSm100.check_supportr   s   3444HIII$$T]$DD1a$$T]$DD1a$$T%5M$JJ1a&&t}:&FF	3  Aq	3???  Aq	3???  !1Aq!9fEEE  AFADDD<5=
 
 
 hq$*:;;Q??F$$T_r1hq#>N>NPQSY[\6]_deee$$T_r1hq#>N>NPQSY[\6]_deee$$T%5tVDDDxQ0@AA1EEG$$T_r1hq#>N>NPQSZ\]6^`efff$$T%;T<PPP!%!:!:4=STVWYZ]^Y^R_bcefhilmhmanQo!:!p!p4!%!:!:4=STVWYZ]^Y^R_bcefhilmhmanQo!:!p!p4$($=$=d>NYZ\]_`cd_dXehiklnorsnsgtWu$=$v$v!4!!%!:!:4=STVWYZ]`Y`RadgijlmpslsctQu!:!v!v4"d&99qt?Uqq\`\oqq	
 	
 	

 	0111<%-
 
 
 !--MNM'%  . 
 
DM ."U]]]]&*&7&7(!M!M!N!/!- < '8 
' 
'DO 22T_55f    ]]]&*&7&7($}en=; '8 ' 'DO
 %%$}e.A5CTU: &    
 $  &Fuy  vD  &F  &F  G  G  G,,T]5=RWR`Bahk,llDLL\5=
 
 
   4'B4?d+BO  
 !--*K%'	 < . 	 	DM "..mE /  DN
 #// MMN'% ? 0 
 
DO  ,,MMN'% < - 
 
DL   t}--L$,,t|2L2Lo  
   T\**j40G0i4KaeiKiI     ..Q4<5>3QqX\XhlpXpP  
 **-R$/U]2R]  
    0 Dqu  rB  D  D   !--+U-@A .  DM
 m E	     m E	     ||DM** 	$$%*>>Y4CSWYCYZ N    .. $$MU%88ST=MQS=S W  
 ~~dm,, $$'94X8Ky8XU   $$*i7S   	M->	 	 	
 	
 	
 	FGGGa 
2kUYUfghUikk	
 	
 	
 <%-
 
 
   !!$E"c2,>,>>|fjfwxyfz||   
 \5=
 
 
 ~~dm,, 
$$%a(b#r0B0BB|fjfwxyfz||   
 <<.. ((T\22wdll4?6S6SwW[WfjojwWw a  
 	!!$T->q-AS-H-HaPTUUi	
 	
 	
 	%a(4+@+CCrI <)!,q0<)!,q0< "$"7":;;< "$"7":;; w  BF  BW  XY  BZ  w  w  ]a  ]r  st  ]u  w  w		
 		
 		
 <%-
 
 
 #/2c9O  ##G(=(G`   $..43DQ3G33N3N$$%3  
 	6777	E 	E 	E 	E 	E 	--dmT=PSTVWYZR[\\ f11$-ATWXZ[]^V_``f11$/4CY\]_`bc[dee
 D	
 	
 	
 <5=
 
 
   D%a((A-NT5Fq5I1IQ1NS  
 	1222z&&(( 	86777**,,z77??u"RZ%/##   E]o   E   E  }C   E   E  F  F  F$$MNNN!ABBBtr=   current_streamc                    | j                             d           |                     |          }|                                  t	          j        t          j                  }|                     | j	                  }|                     | j
                  }|                     | j	                  }|                     | j
                  }t	          j        |j                  t	          j        d          k    }|p|p|s|o| }|rg| j                             d           | j        t          u rt          | _        n2| j        t           u rt"          | _        nt%          d| j                   d }	| j        t          t          fv rE|                     t'          | j                  | j        d         dk    | j        | j                  }	n`| j        t           t"          fv r4|                     | j        | j        | j        | j        | j                  }	nt%          d| j                   t4          j                                        }
|
                    | j        d         | j        d	         z            }| j        t          u r| j                             d
           t=          j        |	tA          | j!                  tA          | j"                  tA          | j#                  tA          | j$                  | j%        ||          | _&        nY| j        t           u r| j                             d           t=          j        |	tA          | j!        d          tA          | j"        d          tA          | j'        d          tA          | j(        d          tA          | j$        d          tA          | j#        d          | j)        tA          | j)        d          nd | j*        tA          | j*        d          nd | j+        tA          | j+                  nd | j%        ||          | _&        n6| j        t          t"          fv r	| ,                    | j!        d          \  }}}| ,                    | j"        d          \  }}}| ,                    | j#        d          \  }}}| j        t          u r@t=          j        |	|||||||||tA          | j$                  | j%        ||          | _&        nr| j        t"          u r5| ,                    | j$        d          \  }}}| ,                    | j'        d          \  }}}| ,                    | j(        d          \  }}}| ,                    | j)        d          \  }}} | ,                    | j*        d          \  }!}"}#| ,                    | j+        d          \  }$}%}&t=          j        |	fi d|d|d|d|d |d!|d"|d#|d$|d%|d&|d'|d(|d)|d*|d+|d,|d-|d.|d/|d0| d1|!d2|"d3|#d4|$d5|%d6|&d7| j%        d8|d9|| _&        n.t%          d| j                   t%          d| j                   | j                             d:           d S );NzEntering compilez2.10.0z\Running no_dlpack kernel wrapper due to fp4 dtype or fp8 dtype on incompatible torch version!Unreachable: invalid kernel type r   r*   r   r   r   r    r&   r   r    r'   r(   r   zCompiling gemm_swiglu (dlpack)abab12cr   max_active_clustersstreamz4Compiling gemm_swiglu_blockscaled_quantized (dlpack)r   assumed_align   a_tensorb_tensor
sfa_tensor
sfb_tensorc_tensorab12_tensoramax_tensor
sfc_tensornorm_const_tensorr   r   r   rB   r@   rC   rD   a_ptra_shapea_orderb_ptrb_shapeb_orderab12_ptr
ab12_shape
ab12_orderc_cuter   r   r   rE   rG   rH   AMAXrI   
NORM_CONSTr   r   r   r   r   r   sfa_ptr	sfa_shape	sfa_ordersfb_ptr	sfb_shape	sfb_orderc_ptrc_shapec_orderr   r   r   amax_ptr
amax_shape
amax_ordersfc_ptr	sfc_shape	sfc_ordernorm_const_ptrnorm_const_shapenorm_const_orderr   r   r   zKernel compiled successfully)-r2   r4   _get_default_stream_ensure_support_checkedr   parsere   __version__rs   rk   rl   rn   base_versionr6   r   r   r   r   NotImplementedErrorr   r   r   r    r&   r'   r(   cutlassutilsHardwareInfoget_max_active_clusterscutecompiler   r   r   r   r   r   _compiled_kernelr!   r"   r#   r$   r%   _make_cute_tensor_descriptor)'r:   r   torch_version	is_ab_fp4is_ab12_fp4	is_ab_fp8is_ab12_fp8_fp8_dlpack_supporteduse_no_dlpack_kernelgemm_swigluhardware_infor   r   r   r_   r   r   r`   r   r   ra   r   r   rb   r   r   sfa_stride_orderr   r   sfb_stride_orderr   r   amax_stride_orderr   r   sfc_stride_orderr   r   norm_const_stride_orders'                                          r<   r   zGemmSwigluSm100.compile  s   -...11.AA$$&&&e&788NN4=11	nnT_55LL//	ll4?33 'm.H I IW][cMdMd d(uKuY=U+<t_t[t 	^L}~~~|888@!JJJP)*\dl*\*\]]]<%-
 
 
 ,,7GG!%!21!5!<!.!%!6	 '  KK \5=
 
 
 ,, ,!.!%!6? , '  KK &&X$,&X&XYYY2244+CCDDYZ[D\_c_tuv_wDwxx<444L?@@@$(Ldm,,dm,, !122dm,,j$7%	% 	% 	%D!! \FFFLUVVV$(L$T]"EEE$T]"EEE&tbIII&tbIII$T]"EEE'(8JJJPTP`Pl[)9LLLLrvNRoNiKrJJJJosJNJ`Jl;t/E#F#F#Frvj$7%% % %D!! \-=
 
 
 .2-N-Nt}cf-N-g-g*E7N-1-N-Nt}cf-N-g-g*E7N6:6W6WX\Xhou6W6v6v3Hj"3|@@@(,#*#*%)0&t}55*(;)) ) )%%  !RRR151R1RSWS`gj1R1k1k.w7;7X7XY]Yhot7X7u7u4$47;7X7XY]Yhot7X7u7u4$4:>:[:[\`\lsy:[:z:z7*&77;7X7XY]Yhot7X7u7u4$4LPLmLmnr  oE  LXLm  MY  MYI 02I(, )  )  )% ) $G ) +N	 )
  % ) $G ) +N ) $G ) (i ) /. ) $G ) (i ) /. )  % ) $G )  +N! )" &X# )$  *z% )&  10' )( &X) )*  *z+ ),  10- ). $G/ )0 (i1 )2 /.3 )4 $2>5 )6 &6%57 )8 &=%<9 ): **; )< )<(;= )> *>? )%%D **\dl*\*\]]]%&X$,&X&XYYY9:::::r=   r   r   r   r   r   r   r   r   r   skip_compilec                    | j                             d           |                     |          }|sa|                     | j        d u d           | j                             d           | j        t          u rQ|                     t          |          t          |          t          |          t          |          |
|           n| j        t          u r| 	                    |dd          }| 	                    |	dd          }	|                     t          |d	          t          |d	          t          |d	          t          |d	          t          |d	          t          |d
	          |t          |d	          nd |t          |d	          nd |	t          |	          nd |
|           n| j        t          t          fv r|                     |d	          }|                     |d	          }|                     |d	          }| j        t          u r*|                     |||t          |          |
|           n-| j        t          u r| 	                    |dd          }| 	                    |	dd          }	|                     |d	          }|                     |d	          }|                     |d	          }|                     |d	          }|                     |d	          }|                     |	          }|                     ||||||||||
|           nHt          dt          | j                             t          dt          | j                             | j                             d           d S | j                             d           | j        t          u r|                     t          | j                  | j        d         dk    | j        | j                  } |t          |          t          |          t          |          t          |          |
t&          j                                                            | j        d         | j        d         z            |           n| j        t          u r|                     t          | j                  | j        d         dk    | j        | j                  }|                     |d          \  }}}|                     |d          \  }}}|                     |d          \  }}} ||||||||||t          |          |
t&          j                                                            | j        d         | j        d         z            |           n| j        t          u rS|                     | j        | j        | j        | j        | j                  }| 	                    |dd          }| 	                    |	dd          }	 |t          |d	          t          |d	          t          |d	          t          |d	          t          |d	          t          |d
	          |t          |d	          nd |t          |d	          nd |	t          |	          nd |
t&          j                                                            | j        d         | j        d         z            |           n0| j        t          u r|                     | j        | j        | j        | j        | j                  }| 	                    |dd          }| 	                    |	dd          }	|                     |d          \  }}}|                     |d          \  }}}|                     |d          \  }}}|                     |d          \  }}}|                     |d          \  }}} |                     |d          \  }}!}"|                     |d          \  }}#}$|                     |d           \  }}%}&|                     |	d!          \  }}'}( |dAi d"|d#|d$|d%|d&|d'|d(|d)|d*| d+|d,|!d-|"d.|d/|d0|d1|d2|d3|d4|d5|#d6|$d7|d8|%d9|&d:|d;|'d<|(d=|
d>t&          j                                                            | j        d         | j        d         z            d?| n$t          dt          | j                             | j                             d@           d S )BNzEntering executez[GemmSwigluSm100 kernel not compiled; call compile() first or use execute(skip_compile=True)zExecuting with compiled kernel)r   r   r   r   r   r   r   r-   r.   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   z*Executed with compiled kernel successfullyz'Executing without compiled kernel (JIT)r   r*   r   r   rB   r@   rC   rD   r   r   r   rE   rG   rH   r   rI   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   z3Executed without compiled kernel (JIT) successfully )r2   r4   r   _runtime_error_ifr   r6   r   r   r   r5   r   r   _make_cute_pointerr   typer   r   r   r    r   r   r   r   r   r&   r'   r(   ))r:   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r_   r   r`   r   ra   r   rb   r   r   r   r   r   r   r   r   r   r   s)                                            r<   executezGemmSwigluSm100.execute6  s    	-...11.AA I	V""%-m   L?@@@|888%%!(++!(++$[11!(++) &     !JJJ"88aPP$($>$>?PRSUa$b$b!%%(DDD(DDD*:RHHH*:RHHH(DDD +Kq I I IOZOf[!K!K!K!KlpMWMcJb I I I IimIZIf{3D'E'E'Elp) &     1A"   ///KK///KK22;b2QQ<#DDD))##!)*844#- *     \%VVV"&"<"<[!V"T"TK(,(B(BCTVWYe(f(f% 33HB3OOE"55jPR5SSG"55jPR5SSG#66{RT6UUH"55jPR5SSG%)%<%<=N%O%ON))## ' '#!)!) ''5#- *     ..oRVW[WlRmRm.o.oppp)*kdSWShNiNi*k*klllLKLLLLLLHIII|888"ll;DNKK%)%6q%9S%@!%!2%)%:	 +   !(++!(++$[11!(++(/(B(B(D(D(\(\]a]rst]ux|  yN  OP  yQ  ^Q  )R  )R)     !BBB"ll;DNKK%)%6q%9S%@!%!2%)%:	 +   261R1RS[be1R1f1f.w151R1RS[be1R1f1f.w:>:[:[\gnt:[:u:u7*&7#*#*%)0&x00(/(B(B(D(D(\(\]a]rst]ux|  yN  OP  yQ  ^Q  )R  )R)     !JJJ"ll $ 0!%!2%)%:# $ 0 +   #88aPP$($>$>?PRSUa$b$b!(DDD(DDD*:RHHH*:RHHH(DDD +Kq I I IOZOf[!K!K!K!KlpMWMcJb I I I IimIZIf{3D'E'E'Elp(/(B(B(D(D(\(\]a]rst]ux|  yN  OP  yQ  ^Q  )R  )R)     !RRR"ll $ 0!%!2%)%:# $ 0 +   #88aPP$($>$>?PRSUa$b$b!151R1RS[be1R1f1f.w151R1RS[be1R1f1f.w:>:[:[\gnt:[:u:u7*&7151R1RS[be1R1f1f.w7;7X7XYcjo7X7p7p4$47;7X7XYcjo7X7p7p4$4:>:[:[\gnt:[:u:u7*&77;7X7XYcjo7X7p7p4$4LPLmLmn  GSLm  MT  MTI 02I   %#G +N  %	
 $G +N $G (i /. $G (i /.  % $G +N  &X!"  *z#$  10%& &X'(  *z)*  10+, $G-. (i/0 /.12 $2>34 &6%556 &=%<78  %9: )0(B(B(D(D(\(\]a]rst]ux|  yN  OP  yQ  ^Q  )R  )R  )R;< *>=  B **bdSWS_N`N`*b*bcccLTUUUUUr=   )N)NNNNNr   NF)__name__
__module____qualname__re   rh   TensorfloatrK   r	   intr
   boolr1   r   rw   CUstreamr   r   __classcell__)r;   s   @r<   r   r   4   s        !&(26:-1-1.2-148 %;. ;.,;. ,;. \	;.
 ,;. ;. ;;. CHo;. #5c?3;. U\*;. U\*;. el+;. U\*;. $EL1;.  !;." #;.$ %;. ;. ;. ;. ;. ;.zit i i i iV	W; W;ht}&= W; W; W; W; W;~ .2-1.2-14826"[V [V,[V ,[V \	[V
 ,[V U\*[V U\*[V el+[V U\*[V $EL1[V [V !/[V [V 
[V [V [V [V [V [V [V [Vr=   r   r   r   r   r   Fr   r   r   r   c_majorrl   rp   r   r   r    r   r   r   r&   r'   r(   r   r>   .c                 D   t                               d           | j        \  }}}|j        \  }}}d\  }}|dk    rSt          j        |||fd|||z  f|| j                  }t          j        ||dz  |fd|||z  dz  f|| j                  }nn|dk    rVt          j        |||f|d||z  f|| j                  }t          j        ||dz  |f|dz  d||z  dz  f|| j                  }nt          d|           d\  }}|	A|
>t                               d	           |t          j        t          j        hv rt                               d
           t          |dz  |          }|t          |d          t          |d          dddf}d}t          j
        |t          j        | j                                      |          }| j        t          j        t          j        hv r_|t          j        k    rOt                               d           t          j        dt%          d           | j        t          j                  }| j        |j        | j        |j        |                                 |                                ||||||||	|	j        nd |
|
j        nd |	|	                                nd |
|
                                nd |	|	j        nd |
|
j        nd ||j        nd ||                                nd ||j        nd |||f}|t*          v rHt                               d           t*          |         }|                    | ||||	|
|||||           nt                               d           t/          d(i d| d|d|d|d|d|d|d|d|	d|
d |d!|d"|d#|d$|d%|}|                                s
J d&            |                    |'           |                    | ||||	|
|||||           |t*          |<   |	|
||||fS ||fS ))NzCgemm_swiglu_wrapper_sm100: Creating empty output tensors ab12 and c)NNr}   r   )rK   r   r,   r   z'c_major must be either 'm' or 'n', got zdgemm_swiglu_wrapper_sm100: Detected sfa_tensor and sfb_tensor, constructing quantized output tensorszHgemm_swiglu_wrapper_sm100: Detected fp8 c_dtype, constructing sfc_tensorr   r   rF   )   r   r      r,   r   z[gemm_swiglu_wrapper_sm100: Detected fp4 ab_dtype and bf16 c_dtype, constructing amax_tensor)r   r   r   inf)r   rK   zIgemm_swiglu_wrapper_sm100: Using previously cached GemmSwigluSm100 object)r   r   r   r   r   r   r   r   r   r   r   zqgemm_swiglu_wrapper_sm100: No previously cached GemmSwigluSm100 object found, creating new GemmSwigluSm100 objectr   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   zUnsupported testcase)r   r   )r2   r4   r8   re   empty_stridedr   ro   rj   ri   r   emptyrt   permuterK   rq   rr   rg   fullr   rh   rJ    _cache_of_GemmSwigluSm100Objectsr   r   r   r   )r   r   r   r   rl   rp   r   r   r    r   r   r   r&   r'   r(   r   r}   r~   r   r   r   r   r   r   sf_k	mma_shapemma_permute_order	cache_keyr   s                                r<   gemm_swiglu_wrapper_sm100r     s|   ( MMWXXXnGAq!nGAq!&K#~~)1a)aAE]*]e]lmmm&161~1a!eqj7IQXaiapqqq	C)1a)aAE]*]e]lmmm&QN!VQA
#?	
 
 
 L7LLMMM(J*"8|}}}u(%*=>>>MMdeeeAFK00DC  q!!I !3*   g'((	 
 >e4ekBBBwRWR`G`G`MMwxxx*Yuho]b]jkkkK 	&2
&2
)5
4)5
4&2
&2
#4#@d&7&C  """#4#@d3I6 444abbb6yA#!!#!/! 	 	
 	
 	
 	
 	  J  	K  	K  	K% 
 
 
X
X
 $
 X	

 %
  i
 &
 .-
 "z
 "z
 $
 "z
 0/
 $
 "z
  $!
$ ((**BB,BBB*6222#!!#!/! 	 	
 	
 	
 7B(3*"8Hj+==H$$r=   )-dense_gemm_persistent_swiglur   r   :dense_blockscaled_gemm_persistent_swiglu_interleaved_quantr   r   cuda.bindingsr   rw   re   typingr	   r
   r   cutlass.cuter   cutlass.cute.runtimer   r   	packagingr   cutlass.cute.mathmathcudnn.datatypesr   cudnn.api_baser   r   r   r   logging	getLoggerr   r2   r   rh   rf   r   r   strrK   r   r   r   r   r   r=   r<   <module>r     s  <               ) ( ( ( ( (  " " " " " " " "        6 6 6 6 6 6 6 6                         9 9 9 9 9 9 ; ; ; ; ; ; ; ; ; ;]V ]V ]V ]V ]Vg ]V ]V ]V@ 
'
H
%
%#%   #m ="]$.26)-)-04&*#R% R%lR%lR% R% 	R%
 R% [R% {R% S/R% uS#X/R% &R% &R%  -R% R% R%  !R%" T]##R%$ 5<%R% R% R% R% R% R%r=   