
    )`i                    +   d Z ddlZddlZ	 ddlmZ n# e$ r dadefdZY nw xY w G d dej                  Z	e	j
        diZ G d	 d
ej                  Zej        dej        dej        dej        dej        dej        dej        dej        dej        dej        dej        diZi ej        dej        dej        dej        dej        dej        dej        dej         dej!        dej"        dej#        d ej$        d!ej        d"ej%        d#ej        dej        dej        di ej        dej        dej&        d$ej'        d%ej(        d&ej)        d'ej*        d(ej        d)ej+        d*ej        d+ej,        d,ej        d-ej-        d.ej.        d/ej        d0ej/        d1ej        d2ej0        d3ej1        d4ej2        d5ej3        d6ej4        d7ej5        d8ej6        d9ej7        d:ej8        d;ej9        d<ej:        d=ej;        d>iZ<i ej        dej        d?ej        d@ej        dAej        dBej        dCej        dDej         dEej!        dFej"        dGej#        dHej$        dIej        dJej%        dKej        dLej        dMej        dNi ej        dOej        dPej&        dQej'        dRej(        dSej)        dTej*        dUej        dVej+        dWej        dXej,        dYej        dZej-        d[ej.        d\ej        d]ej/        d^ej        d_ej0        d`ej1        daej2        dbej3        dcej4        ddej5        deej6        dfej7        dgej8        dhej9        diej:        djej;        dkiZ=i ej        dej        dlej        dmej        dnej        doej        dpej        dqej         drej!        dmej"        dnej#        doej$        dpej        dqej%        drej        doej        doej        doi ej        dsej        dnej&        dsej'        dsej(        dnej)        doej*        doej        dpej+        dpej        dqej,        dqej        drej-        dqej.        dqej        drej/        dqej        dtej0        dnej1        doej2        dpej3        dqej4        drej5        dtej6        dnej7        doej8        dpej9        dqej:        drej;        dtiZ> G du dvej                  Z?e?j@        dwe?jA        dxiZB G dy dzej                  ZCeCjD        d{eCjE        d|iZFeCjD        d}eCjE        d~iZGej        ej-        fej        ej        fej        ej        fgZHd ZId ZJd ZKd ZLd ZMd ZNd ZO G d dej                  ZP G d dej                  ZQeQjR        deQjS        deQjT        deQjU        deQjV        deQjW        deQjX        deQjY        deQjZ        deQj[        deQj\        deQj]        diZ^ G d dej                  Z_i e_j`        de_ja        de_jb        de_jc        de_jd        de_je        de_jf        de_jg        de_jh        de_ji        de_jj        de_jk        de_jl        de_jm        de_jn        de_jo        de_jp        de_jq        de_jr        de_js        diZte_j`        e_ja        e_ja        e_j`        e_jb        e_jc        e_jc        e_jb        e_jd        e_je        e_je        e_jd        e_jf        e_jg        e_jg        e_jf        e_ji        e_ji        i	Zui e_j`        de_jb        de_jd        de_jf        de_ja        de_jc        de_je        de_jg        de_jh        de_ji        de_jj        de_jk        de_jl        de_jm        de_jo        de_jn        de_jp        de_jq        de_jr        de_js        diZve_j`        eCjD        fde_j`        eCjE        fde_ja        eCjD        fde_ja        eCjE        fdiZw G d dej                  Zxi exjy        dexjz        dÓexj{        dēexj|        dœexj}        dƓexj~        dǓexj        dȓexj        dɓexj        dʓexj        d˓exj        d̓exj        d͓exj        dΓexj        dϓexj        dГexj        dѓexj        dғi exj        dӓexj        dԓexj        dՓexj        d֓exj        dדexj        dؓexj        dٓexj        dړexj        dۓexj        dܓexj        dݓexj        dޓexj        dߓexj        dexj        dexj        dexj        di exj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        di exj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        d exj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        d	exj        d
exj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        diZi exjy        dexjz        dexj{        dexj|        dexj}        dexj~        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        dexj        d exj        d!i exj        d exj        d!exj        d exj        d!exj        d exj        d!exj        d exj        d!exj        d"exj        d#exj        d exj        d!exj        d exj        d!exj        d$exj        d%exj        d&i exj        d'exj        d(exj        d)exj        d*exj        d+exj        d,exj        d-exj        d.exj        d/exj        d0exj        d1exj        d2exj        d3exj        dexj        dexj        dexj        di exj        dexj        dexj        d exj        d!exj        d&exj        d'exj        d$exj        d%exj        d$exj        d%exj        d(exj        d)exj        d*exj        d+exj        d,exj        d-exj        d.exj        d/exj        d0exj        d1exj        d2exj        d3exj        d4exj        d5exj        d6exj        d7exj        d8exj        d9exj        d:exj        d4exj        d5iZ G d; d<ej                  Zi ejy        d=ej        d>ej        d?ej        d@ej        dAej        dBej        dCej        dDej        dEej        dFej        dGej        dHej        dIej        dJej        dKej        dLej        dMej        dNej        dOej        dPej        dQej        dRej        dSej        dTiZi ejy        dej        dej        dUej        dUej        dUej        dUej        dVej        dVej        dUej        dUej        dUej        dUej        dVej        dVej        dUej        dUej        dWej        dWej        dej        dWej        dej        dWej        dWej        dWiZ G dX dYej                  Zej        dZej        d[iZd\ Zd] Z G d^ d_ej                  Zej        dej        d`ej        daiZej        dej        dej        dbiZ G dc ddej                  Zej        deej        dfiZej        dgej        dhiZ G di djej                  Zej        dkej        dliZej        dmej        dniZ G do dpej                  Zej        dqej        driZej        dsej        dtiZ G du dvej                  Zej        dwej         dxej        dyej        dzej        d{iZej        d|ej         d}ej        d~ej        dej        diZ G d dej                  Zej        dej        dej	        dej
        dej        dej        dej        diZ G d dej                  ZdddddddddZdddrddddddd	Zd Z G d dej                  Zej        dej        dej        dej        dej        dej        dej        dej        dej        dej        dej        dej        dej        diZ  G d dej                  Z!e!j        diZ" G d dej                  Z#e#j        diZ$ G d dej                  Z%e%j        diZ& G d dej                  Z'e'j        de'j(        diZ) G d dej                  Z* G d dej                  Z+e+j,        de+j-        de+j.        de+j/        de+j0        de+j1        de+j2        de+j3        de+j        di	Z4 G d dej                  Z5e5j6        de5j7        diZ8e5j6        de5j7        diZ9 G d dej:                  Z;e;j<        de;j=        de;j>        diZ?e;j<        de;j=        dÐe;j>        diZ@ G dń dej:                  ZA G dǄ dej                  ZBeBjC        dɐeBjD        dʐeBjE        dːeBjF        d̐eBjG        diZHeBjC        dΐeBjD        dϐeBjE        dАeBjF        dѐeBjG        diZI G dӄ dej                  ZJeJjK        dՐeJjL        d֐eJjM        diZNeJjK        deJjL        dؐeJjM        diZO G dڄ dej                  ZPePjQ        dܐePjR        dݐePjS        dސePjT        diZUePjQ        dePjR        dePjS        dePjT        diZVg dZW G d d          ZX G d d          ZY G d d          ZZ G d d          ZZ G d d          Z[ G d d          Z\ G d d          Z]d Z^ G d dej:                  Z_ G d dej:                  Z`dS (  z;
Data types and tags used for emitting CUTLASS C++ kernels
    N)autoreturnc                  (    t           } t           dz  a | S )N   )__cutlass_library_auto_enum)is    /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/data/cutlass/python/cutlass_library/library.py	enum_autor
   1   s    #A1$H    c                   "    e Zd Z e            ZdS )GeneratorTargetN)__name__
__module____qualname__r
   Library r   r	   r   r   :           IKK'''r   r   libraryc                      e Zd Z e            Z e            Z e            Z e            Z e            Z e            Z	 e            Z
 e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z  e            Z! e            Z" e            Z# e            Z$ e            Z% e            Z& e            Z' e            Z( e            Z) e            Z* e            Z+ e            Z, e            Z- e            Z. e            Z/ e            Z0 e            Z1 e            Z2dS )DataTypeN)3r   r   r   r
   voidb1u2u4u8u16u32u64s2s4s8s16s32s64e4m3e5m2f8f6f4e3m2e2m3e2m1ue8m0ue4m3f16bf16f32tf32f64cf16cbf16cf32ctf32cf64cs2cs4cs8cs16cs32cs64cu2cu4cu8cu16cu32cu64invalidr   r   r	   r   r   E   s       	$y{{"y{{"y{{"y{{"	#	#	#y{{"y{{"y{{"	#	#	#	$	$y{{"y{{"y{{"	$	$	$
)++%
)++%	#	$	#	$	#	$
)++%	$
)++%	$	#	#	#	$	$	$	#	#	#	$	$	$IKK'''r   r   r   r%   r&   hsdczr'   r(   r)   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r+   r*   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r?   r@   rA   rB   rC   rD   r9   r:   r;   r<   r=   r>   zcutlass::uint1b_tzcutlass::uint2b_tzcutlass::uint4b_tuint8_tuint16_tuint32_tuint64_tzcutlass::int2b_tzcutlass::int4b_tint8_tint16_tint32_tint64_tzcutlass::float_e4m3_tzcutlass::float_e5m2_tz%cutlass::type_erased_dynamic_float8_tz%cutlass::type_erased_dynamic_float6_tz%cutlass::type_erased_dynamic_float4_tzcutlass::float_e2m3_tzcutlass::float_e3m2_tzcutlass::float_e2m1_tzcutlass::float_ue8m0_tzcutlass::float_ue4m3_tzcutlass::half_tzcutlass::bfloat16_tfloatzcutlass::tfloat32_tdoublez!cutlass::complex<cutlass::half_t>z%cutlass::complex<cutlass::bfloat16_t>zcutlass::complex<float>z%cutlass::complex<cutlass::tfloat32_t>zcutlass::complex<double>z#cutlass::complex<cutlass::uint2b_t>z#cutlass::complex<cutlass::uint4b_t>z"cutlass::complex<cutlass::uint8_t>z#cutlass::complex<cutlass::uint16_t>z#cutlass::complex<cutlass::uint32_t>z#cutlass::complex<cutlass::uint64_t>z"cutlass::complex<cutlass::int2b_t>z"cutlass::complex<cutlass::int4b_t>z!cutlass::complex<cutlass::int8_t>z"cutlass::complex<cutlass::int16_t>z"cutlass::complex<cutlass::int32_t>z"cutlass::complex<cutlass::int64_t>r                   @         c                   6    e Zd Z e            Z e            ZdS )BlasModeN)r   r   r   r
   	symmetric	hermitianr   r   r	   r^   r^     s$        ikk)ikk)))r   r^   zcutlass::BlasMode::kSymmetriczcutlass::BlasMode::kHermitianc                   6    e Zd Z e            Z e            ZdS )ComplexTransformN)r   r   r   r
   noneconjr   r   r	   rb   rb   &  s$        	$	$$$r   rb   z cutlass::ComplexTransform::kNonez%cutlass::ComplexTransform::kConjugatezcute::identityzcute::conjugatec                 2    t           D ]\  }}| |k    r dS dS )NTF)RealComplexBijection)	data_typerrI   s      r	   
is_complexri   >  s/    "  daA~~TT 	r   c                 6    | t           j        t           j        fv S N)GemmKindBlockScaledUniversal3xGroupedBlockScaledUniversal3x	gemm_kinds    r	   is_block_scaledrq   D  s    	x68^_	__r   c                 6    | t           j        t           j        fv S rk   )rl   BlockwiseUniversal3xGroupedBlockwiseUniversal3xro   s    r	   is_blockwiseru   G  s    	x4h6Z[	[[r   c                 L    | t           j        t           j        t           j        fv S rk   )rl   GroupedUniversal3xrn   rt   ro   s    r	   
is_groupedrx   J  s'    	x2*H,PR 
R Rr   c                 H    t           D ]\  }}| |k    r|c S t          j        S rk   rf   r   rE   )	real_typerh   rI   s      r	   get_complex_from_realr|   O  s5    "  daA~~hhh 		r   c                 H    t           D ]\  }}| |k    r|c S t          j        S rk   rz   )complex_typerh   rI   s      r	   get_real_from_complexr   V  s7    "  daqhhh 		r   c                 l    | t           j        k    rdS t          |          dk    rdS dt          |          z  S )Nr   r[   r\   )r   r   DataTypeSize)rg   s    r	   get_tma_alignmentr   ]  s9    (-1I!##3,y)))r   c                   6    e Zd Z e            Z e            ZdS )ComplexMultiplyOpN)r   r   r   r
   multiply_addgaussianr   r   r	   r   r   f  s$        ,Y[[(((r   r   c                       e Zd Z e            Z e            Z e            Z e            Z e            Z e            Z	 e            Z
 e            Z e            Z e            Z e            Z e            ZdS )MathOperationN)r   r   r   r
   r   multiply_add_saturatemultiply_add_mixed_input_upcastxor_popcand_popcmultiply_add_fast_bf16multiply_add_fast_f16multiply_add_fast_f32multiply_add_complex_fast_f32multiply_add_complexmultiply_add_complex_gaussianmultiply_add_fast_accumr   r   r	   r   r   m  s        ,#)++$-IKK!Y[[(Y[[($9;;#)++#)++"+)++""+)++%IKKr   r   zcutlass::arch::OpMultiplyAddz$cutlass::arch::OpMultiplyAddSaturatez,cutlass::arch::OpMultiplyAddMixedInputUpcastzcutlass::arch::OpXorPopczcutlass::arch::OpAndPopcz$cutlass::arch::OpMultiplyAddFastBF16z#cutlass::arch::OpMultiplyAddFastF16z#cutlass::arch::OpMultiplyAddFastF32z*cutlass::arch::OpMultiplyAddComplexFastF32z#cutlass::arch::OpMultiplyAddComplexz+cutlass::arch::OpMultiplyAddGaussianComplexz%cutlass::arch::OpMultiplyAddFastAccumc                      e Zd Z e            Z e            Z e            Z e            Z e            Z e            Z	 e            Z
 e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            ZdS )
LayoutTypeN)r   r   r   r
   ColumnMajorRowMajorColumnMajorInterleaved2RowMajorInterleaved2ColumnMajorInterleaved32RowMajorInterleaved32ColumnMajorInterleaved64RowMajorInterleaved64	TensorNWC
TensorNHWCTensorNDHWC
TensorNCHWTensorNGHWCTensorNC32HW32TensorNC64HW64TensorC32RSK32TensorC64RSK64	TensorKCS
TensorKCSRTensorKCSRTr   r   r	   r   r     s        	+Y[[(%IKK"&Y[[#)++&Y[[#)++ikk)y{{*	+y{{*	+9;;.9;;.9;;.9;;.ikk)y{{*	+++r   r   zcutlass::layout::ColumnMajorzcutlass::layout::RowMajorz*cutlass::layout::ColumnMajorInterleaved<2>z'cutlass::layout::RowMajorInterleaved<2>z+cutlass::layout::ColumnMajorInterleaved<32>z(cutlass::layout::RowMajorInterleaved<32>z+cutlass::layout::ColumnMajorInterleaved<64>z(cutlass::layout::RowMajorInterleaved<64>zcutlass::layout::TensorNWCzcutlass::layout::TensorNHWCzcutlass::layout::TensorNDHWCzcutlass::layout::TensorNCHWzcutlass::layout::TensorNGHWCz!cutlass::layout::TensorNCxHWx<32>z!cutlass::layout::TensorCxRSKx<32>z!cutlass::layout::TensorNCxHWx<64>z!cutlass::layout::TensorCxRSKx<64>zcutlass::layout::TensorKCSzcutlass::layout::TensorKCSRzcutlass::layout::TensorKCSRTnn2n32n64tt2t32t64nwcnhwcndhwcnchwnghwcnc32hw32nc64hw64c32rsk32c64rsk64kcskcsrkcsrtc                   v   e Zd Z e            Z e            Z e            Z e            Z e            Z e            Z	 e            Z
 e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z  e            Z! e            Z" e            Z# e            Z$ e            Z% e            Z& e            Z' e            Z( e            Z) e            Z* e            Z+ e            Z, e            Z- e            Z. e            Z/ e            Z0 e            Z1 e            Z2 e            Z3 e            Z4 e            Z5 e            Z6 e            Z7 e            Z8 e            Z9 e            Z: e            Z; e            Z< e            Z= e            Z> e            Z? e            Z@ e            ZA e            ZB e            ZC e            ZD e            ZE e            ZF e            ZG e            ZH e            ZI e            ZJ e            ZK e            ZL e            ZM e            ZN e            ZO e            ZP e            ZQ e            ZR e            ZS e            ZT e            ZUdS )KernelScheduleTypeN)Vr   r   r   r
   ScheduleAuto
MultistageCpAsyncWarpSpecializedCpAsyncWarpSpecializedPingpong!CpAsyncWarpSpecializedCooperativeTmaTmaWarpSpecializedTmaWarpSpecializedPingpongTmaWarpSpecializedCooperativeTmaWarpSpecializedFP8FastAccum)TmaWarpSpecializedCooperativeFP8FastAccum&TmaWarpSpecializedPingpongFP8FastAccumImplicitTmaWarpSpecializedSm90%PtrArrayTmaWarpSpecializedCooperative1PtrArrayTmaWarpSpecializedCooperativeFP8FastAccum"PtrArrayTmaWarpSpecializedPingpong.PtrArrayTmaWarpSpecializedPingpongFP8FastAccum&BlockwiseTmaWarpSpecializedCooperative.PtrArrayBlockwiseTmaWarpSpecializedCooperative#BlockwiseTmaWarpSpecializedPingpong+PtrArrayBlockwiseTmaWarpSpecializedPingpongTmaWarpSpecialized1SmSm100TmaWarpSpecialized2SmSm100"ImplicitTmaWarpSpecialized1SmSm100"ImplicitTmaWarpSpecialized2SmSm100"PtrArrayTmaWarpSpecialized1SmSm100"PtrArrayTmaWarpSpecialized2SmSm100-PtrArrayTmaWarpSpecialized1SmBlockScaledSm100-PtrArrayTmaWarpSpecialized2SmBlockScaledSm100&PtrArrayNvf4TmaWarpSpecialized1SmSm100&PtrArrayNvf4TmaWarpSpecialized2SmSm100&PtrArrayMxf4TmaWarpSpecialized1SmSm100&PtrArrayMxf4TmaWarpSpecialized2SmSm100*PtrArrayMxf8f6f4TmaWarpSpecialized1SmSm100*PtrArrayMxf8f6f4TmaWarpSpecialized2SmSm100 SparseTmaWarpSpecialized1SmSm100 SparseTmaWarpSpecialized2SmSm100%BlockScaledTmaWarpSpecialized1SmSm100%BlockScaledTmaWarpSpecialized2SmSm100"Mxf8f6f4TmaWarpSpecialized1SmSm100"Mxf8f6f4TmaWarpSpecialized2SmSm100#BlockwiseTmaWarpSpecialized1SmSm100#BlockwiseTmaWarpSpecialized2SmSm100+PtrArrayBlockwiseTmaWarpSpecialized1SmSm100+PtrArrayBlockwiseTmaWarpSpecialized2SmSm100Mxf4TmaWarpSpecialized1SmSm100Mxf4TmaWarpSpecialized2SmSm100Nvf4TmaWarpSpecialized1SmSm100Nvf4TmaWarpSpecialized2SmSm100)MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103)MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103)MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103)MxNvf4UltraTmaWarpSpecialized2SmVs32Sm1038MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch8MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch8MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch8MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch4MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch4MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch4MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch4MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch1PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm1031PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm1031PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm1031PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103@PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch@PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch@PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch@PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch<PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch<PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch<PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch<PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch*Mxf8f6f4TmaWarpSpecializedCooperativeSm120'Mxf8f6f4TmaWarpSpecializedPingpongSm120&Nvf4TmaWarpSpecializedCooperativeSm120#Nvf4TmaWarpSpecializedPingpongSm120&Mxf4TmaWarpSpecializedCooperativeSm120#Mxf4TmaWarpSpecializedPingpongSm120.F8f6f4SparseTmaWarpSpecializedCooperativeSm120+BlockwiseTmaWarpSpecializedCooperativeSm120(BlockwiseTmaWarpSpecializedPingpongSm120r   r   r	   r   r     s       ,y{{*$9;;#,9;; &/ikk#	# y{{(y{{"+)++#,9;; .7ikk++49;;(#,9;; *3)++'6?ikk3'0y{{$3<9;;0+49;;(3<9;;0(1	%09	-(y{{(y{{'0y{{$'0y{{$'0y{{$'0y{{$2;)++/2;)++/+49;;(+49;;(+49;;(+49;;(/8y{{,/8y{{,%.Y[["%.Y[["*3)++'*3)++''0y{{$'0y{{$(1	%(1	%09	-09	- $-9;; #,9;; #,9;; #,9;;  /8ikk+.7ikk+.7ikk+.7ikk+=FY[[:=FY[[:=FY[[:=FY[[:9B69B69B69B66?ikk36?ikk36?ikk36?ikk3ENY[[BENY[[BENY[[BENY[[BAJ>AJ>AJ>AJ>/8y{{,,5IKK)+49;;((1	%+49;;((1	%3<9;;009	--6Y[[***r   r   z-cutlass::gemm::collective::KernelScheduleAutozcutlass::gemm::KernelMultistagez+cutlass::gemm::KernelCpAsyncWarpSpecializedz3cutlass::gemm::KernelCpAsyncWarpSpecializedPingpongz6cutlass::gemm::KernelCpAsyncWarpSpecializedCooperativezcutlass::gemm::KernelTmaz'cutlass::gemm::KernelTmaWarpSpecializedz/cutlass::gemm::KernelTmaWarpSpecializedPingpongz2cutlass::gemm::KernelTmaWarpSpecializedCooperativez3cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccumz>cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccumz;cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccumz3cutlass::conv::KernelImplicitTmaWarpSpecializedSm90z>cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8Blockwisez;cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8Blockwisez/cutlass::gemm::KernelTmaWarpSpecialized1SmSm100z/cutlass::gemm::KernelTmaWarpSpecialized2SmSm100z7cutlass::conv::KernelImplicitTmaWarpSpecialized1SmSm100z7cutlass::conv::KernelImplicitTmaWarpSpecialized2SmSm100z7cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100z7cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100z5cutlass::gemm::KernelSparseTmaWarpSpecialized1SmSm100z5cutlass::gemm::KernelSparseTmaWarpSpecialized2SmSm100z:cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100z:cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100z7cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100z7cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100z8cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100z8cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100z@cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100z@cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise2SmSm100z3cutlass::gemm::KernelTmaWarpSpecialized1SmMxf4Sm100z3cutlass::gemm::KernelTmaWarpSpecialized2SmMxf4Sm100z3cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100z3cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100zIcutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103zIcutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103zIcutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103zIcutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103zTcutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103TmaPrefetchzTcutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103TmaPrefetchzTcutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103TmaPrefetchzTcutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103TmaPrefetchzXcutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103DisablePrefetchzXcutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103DisablePrefetchzXcutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103DisablePrefetchzXcutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103DisablePrefetchz:cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativezFcutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccumz7cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongzCcutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccumzFcutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8BlockwisezCcutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8BlockwisezBcutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledSm100zBcutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledSm100z;cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmNvf4Sm100z;cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmNvf4Sm100z;cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100z;cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100z?cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100z?cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100zQcutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103zQcutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103zQcutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103zQcutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103z\cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103TmaPrefetchz\cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103TmaPrefetchz\cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103TmaPrefetchz\cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103TmaPrefetchz`cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103DisablePrefetchz`cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103DisablePrefetchz`cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103DisablePrefetchz`cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103DisablePrefetchz4cutlass::gemm::KernelTmaWarpSpecializedMxf8f6f4Sm120z<cutlass::gemm::KernelTmaWarpSpecializedPingpongMxf8f6f4Sm120z0cutlass::gemm::KernelTmaWarpSpecializedNvf4Sm120z8cutlass::gemm::KernelTmaWarpSpecializedPingpongNvf4Sm120z0cutlass::gemm::KernelTmaWarpSpecializedMxf4Sm120z8cutlass::gemm::KernelTmaWarpSpecializedPingpongMxf4Sm120z.cutlass::gemm::KernelScheduleSparseF8f6f4Sm120z@cutlass::gemm::KernelTmaWarpSpecializedBlockwiseCooperativeSm120z=cutlass::gemm::KernelTmaWarpSpecializedBlockwisePingpongSm120 _cpasync_cpasync_warpspecialized!_cpasync_warpspecialized_pingpong$_cpasync_warpspecialized_cooperative_unspecialized_warpspecialized_warpspecialized_pingpong_warpspecialized_cooperative_warpspecialized_fp8_fastaccum*_warpspecialized_cooperative_fp8_fastaccum'_warpspecialized_pingpong_fp8_fastaccum_1sm_2sm_q_1sm_q_2sm_o_vs32_1sm_o_vs32_2sm_o_vs16_1sm_o_vs16_2sm_o_vs16_ultra_1sm_o_vs16_ultra_2sm_o_vs32_ultra_1sm_o_vs32_ultra_2sm_o_vs16_ultra_1sm_nopf_o_vs16_ultra_2sm_nopf_o_vs32_ultra_1sm_nopf_o_vs32_ultra_2sm_nopf_o_vs16_ultra_1sm_tmapf_o_vs16_ultra_2sm_tmapf_o_vs32_ultra_1sm_tmapf_o_vs32_ultra_2sm_tmapf_cooperative_q_pingpong_q_cooperative_o_vs16_pingpong_o_vs16_cooperative_o_vs32_pingpong_o_vs32_qc                      e Zd Z e            Z e            Z e            Z e            Z e            Z e            Z	 e            Z
 e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            Z e            ZdS )EpilogueScheduleTypeN)r   r   r   r
   r   EpilogueTransposedNoSmemWarpSpecializedPtrArrayNoSmemWarpSpecializedNoSmemWarpSpecialized1SmNoSmemWarpSpecialized2SmFastF32NoSmemWarpSpecialized1SmFastF32NoSmemWarpSpecialized2Sm!BlockwiseNoSmemWarpSpecialized1Sm!BlockwiseNoSmemWarpSpecialized2Sm PtrArrayNoSmemWarpSpecialized1Sm PtrArrayNoSmemWarpSpecialized2Sm'PtrArrayFastF32NoSmemWarpSpecialized1Sm'PtrArrayFastF32NoSmemWarpSpecialized2Sm)PtrArrayBlockwiseNoSmemWarpSpecialized1Sm)PtrArrayBlockwiseNoSmemWarpSpecialized2Smr   r   TmaWarpSpecialized1SmTmaWarpSpecialized2SmPtrArrayTmaWarpSpecialized1SmPtrArrayTmaWarpSpecialized2Smr   r   r   r   r	   r:  r:  %  s/       , y{{#)++"+)++&Y[[&Y[[$-IKK!$-IKK!&/ikk#&/ikk#%.Y[["%.Y[[",5IKK),5IKK).7ikk+.7ikk+ y{{"+)++#)++#)++"+)++"+)++'0y{{$*3)++'''r   r:  z3cutlass::epilogue::collective::EpilogueScheduleAutoz!cutlass::gemm::EpilogueTransposedz(cutlass::epilogue::NoSmemWarpSpecializedz0cutlass::epilogue::PtrArrayNoSmemWarpSpecializedz+cutlass::epilogue::NoSmemWarpSpecialized1Smz+cutlass::epilogue::NoSmemWarpSpecialized2Smz2cutlass::epilogue::FastF32NoSmemWarpSpecialized1Smz2cutlass::epilogue::FastF32NoSmemWarpSpecialized2Smz4cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Smz4cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Smz3cutlass::epilogue::PtrArrayNoSmemWarpSpecialized1Smz3cutlass::epilogue::PtrArrayNoSmemWarpSpecialized2Smz:cutlass::epilogue::PtrArrayFastF32NoSmemWarpSpecialized1Smz:cutlass::epilogue::PtrArrayFastF32NoSmemWarpSpecialized2Smz<cutlass::epilogue::PtrArrayBlockwiseNoSmemWarpSpecialized1Smz<cutlass::epilogue::PtrArrayBlockwiseNoSmemWarpSpecialized2Smz%cutlass::epilogue::TmaWarpSpecializedz0cutlass::epilogue::TmaWarpSpecializedCooperativez(cutlass::epilogue::TmaWarpSpecialized1Smz(cutlass::epilogue::TmaWarpSpecialized2Smz0cutlass::epilogue::PtrArrayTmaWarpSpecialized1Smz0cutlass::epilogue::PtrArrayTmaWarpSpecialized2Smz8cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperativez5cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong_epi_nosmem_epi_nosmem_fastf32_epi_tmac                   6    e Zd Z e            Z e            ZdS )EpilogueFunctor3xN)r   r   r   r
   LinearCombination!LinearCombinationBlockScaleFactorr   r   r	   rR  rR  w  s(        ikk&/ikk###r   rR  z,cutlass::epilogue::fusion::LinearCombinationz2cutlass::epilogue::fusion::LinCombBlockScaleFactorc           
          | t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j	        f	v S rk   )
r:  r   r   r   rJ  rK  rL  rM  r   r   )epilogue_schedule_types    r	   is_tma_epiloguerW    sJ    	%+6..66>;
$ 

 
r   c                    |s| S i t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j	        t           j
        t           j        t           j        t          j        t          j        t          j        t          j        t          j        t          j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j         i t           j!        t           j"        t           j#        t           j$        t          j%        t          j&        t          j'        t          j(        t          j)        t          j*        t          j+        t          j,        t          j-        t          j.        t          j/        t          j0        t           j1        t           j2        t           j3        t           j4        t           j5        t           j6        t           j7        t           j8        t           j9        t           j:        t           j;        t           j<        t           j=        t           j>        t           j?        t           j@        t           jA        t           jB        t           jC        t           jD        t           jE        t           jF        t           jG        t           jH        i}||          S rk   )Ir   r   r   r   r   r   r   r   r   r   r   r   r   r:  r   r<  r=  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rJ  rL  rK  rM  r>  rD  r?  rE  rB  rH  rC  rI  r   r   r   r   r   r   r   r   r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  )schedulegroupedgroup_schedule_maps      r	   to_grouped_scheduler\    s-   	 O)47I7o) =@R  AB) :=O={	)
 17I7l) @CU  DH) =CU  DE) +9M9p) 69M9s) .9M9k) 13E3h) 13E3h) 58J8q) 58J8q)  58J8q!)" 58J8q#)$ 9<N<y%)& 9<N<y') )( :=O={))* :=O={+), .0D0b-). .0D0b/)0 13G3h1)2 13G3h3)4 :<P<z5)6 :<P<z7): @BT  CG;)< @BT  CG=)> @BT  CG?)@ @BT  CGA)B OQc  ReC)D OQc  ReE)F OQc  ReG)H OQc  ReI)J KM_  N]K) )L KM_  N]KM_  N]KM_  N]Q) )V 
H	%%r   c                   J    e Zd Z e            Z e            Z e            ZdS )TileSchedulerTypeN)r   r   r   r
   Default
PersistentStreamKr   r   r	   r^  r^    s/        IKK'y{{*IKK'''r   r^  z"cutlass::gemm::PersistentSchedulerzcutlass::gemm::StreamKScheduler	_stream_kc                   6    e Zd Z e            Z e            ZdS )SideModeN)r   r   r   r
   LeftRightr   r   r	   rd  rd    s$        	$
)++%%%r   rd  zcutlass::SideMode::kLeftzcutlass::SideMode::kRightlsrsc                   6    e Zd Z e            Z e            ZdS )FillModeN)r   r   r   r
   LowerUpperr   r   r	   rj  rj    s$        
)++%
)++%%%r   rj  zcutlass::FillMode::kLowerzcutlass::FillMode::kUpperluc                   6    e Zd Z e            Z e            ZdS )DiagTypeN)r   r   r   r
   NonUnitUnitr   r   r	   rp  rp    s$        IKK'	$$$r   rp  zcutlass::DiagType::kNonUnitzcutlass::DiagType::kUnitnuunc                   r    e Zd Z e            Z e            Z e            Z e            Z e            ZdS )OpcodeClassN)	r   r   r   r
   SimtTensorOpWmmaTensorOpSparseTensorOpBlockScaledTensorOpr   r   r	   rv  rv    sH        	$Y[[(,9;;.!	r   rv  simttensoropwmma_tensorop
sptensorop
bstensoropzcutlass::arch::OpClassSimtzcutlass::arch::OpClassTensorOpz"cutlass::arch::OpClassWmmaTensorOpz$cutlass::arch::OpClassSparseTensorOpz)cutlass::arch::OpClassBlockScaledTensorOpc                       e Zd Z e            Z e            Z e            Z e            Z e            Z e            Z	 e            Z
dS )OperationKindN)r   r   r   r
   GemmRankKRank2KTrmmSymmConv2dConv3dr   r   r	   r  r  )  s[        	$
)++%9;;&	$	$9;;&9;;&&&r   r  gemmrank_krank_2ktrmmsymmconv2dconv3dc                   "    e Zd Z e            ZdS )TargetN)r   r   r   r
   r   r   r   r	   r  r  >  r   r   r  maxwellpascalvoltaturingampereadahopper)2   <   =   F   K   P   Y   Z   `      c      )	r  H   r  r  V   W   r  r  d   c                     | }d}|rCd}|                                 D ]*\  }}d|z  }t          j        |||          }||k    rd}|}+|C|S )NTFz\$\{%s\})itemsresub)templatevaluestextchangedkeyvalueregexnewtexts           r	   SubstituteTemplater  \  sw    	$' Gllnn  
Uc!eueT**g	Ddd 	  
+r   c                      e Zd Z e            Z e            Z e            Z e            Z e            Z e            Z	 e            Z
 e            Z e            Z e            Z e            Z e            Z e            ZdS )rl   N)r   r   r   r
   r  Sparse	UniversalUniversal3xSparseUniversal3xPlanarComplexPlanarComplexArrayGroupedrm   rw   rn   rs   rt   r   r   r	   rl   rl   l  s        	$9;;&ikk)	+ikk)++- y{{IKK'$9;; y{{"+)++" )	r   rl   spgemmgemm_planar_complexgemm_planar_complex_arraygemm_groupedc                   "    e Zd Z e            ZdS )	RankKKindNr   r   r   r
   r  r   r   r	   r  r            ikk)))r   r  c                   "    e Zd Z e            ZdS )TrmmKindNr  r   r   r	   r  r    r  r   r  c                   "    e Zd Z e            ZdS )SymmKindNr  r   r   r	   r  r    r  r   r  c                   6    e Zd Z e            Z e            ZdS )EpilogueFunctorN)r   r   r   r
   rS  LinearCombinationClampr   r   r	   r  r    s(        ikk$9;;r   r  z,cutlass::epilogue::thread::LinearCombinationz1cutlass::epilogue::thread::LinearCombinationClampc                   J    e Zd Z e            Z e            Z e            ZdS )MixedInputModeN)r   r   r   r
   ConvertOnly	ScaleOnlyScaleWithZeroPointr   r   r	   r  r    s2        	+ikk) y{{r   r  c                       e Zd Z e            Z e            Z e            Z e            Z e            Z e            Z	 e            Z
 e            Z e            ZdS )SwizzlingFunctorN)r   r   r   r
   	Identity1	Identity2	Identity4	Identity8
HorizontalStridedDgradIdentity1StridedDgradIdentity4StridedDgradHorizontalra  r   r   r	   r  r    st        ikk)ikk)ikk)ikk)y{{*#)++#)++$9;;IKK'''r   r  z=cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>z=cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>z=cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>z=cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>z<cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzlezEcutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>zEcutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<4>zDcutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzlez5cutlass::gemm::threadblock::ThreadblockSwizzleStreamKc                   8    e Zd Z e            fZ e            ZdS )GroupScheduleModeN)r   r   r   r
   DeviceHostr   r   r	   r  r    s&        9;;<&	$$$r   r  z5cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnlyz9cutlass::gemm::kernel::GroupScheduleMode::kHostPrecomputer  r  c                       e Zd ZdZdZdZdS )ConvKindr   r   rU   N)r   r   r   FpropDgradWgradr   r   r	   r  r    s        
%
%
%%%r   r  zcutlass::conv::Operator::kFpropzcutlass::conv::Operator::kDgradzcutlass::conv::Operator::kWgradfpropdgradwgradc                       e Zd ZdZdZdS )ConvModer   r   N)r   r   r   CrossCorrelationConvolutionr   r   r	   r  r    s        +++r   r  c                   "    e Zd ZdZdZdZdZdZdS )IteratorAlgorithmr   r   rU      rV   N)r   r   r   Analytic	OptimizedFixedChannelsFewChannelsFixedStrideDilationr   r   r	   r  r    s*        ()-+r   r  z+cutlass::conv::IteratorAlgorithm::kAnalyticz,cutlass::conv::IteratorAlgorithm::kOptimizedz0cutlass::conv::IteratorAlgorithm::kFixedChannelsz.cutlass::conv::IteratorAlgorithm::kFewChannelsz6cutlass::conv::IteratorAlgorithm::kFixedStrideDilationanalytic	optimizedfixed_channelsfew_channelsfixed_stride_dilationc                       e Zd ZdZdZdZdS )StrideSupportr   r   rU   N)r   r   r   StridedUnityFixedr   r   r	   r  r    s        '
%
%%%r   r  z&cutlass::conv::StrideSupport::kStridedz$cutlass::conv::StrideSupport::kUnityz$cutlass::conv::StrideSupport::kFixedunity_stridefixed_stridec                   ^    e Zd Z e            Z e            Z e            Z e            ZdS )	GroupModeN)r   r   r   r
   	NoneGroupSingleGroupMultipleGroup	Depthwiser   r   r	   r  r  (  s:        ikk)	+)++-ikk)))r   r  zcutlass::conv::GroupMode::kNonez&cutlass::conv::GroupMode::kSingleGroupz(cutlass::conv::GroupMode::kMultipleGroupz$cutlass::conv::GroupMode::kDepthwisesingle_groupmultiple_group	depthwise)r   r   r   c                   $    e Zd Zej        dfdZdS )MathInstructionNc                 h    || _         || _        || _        || _        || _        || _        || _        d S rk   )instruction_shape	element_a	element_belement_accumulatoropcode_classmath_operationelement_scale_factor)selfr  r  r  r  r  r  r  s           r	   __init__zMathInstruction.__init__C  s@     /DDNDN2D$D(D 4Dr   )r   r   r   r   r   r  r   r   r	   r  r  B  s3         &3%?#	5 5 5 5 5 5r   r  c                   $    e Zd Zg ddfdZd ZdS )TileDescription)r   r   r   Nc	                     || _         || _        || _        || _        || _        || _        || _        || _        || _        d S rk   )	threadblock_shape
tile_shapestages
warp_countmath_instructionminimum_compute_capabilitymaximum_compute_capabilitycluster_shapeexplicit_vector_sizes)	r  r  r  r  r  min_computemax_computer   r!  s	            r	   r  zTileDescription.__init__U  sM    .D'DODK DO,D&1D#&1D#&D!6Dr   c           	      <   | j         dk    rcd                    | j        d         | j        d         | j        d         | j        d         | j        d         | j        d         | j                  S d| j        d         | j        d         | j        d         | j        fz  S )Nr  z${tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{s}r   r   rU   )tbmtbntbkcmcnckrG   z%dx%d_%dx%d)r  formatr  r   r  )r  s    r	   procedural_namezTileDescription.procedural_name`  s    &",,3::$Q'$Q'$Q'"""K ;    d4Q79OPQ9RTXTjklTmosoz{{{r   r   r   r   r  r,  r   r   r	   r  r  S  sL        xxx  Z^ 	7 	7 	7 	7| | | | |r   r  c                       e Zd Zd Zd ZdS ).Direct2dConvFixedStrideDilationTileDescriptionc
                     |d         |d         z  |d         z  |d         |d         |d         z  g| _         || _        || _        || _        || _        || _        || _        || _        || _        |	| _	        d S Nr   r   rU   r  
r  threadblock_output_shapefilter_shaper  r  stridedilationr  r  r  
r  r3  r4  r  r5  r6  r  r  r"  r#  s
             r	   r  z7Direct2dConvFixedStrideDilationTileDescription.__init__o      6q9:RST:UUVnopVqq  tL  MN  tO  Q]  ^_  Q`  am  no  ap  Qp  qD$<D!$DDK DODKDM,D&1D#&1D###r   c                    d| j         d         | j         d         | j         d         | j        d         | j        d         | j        d         | j        d         | j        | j        d         | j        d         f
z  }| j        ddgk    rE| j        ddgk    r8|d| j        d         | j        d         | j        d         | j        d         fz  z  }|S Nz#%dx%dx%d_%dx%dx%dx%d_%d_filter%dx%dr   r   rU   r  z_stride%dx%d_dilation%dx%dr  r3  r  r4  r5  r6  r  str_names     r	   r,  z>Direct2dConvFixedStrideDilationTileDescription.procedural_name{      48Nq8Q&*&<Q&?&*&<Q&?&*&CA&F&*&CA&F&*&CA&F&*&CA&F&*k&*&7&:&*&7&:	8< 	<H {r2h4=RH#<#<.$+a.26+a.26-2B26-2B2D D Dh Or   Nr-  r   r   r	   r/  r/  n  2        
2 
2 
2    r   r/  c                       e Zd Zd Zd ZdS )r/  c
                     |d         |d         z  |d         z  |d         |d         |d         z  g| _         || _        || _        || _        || _        || _        || _        || _        || _        |	| _	        d S r1  r2  r7  s
             r	   r  z7Direct2dConvFixedStrideDilationTileDescription.__init__  r8  r   c                    d| j         d         | j         d         | j         d         | j        d         | j        d         | j        d         | j        d         | j        | j        d         | j        d         f
z  }| j        ddgk    rE| j        ddgk    r8|d| j        d         | j        d         | j        d         | j        d         fz  z  }|S r:  r<  r=  s     r	   r,  z>Direct2dConvFixedStrideDilationTileDescription.procedural_name  r?  r   Nr-  r   r   r	   r/  r/    r@  r   c                   $    e Zd Zdej        fdZdS )TensorDescriptionr   c                 >    || _         || _        || _        || _        d S rk   )elementlayout	alignmentcomplex_transform)r  rG  rH  rI  rJ  s        r	   r  zTensorDescription.__init__  s%    DLDKDN.Dr   Nr   r   r   rb   rc   r  r   r   r	   rE  rE    s1        23IYI^ / / / / / /r   rE  c                   0    e Zd Zdej        ej        fdZdS )SymmetricTensorDescriptionr   c                 Z    || _         || _        || _        || _        || _        || _        d S rk   )rG  rH  	fill_moderI  rJ  	side_mode)r  rG  rH  rO  rI  rJ  rP  s          r	   r  z#SymmetricTensorDescription.__init__  s1    DLDKDNDN.DDNNNr   N)r   r   r   rb   rc   rd  re  r  r   r   r	   rM  rM    s;        =>TdTiw  xE      r   rM  c                   $    e Zd Zdej        fdZdS )TriangularTensorDescriptionr   c                 h    || _         || _        || _        || _        || _        || _        || _        d S rk   )rG  rH  rP  rO  	diag_typerI  rJ  )r  rG  rH  rP  rO  rT  rI  rJ  s           r	   r  z$TriangularTensorDescription.__init__  s:    DLDKDNDNDNDN.Dr   NrK  r   r   r	   rR  rR    s1        STjzj / / / / / /r   rR  c                    | j         j        }| j         j        }| j        t          j        k    r| j        t          j        k    rt          | j
        j                 dk    rd}n t          | j
        j                 dk    rd}nd}t          | j
        j                 |d         z  |d         dz  z  dz  t          | j        j                 |d         z  |d         z  dz  z   |d         |d         dz  z  |z  z   }nt          | j
        j                 }t          | j
        j                 }|                                 rt          | j        j                 }||d         z  |d         z  dz  ||d         z  |d         z  dz  z   }||z  }|dz	  S )NrY   rU   rV   rW   r   r   
   )tile_descriptionr  r  operation_kindr  r  rp   rl   r  r   ArG  Bis_mixed_input)	operation	cta_shaper  elements_per_8b_mdsmem_per_stagedata_type_size_adata_type_size_b
smem_usages           r	   CalculateSmemUsagerc    s   (:)%,&!333	8Kx8^8^IK'(B..	ik)	*a	/	/!)+"561ESTYZIZ[_``!)+"561E	RSTXYYZq\Yq\Q%67;MMNNN
 $IK$78#IK$78!! ;%ik&9:%	!4y|CqH%	!4y|CqHIN &*

r   c                   "    e Zd ZdZdZdZdZdZdS )GemmUniversalModez.
  Types corresponding to GemmUniversalMode
  r   r   rU   r  N)r   r   r   __doc__r  GemmSplitKParallelBatchedArrayr   r   r	   re  re    s/          
$'
%%%r   re  c                       e Zd ZdZdZdZdZdS )
SplitKModez'
  Types corresponding to SplitKMode
  r   r   rU   N)r   r   r   rf  
NoneSplitKSerialParallelr   r   r	   rk  rk    s)          *&(((r   rk  (a  rf  enumr  r   r
   ImportErrorr   intEnumr   r   GeneratorTargetNamesr   r#   r%   r&   r/   r1   r3   r6   r8   r'   r(   r)   ShortDataTypeNamesr   r   r   r   r   r   r   r   r   r    r!   r"   r$   r+   r*   r,   r-   r.   r0   r2   r4   r5   r7   r?   r@   rA   rB   rC   rD   r9   r:   r;   r<   r=   r>   DataTypeNamesDataTypeTagr   r^   r_   r`   BlasModeTagrb   rc   rd   ComplexTransformTagComplexTransformTag3xrf   ri   rq   ru   rx   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   MathOperationTagr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	LayoutTagTransposedLayoutShortLayoutTypeNamesShortComplexLayoutNamesr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  KernelScheduleTagKernelScheduleSuffixesr:  r;  r<  r=  r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  rL  rM  EpilogueScheduleTagEpilogueScheduleSuffixesrR  rS  rT  EpilogueFunctor3xTagrW  r\  r^  r_  r`  ra  TileSchedulerTagTileSchedulerSuffixesrd  re  rf  SideModeTagShortSideModeNamesrj  rk  rl  FillModeTagShortFillModeNamesrp  rq  rr  DiagTypeTagShortDiagTypeNamesrv  rw  rx  ry  rz  r{  OpcodeClassNamesOpcodeClassTagr  r  r  r  r  r  r  r  OperationKindNamesr  ArchitectureNamesSharedMemPerCCr  rl   r  r  r  r  r  r  r  rm   rw   rn   rs   rt   GemmKindNamesr  RankKKindNamesr  TrmmKindNamesr  SymmKindNamesr  r  EpilogueFunctorTagr  r  r  r  r  r  r  r  r  r  SwizzlingFunctorTagr  r  r  GroupScheduleModeTagShortGroupScheduleModeNamesIntEnumr  r  r  r  ConvKindTagConvKindNamesr  r  r  r  r  r  r  IteratorAlgorithmTagIteratorAlgorithmNamesr  r  r  r  StrideSupportTagStrideSupportNamesr  r  r  r  r  GroupModeTagGroupModeNamesDynamicClusterShaper  r  r/  rE  rM  rR  rc  re  rk  r   r   r	   <module>r     sY&  B   				$$$$$$$    !S          di    9 / / / / /ty / / /f ,
-
-
,
,
,
-
-
+t
+t
+t /
-/
+t/ +t/ +t	/
 +t/ ,/ ,/ ,/ +t/ +t/ +t/ ,/ ,/ ,/ -/  -!/" +t#/ /$ +t%/& +t'/( -)/* -+/, --/. .'//0 .'1/2 ,3/4 -5/6 ,7/8 -9/: ,;/< -=/> .'?/@ -A/B .'C/D -E/ /F ,
,
,
-
-
-
,
,
,
-
-
-]/ /b/
-/
+"/ +"/ +"	/
 +y/ ,
/ ,
/ ,
/ +!/ +!/ +x/ ,	/ ,	/ ,	/ -(/  -(!/" +6#/ /$ +6%/& +6'/( -()/* -(+/, -(-/. .*//0 .*1/2 ,!3/4 -&5/6 ,7/8 -&9/: ,;/< -4=/> .9?/@ -*A/B .9C/D -+E/ /F ,5
,5
,4
-6
-6
-6
,4
,4
,3
-5
-5
-5]/ /b/
-/
+q/ +q/ +q	/
 +q/ ,/ ,/ ,/ +q/ +q/ +q/ ,/ ,/ ,/ -/  -!/" +q#/ /$ +q%/& +q'/( -)/* -+/, --/. .!//0 .!1/2 ,3/4 -5/6 ,7/8 -9/: ,;/< -=/> ."?/@ -A/B ."C/D -E/ /F ,
,
,
-
-
-
,
,
,
-
-
-]/ /f    ty    5
5    ty    ;@  )*  <<<   ` ` `\ \ \R R R
    * * *    	   ( ( ( ( (DI ( ( (  <%'M/1_44&(N%'L%'L-/[$&K-/\')P $       .82 $&R !#L	
 %'T "$N %'T "$N 4 6 8 6 8 @ @  @!" @#$ 468) 	2 *-z-$j&E!:#E%z'G"J$G%z'G"J$G.
 #$d %u %u	
 s !4 "E "E   '  ' Z Z  Z!" Z#$ ')  2 +013+013(-.(-.	 f9 f9 f9 f9 f9 f9 f9 f9Pg!#Rg!Bg +-Zg 35j	g
 68pg 4g ')Rg /1bg 24hg 35jg >  AAg ;=zg 35jg ;=}g  8:w!g$ /1b%g& /1b'g g* 79r+g, 79r-g0 79r1g2 79r3g6 57n7g8 57n9g< :<x=g> :<x?g@ 79rAgB 79rCgF 8:tGgH 8:tIgL @  CEMgN @  CEOgR 35jSgT 35jUgV 35jWg g gX 35jYg^ >  AL_g` >  ALagb >  ALcgd >  ALegh I  Lbigj I  Lbkgl I  Lbmgn I  Lbogr M  Pjsgt M  Pjugv M  Pjwgx M  Pjyg| :<x}g~ F  IQg@ 79rAgB C  FKCg g gF C  FNGgH @  CHIgL B  EIMgN B  EIOgP ;=zQgR ;=zSgT ;=zUgV ;=zWgX ?  BCYgZ ?  BC[g^ F  I\_g` F  I\agb F  I\cgd F  I\egf Q  Trggh Q  Trigj Q  Trkg gl Q  TrU  XzU  XzU  XzU  Xz?Aw<>|;=o8:t;=o8:tCEu@  CE=?~Mg g Tg!2gg +-Gg 35X	g
 68^g *g ');g /1Lg 24Rg 35Ug >@lg ;=fg 35Gg ;=[g  8:U!g$ /%g& /'g g* 7+g, 7-g0 71g2 73g6 5v7g8 5v9g< :F=g> :F?g@ 7AgB 7CgF 8&GgH 8&IgJ @&KgL @&MgP 3]QgR 3]SgT 3]Ug g gV 3]WgZ >@S[g\ >@S]g^ >@S_g` >@Sagd MOgegf MOgggh MOgigj MOgkgn IKdogp IKdqgr IKdsgt IKdugx :<Zygz FHt{g| 79T}g~ CEng g gB CEcCgD @B]EgH BFIgJ BFKgL ;]MgN ;]OgP ;]QgR ;]SgT ?UgV ?WgZ FH[[g\ FH[]g^ FH[_g` FH[agd UWoegf UWoggh UWoig gj UWoQSlQSlQSlQSl?AQ<m;=R8:L;=R8:LCT@BR=}Mg g R6 6 6 6 649 6 6 66#%Z)+N ,.X 46h	
 /1^ /1^ 68l 68l 8:p 8:p 79n 79n >@| >@| @  CA  @  CA!" )+R#$ 46h,.X,.X46h46h<>x9;r1  8#R)2 ,m 4m	
 / / 68M 68M 8- 8- 7 7 >@U >@U @-  @-!" ):#$ 4j,b,j4b4j<j9:1  62 2 2 2 2	 2 2 2 %'U57k   /& /& /&b    	    V D>  R[     ty    -+
.- -
.$     ty    .-
.- .#
.#     ty    1
-+ D
- $ $ $ $ $$) $ $ $ F
Ol!<  08@D!#N    DI    f)((     TY    	 	  	








 
   , , , , ,ty , , ," -
/8
f

h
/
:
N
!6
~
(.

&"    	   
 x
    ty   
 f
    ty   
 f
' ' ' ' 'di ' ' ' #%S(*] # # # # #TY # # #	 	 	 	 	ty 	 	 	 ]]]]](*q(*q)+qS
     	    SU  H&     t|    .3
.3
.3 .'
.'
.'    t|   
    	    KM!#U!Q')a  j{!#3')@     DI    A==  ~~     	    8AE=	 r+{	  ii 
5 5 5 5 5 5 5 5"| | | | | | | |6       B       B/ / / / / / / /       / / / / / / / /  <                s    ##