
    )`i5                     ~   d dl Z d dlZd dlmZmZ ddlmZmZmZm	Z	m
Z
mZmZmZmZmZmZmZmZ ddlmZ  G d de j                  Z G d	 d
e j                  Zej        dej        dej        dej        diZej        dej        dej        dej        diZej        dej        diZ ddej        dej        diZ! G d de j                  Z"e"j#        de"j$        de"j%        de"j&        diZ'e"j#        de"j$        de"j%        d e"j&        d!iZ( G d" d#          Z) e)            Z*d$ Z+dJd%Z,e*d&e	j-        d'e	j.        d(e	j/        d)e	j0        d*e	j*        d+e	j1        d,e	j2        d-iZ3 G d. d/          Z4d0 Z5d1 Z6d2 Z7d3 Z8d4 Z9d5 Z:d6 Z;d7 Z<d8 Z=d9 Z>d: Z?d; Z@d< ZAd= ZBd> ZCd? ZDd@ ZEdA ZFdB ZGdC ZHdD ZI G dE dF          ZJdG ZKdH ZLdI ZMdS )K    N)chainproduct   )	enum_autoDataTypeNamesDataTypeSizeDataTypeDataTypeTagGemmKindGemmKindNamesKernelScheduleTypeKernelScheduleTagKernelScheduleSuffixesEpilogueScheduleTypeEpilogueScheduleTagEpilogueScheduleSuffixes   )is_cuda_version_at_leastc                   ^    e Zd Z e            Z e            Z e            Z e            ZdS )TrtLlm_EpilogueTagN)__name__
__module____qualname__r   epilogue_op_defaultepilogue_op_biasepilogue_op_siluepilogue_op_gelu     /home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/jit/gemm/cutlass/generate_kernels.pyr   r      s@        #)++ y{{ y{{ y{{r   r   c                   6    e Zd Z e            Z e            ZdS )TrtLlm_EpilogueFusionN)r   r   r   r   epilogue_fusion_noneepilogue_fusion_finalizer   r   r    r"   r"       s(        $9;;(y{{r   r"   lclc_biassilugeluz3tensorrt_llm::cutlass_extensions::EpilogueOpDefaultz0tensorrt_llm::cutlass_extensions::EpilogueOpBiasz7tensorrt_llm::cutlass_extensions::EpilogueOpDefaultSiluz9tensorrt_llm::cutlass_extensions::EpilogueOpDefaultFtGeluzFtensorrt_llm::TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONEzJtensorrt_llm::TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE EpilogueFusion_NONEEpilogueFusion_FINALIZEc                   ^    e Zd Z e            Z e            Z e            Z e            ZdS )TrtLlm_QuantOpN)r   r   r   r   per_column_scale_onlyfinegrained_scale_onlyfinegrained_scale_and_zerosnoner   r   r    r-   r-   A   s=        %IKK&Y[["+)++9;;DDDr   r-   csfgsfgsznoquantz1cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLYz2cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLYz7cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROSvoidc                       e Zd ZdS )	e2m1_typeN)r   r   r   r   r   r    r8   r8   [   s        Dr   r8   c                 J    t          | t                    rdS t          |          S )N   )
isinstancer8   r   )types    r    GetDataTypeBitsr=   b   s$    $	"" qr   c                 j    d}||rdnd}t          | t                    r|dz   S |t          |          z   S )Nr)   mx_nv_e2m1)r;   r8   r   )r<   	is_mx_fpxmxprefixs      r    GetDataTypeNamesrD   h   sH    H%0555$	"" !&  mD)))r   SafeFP4__nv_fp8_e4m3__nv_bfloat16halffloat__nv_fp4_e2m1zcutlass::float_ue8m0_tzcutlass::uint4b_tc                   $    e Zd Z	 	 	 	 ddZd ZdS )TrtLlm_GemmLauncherNFc                    || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        d S N)	gemm_kindarchact_typeweight_typescalezero_type	bias_typeoutput_typequant_opepi_tag	cta_shape
warp_shapestages	cga_shapedynamic_cgamainloop_scheduleepi_schedule
epi_fusionrB   swap_ab)selfrO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r]   r^   r_   rB   r\   r`   s                       r    __init__zTrtLlm_GemmLauncher.__init__   s    , #	 &,"& "$"&!2($"r   c                 v   d                     t          | j                 | j        t	          | j        | j                  t	          | j        | j                  t	          | j                  t	          | j	                  t	          | j
                  t          | j                 t          | j                 | j        d         | j        d         | j        d         | j        d         | j        d         | j        d         | j                  }d                     | j        d         | j        d         | j        d         t&          | j                 t*          | j                 t.          | j                 | j        rdnd| j        rdnd          }| j        d	k    r||z   S | j        d
k    rt5          d| j         d          |S )Nz1{}_sm{}_{}_{}_{}_{}_{}_{}_{}_{}x{}x{}_{}x{}x{}_{}r   r      z_{}x{}x{}{}{}{}{}{}_mxfpx_r)   _swap_abZ   d   SMz not supported yet.)formatr   rO   rP   rD   rQ   rB   rR   rS   rT   rU   QuantOpNamesrV   EpiTagNamesrW   rX   rY   rZ   r[   r   r]   r   r^   EpiFusionSuffixesr_   r`   
ValueError)ra   kernel_prefixhopper_suffixs      r    __repr__zTrtLlm_GemmLauncher.__repr__   s}   KRR$.)IT]DN;;T-t~>>T011T^,,T-..'%N1N1N1OAOAOAK!
 
& .44N1N1N1"4#9:$T%67do./IIR,.JJB	
 	
 9?? =00Y__@$)@@@AAAr   )NFFF)r   r   r   rb   rq   r   r   r    rL   rL      sF        $ )( ( ( (T# # # # #r   rL   c                 >    d| d          d| d          d| d          dS )Nzcute::Shape<cute::Int<r   z>, cute::Int<r   rd   z>>r   )shapes    r    tuple_to_cute_shapert      s2    ^E!H^^58^^RWXYRZ^^^^r   c                    t           | j                 }t           | j                 }t           | j                 }t           | j                 }t
          | j                 }t          | j                 }t          | j
                  }t          | j                  }t          | j                 }	d}
| j        t          | j                 }
| j        t"          j        k    rt&          | j                 }d                    g d| d| d| d| d| d| d| d| d| d|	 d|
 d| d| d| d| d| d| d	          }n | j        t"          j        k    r| j        | j        k    rc| j        t.          j        k    s| j        t2          k    r>t           | j                 }d
| d| d| d| d| d| d|	 d|
 d| d| d| d| d| d}nw| j        t4          j        t4          j        fv sJ |	                    dd           d| j         }t           | j                 }| j        J t@          | j                 }|!                    d          d         }|!                    d          d         }|
!                    d          d         }
|
                    dd          }
t2          dt.          j        dt.          j"        di}|#                    | j        d          }|#                    | j        d          }tI          | j%                  &                                }tI          | j'                  &                                }tI          d          &                                }tI          | j(                  &                                }d                    g d| d| d| d| d| d| d|
 d| d| d| j
        d          d| j
        d          d| j
        d          d| j        d          d| j        d          d| j        d          d| d| d| d| d          }|S ) Nr6   r)   z6
template void sm90_generic_mixed_gemm_kernelLauncher<, z,
z
> (
const z	*, const z*, const float,
zv*, int, int, int, const int, tensorrt_llm::cutlass_extensions::CutlassGemmConfig, char*, size_t, cudaStream_t, int*
);z:
template void sm90_generic_mixed_moe_gemm_kernelLauncher<z> (
GroupedGemmInput<zc>inputs, TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size);
z::Kernelz::KernelGroupedSm:1Smzdefined(ENABLE_FP4)zdefined(ENABLE_FP8)zdefined(ENABLE_BF16)1Fz
#if z && z3
        INSTANTIATE_TMA_WARP_SPECIALIZED_MOE_GEMM(z
,
        r   r   rd   z	);
#endif))CudaTypeNamerQ   rS   rT   rU   
QuantOpTagrV   EpiTagrW   rt   rX   r[   r   r]   r^   r   rO   r   Gemmr
   rR   joinGroupedr	   e4m3rA   r   TmaWarpSpecializedCooperative)TmaWarpSpecializedCooperativeFP8FastAccumreplacerP   r_   	EpiFusionsplitbf16getstrrB   lowerr\   r`   )	operationact_tagscale_zero_tagbias_tagout_tagrV   rW   cute_cta_shapecute_cga_shapekernel_sched	epi_sched
weight_taginstantiationarch_tagr_   	guard_map	guard_actguard_weightrB   use_dynamic_cgause_biasr`   s                         r    *instantiate_operation_tma_warp_specializedr      s{   9-.G!)":;NI/0H901G),-HY&'G()<==N()<==N$Y%@ALI)'	(>?	hm++ !67
     6=   AK   O]   ai   mt   	           "    	    	   
    
 $   
 /=   
 HV   
 ai    	      
	 0	0	0!666(-//93HD3P3P &i&;<J:A EO SZ   , 0< @I MU    (  ,3  7>  MM ."@"L3       ->??? -IN,,H%i&;<J'333"9#78J $))#..r2JmmC((,G!,,R0I!))r I
 +45I
 "i&8#>>I$==)>DDLI/006688I!)"788>>@@O5zz''))H)+,,2244G
 
 
 
 
 

 
 
 
 
 
 
 
3;
 
 
 
?F
 
 
 
JT
 
 
 
X_
 
 
 
 

 
 
 
 
 
 
 
 ",
 
 
 
 
	Q		
 
 
 
 $-#6q#9	
 
 
 
 >G=PQR=S	
 
 
 
 XaWjklWm	
 
 
 
 r{  rE  FG  rH	
 
 
 
 LU  L_  `a  Lb	
 
 
 

 

 
 
 

 &
 
 
 

 *2
 
 
 

 6=
 
 
 
 
 
M r   c                    t           | j                 }t           | j                 }t          | j                 }d| d| d| j        d          d| j        d          d| j        d          d| j         d| d| d| d| d	| d
}|S )NzF
            template void sm80_generic_fused_moe_gemm_kernelLauncher<rv   r   r   rd   z>
                    (z const* A, z const* B, z( const* biases, bool bias_is_broadcast, z* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy);)r
   dtyper~   rW   rX   stage)r   r   r   rW   r   s        r    instantiate_operation_sm80r   -  s,   )/*GY_-JY&'G{FM{ {Q[{ {_h_rst_u{ { zC  zM  NO  zP{ { T]  Tg  hi  Tj{ { nw  n}{ { AH{ {{ {*4{ {AH{ {ry{ { {M r   c                 n    | j         dk    rt          |           S | j         dk    rt          |           S d S )NP   rg   )rP   r   r   )r   s    r    instantiate_operationr   8  s@    ~))444	2		9)DDD 
	r   c                 4   |sJ t                      }| D ]}|                    d| d           d                    |          }t                      }|D ]$}|                    t          |                     %d                    |          }| d| d}|S )Nz
#include ""
zO
namespace tensorrt_llm
{
namespace kernels
{
namespace cutlass_kernels_oss
{

zX

} // namespace cutlass_kernels_oss
} // namespace kernels
} // namespace tensorrt_llm
)listappendr   r   )	launcher_inl_files
operationsinclude_listfileincludes
insts_listopinstantiationsfile_contents	            r    get_file_contentr   ?  s    :66L" 2 200001111yy&&HJ 5 5/334444YYz**N      L r   c                     t          j        |           D ]C\  }}}|D ]:}t           j                            ||          }||vrt          j        |           ;DdS )zARemove leftover generated files that weren't created in this run.N)oswalkpathr   remove)
output_dirgenerated_filesroot_dirsfilesr   	file_paths          r    clean_leftover_filesr   \  sq     gj11 % %eU 	% 	%DT400I//	)$$$	%% %r   c                    t          j        t           j                            |          d           t	          | |          }	 t          |d          5 }|                                |k    r	 d d d            d S 	 d d d            n# 1 swxY w Y   n# t          $ r Y nw xY wt          |d          5 }|                    |           d d d            d S # 1 swxY w Y   d S )NT)exist_okr)modew)	r   makedirsr   dirnamer   openreadFileNotFoundErrorwrite)r   r   output_filecontentfs        r    
write_filer   e  s   K,,t<<<<1:>>G+C((( 	Avvxx7""	 	 	 	 	 	 	 	"	 	 	 	 	 	 	 	 	 	 	 	 	 	 	    	k	$	$	$ 	                 sN   B B
0B >B 
BB BB 
B#"B#7CC!Cc                    | j         \  }}}| j        \  }}}| j        t          j        k    r| j        t          j        k    rdS |dk    s|dk    s|dk    rdS | j        dk    r*| j	        t          k    o| j        t          k    o	|dk    o|dv S |dvrdS | j	        t          k    s| j        t          k    r.|dvs|dk    rdS | j        d	k    r| j        t          j        k    rdS | j	        t          j        k    r'|d
k    s|dk    r|dk    r|dk    r|dk    p|d
z  dk    S |dz  dk    s|dk     s|dk    rdS |dz  dk    r|dz  dk    rdS dS )NFrd   r   g      )r      )@   r   )r   r   r   rh         r       r   r   T)rX   r[   r_   r"   r$   r^   r   PtrArrayTmaWarpSpecialized1SmrP   rQ   rA   rR    PtrArrayNoSmemWarpSpecialized1Smr	   r   )r   tile_mtile_n_cga_mcga_ncga_ks          r    is_gemm_op_valid_sm100r   s  s   FFA,E5% 	.GGGO3QQQu qyyEQJJ%1**u	w#~~K4 %$&%#% *$		
 Yu 
{dbn44''6S==5 GsNN#7#XXX5 
	%	%r\\Vq[[aZZEQJJ }0q 00 {a6B;;&3,,u qyA~~&2+**u4r   c                     | j         \  }}}| j        \  }}}|dk    r|dk    rdS |dk    r|dk    r|dk    rdS |dk    r|dk    r|dk    rdS |dk    r|dk    r|dk    r|dk    rdS dS )Nr   Trd   r   F)rX   r[   )r   r   r   r   r   r   s         r    is_gemm_op_validr     s    FFAlOE5!zzeqjjtzzeqjjVs]]tzzeqjjVs]]tzzeqjjVs]]v}}t5r   c                     t          |           sdS | j        t          j        k    rdS | j        | j        t
          j        k    rdS | j        t          j	        t          j
        fvrdS dS )NFT)r   rW   r   r   r^   r   NoSmemWarpSpecializedr]   r   r   r   r   s    r    is_grouped_gemm_op_validr     sz    B u	z';;;u 	#O3IIIu	8D$   u4r   c                     | j         dk    rt          |           S | j        t          j        k    rt          |           S | j        t          j        k    rt          |           S d S Nrh   )rP   r   rO   r   r   r   r   r   r   s    r    is_op_validr     s^    	w#~~%b)))	|x}$$###	|x''''+++ ('r   c            
      &   d} t           j        t           j        t           j        t           j        t           j        ft           j        t           j        t           j        t           j        t           j        ft           j        t           j        t           j        t           j        t           j        ft           j        t           j        t           j        t           j        t           j        ft           j        t           j        t           j        t           j        t           j        ft           j        t           j        t           j        t           j        t           j        fg}t          j        t          j        t          j	        g}t          j        g}ddg}g d}t          ||          }g d}d}t          ddgddgdg          }	t          |||||	          }
t                      }|
D ]\  }}}}}d	}|t          |d                   z  }||fz   }|d         dk    }|rt          j        nt          j        }|rt$          j        nt$          j        }t)          t*          j        | g|||||||||R  }t/          |          r|                    |           |S )
Nrg   r   r   r   r   r   r   r   )r:   r   r   r   r   rd      )r	   r   u4f16r   u8r-   r.   r/   r0   r   r   r   r   r=   r   r   TmaWarpSpecializedPingpongr   TmaWarpSpecializedrL   r   r   r   r   )rP   supported_dtypes	quant_opsepi_tagsM_TILESN_TILEScta_shapes_mnrY   rZ   
cga_shapespartial_argsr   dtype_comborV   rW   cta_shape_mnr[   
max_k_bitscta_shape_kcta_shape_mnkuse_coopr]   r^   fpA_intB_operations                           r    #generate_sm90_mixed_gemm_operationsr     sm   D 
X\8<N	X\8=(-P	x{HL(,M	X]HM8=Q	x{HL(,M	X]HM8=Q 	,-2I #34H3iG$$$GGW--MJF!Q!Q!--J)X}j L JCO  2  2?Xwi
 OKN$C$CC$~5?c) ?<<#> 	 9 >>%8 	 1M
 
 	

 
 
 
 
 
 
 
 
 
 )** 	20111r   c                 F   | sg S d}t           j        t           j        t           j        t           j        g}t
          j        g}t          j        g}dg}g d}t          t          ||                    dgz   }g d}d}	t          j        t          j        g}
ddg}t          d	d
gd	d
gd	g          }t          ||||
|||          }t                      }|D ]\  }}}}}}}d}|t          |          z  }||fz   }|t           j        k    rt          j        nt          j        }d }|g}|t           j        k    rt           j        t           j        g}|D ]P}t%          t&          j        |||||||||||	|||||          }t+          |          r|                    |           Q|S )Nrg   r   r   )r   r   r   r   r   r   TFr   rd   r   )r`   )r	   r   r   f32r   r-   r1   r   r   r   r   r"   r#   r$   r=   r   r   r   rL   r   r   r   r   )is_arch_enabledrP   r   r   r   r   r   r   rY   rZ   epi_fusionsr`   r   r   r   r   rV   rW   r_   r   r[   r   r   r   r]   r^   otypesotypemoe_gemm_operations                                r    %generate_sm90_grouped_gemm_operationsr	  ,  s    	D hmX\8=Q$%I"67HeG$$$G'2233zlBMJF 	26K
 UmG!Q!Q!--J L J 
.6 .6 	
 OE$:$::$~5 %% <<#M 	
 HM!!lHM2F 	6 	6E!4 !#" " "( -.. 6!!"4555-	6. r   c                 H   | sg S d}t           j        t           j        t           j        t           j        t           j        ft           j        t           j        t           j        t           j        t           j        fg}t          d          rst           j        t           j        t           j        t           j        t           j        ft           j        t           j        t           j        t           j        t           j        fg}ng }t          j	        g}t          j        g}ddg}g d}g d}t          t          |||                    }	ddg}g d}ddg}t          t          |||                    }
|
                    d	           g d
}d}t          t          ddgddgdg                    }t          ||||	|          }t          ||||
|          }t          ||          }t                      }|D ]\  }}}}}|d         dk    }|rt           j        t           j        gnt           j        g}t&          j        }|D ]h}|d         dk    r|d         dk    r|t           j        k    r+t)          t*          j        |g|||||||||R  }|                    |           i|S )Nrg   z12.8r   r   )r   r   r   r   )r   r   i   )r   r   r   r   r   r   r   r  r   r   rd   )r	   r   r   r   r   r   rA   ue8m0r-   r/   r   r   r   r   r   r   r   r   r   r   rL   r   r   )r  rP   supported_dtypes_int4supported_dtypes_fp4r   r   r   r   K_TILEScta_shapes_mnk_int4cta_shapes_mnk_fp4rY   rZ   r   partial_args_int4partial_args_fp4r   r   r   rV   rW   r   r[   r   mainloop_schedulesr^   r]   r  s                               r    0generate_sm90_mixed_type_grouped_gemm_operationsr    s    	D 
X\8<N	X]HM8=Q
  '' "\8=(.(,U	 
  "67I"67H3iGGooGwwAABB3iGllGCjGggw@@AAo...JFgq!fq!fqc2233Jy(4G  i3Ez  *,<==LJDP  2  2@Xwy #s* 	A"@"= 
 %?@ 	 ,I!3 	2 	2a C''!!$++%%CD D !4 " " 	"
 " " " " " "" " " " 01111+	2, r   c                     t                      }|                    t          |                      |                    t          |                      |S rN   )r   extendr	  r  r  r   s     r    generate_sm90_operationsr    sM    466J;OLLMMMFWWXXXr   c                 r    d}|t          |          z  }|t          j        k    r| d         dk    rd}| |fz   S )Nr   r   r   r   )r=   r	   r   )r   r   r   r   s       r    !calc_shape_mnk_sm100_grouped_gemmr    sF    J 6 66K<?a#7#7;.((r   c                    | sg S d}t           t          j        t           fg}t          j        g}t
          j        g}g dg dg dg dg}g d}d}t          j        t          j	        g}g dg}	d	d
g}
t          ||||||	|
          }t                      }|D ]\  }}}}}}}
t          j        }d }t          |t                    r|\  }}n||}}|t          j        k    r|t           k    r	|g dk    r[|g}|t          j        t           fv rt          j        t          j        g}|D ][}t%          t&          j        ||||||||||||||||t          j        k    o
|t           k    |
          }|                    |           \|S )Nx   r  )r   r   r   )r   r   r   )r   r   r   r  r   r   r   r   TF)rB   r`   )rA   r	   r   r-   r1   r   r   r"   r#   r$   r   r   r   r   r;   tupler   r   rL   r   r   r   )r  rP   r   r   r   cta_shapes_mnkrY   rZ   r  r   r`   r   r   r   rV   rW   r_   r   r[   r]   r^   rQ   rR   r  r  r  s                             r    &generate_sm120_grouped_gemm_operationsr!    s    	Dx}d34$%I"67H	N JF 	26K
 ))JUmG L J 
12 12 	 /LeU## 	1$)!Hkk$)5kH x}$$)<)<//t,,,lHM2F 	2 	2E!4 !#x}4L9L%" " "* 01111-	2. r   c                 $    t          |           }|S rN   )r!  r  s     r    generate_sm120_operationsr#  C  s    7HHJr   c                    | sg S t           j        t           j        t           j        t           j        t
          t           j        t
          fg}t          j        g}t          j	        g}ddg}g d}t          ||          }g d}d}	t          j        t          j        g}
t          j        t          j        g}ddg}dd	g}dd	g}t          ||||
|||||	  	        }t#                      }|D ]\	  }}}}}}}}}t%          |t&                    r|\  }}n|}t)          ||          }t*          j        }|g}|t           j        t
          fv rt           j        t           j        g}|D ]k}t/          t0          j        |||||||||||	|||||t           j        k    o
|t
          k    ||
          }t5          |          r|                    |           l|S )Nr   r   )r   r   r   r   r      r   r  r   r  )rd   r   r   TF)rB   r\   r`   )r	   r   r   r  r   rA   r-   r1   r   r   r   r"   r#   r$   r   r   r   r   r;   r  r  r   r   rL   r   r   r   r   )r  rP   r   r   r   cta_shapes_mcta_shapes_nr   rY   rZ   r  epi_schedulesr   r`   r\   r   r   r   rV   rW   r_   r   r[   r^   rR   r   r]   r  r  r  s                                 r    &generate_sm100_grouped_gemm_operationsr)  H  s8    		  $%I"67H9L111LL,77MJF 	26K 	=:M Y'JUmG-K
 
L J 
16 16 
	eU## 	 !&E;;K9,NN /LX]D)))lHM2F 	6 	6E!4 ! HM1IkT6I''" " ", -.. 6!!"45551	62 r   c                 &    t          | d          }|S )Nr   r)  r  s     r    generate_sm103_operationsr,        7MMJr   c                 &    t          | d          }|S r   r+  r  s     r    generate_sm100_operationsr/    r-  r   c                       e Zd Zd ZdS )GemmSm80LauncherConfigc                 Z    || _         || _        || _        || _        || _        || _        d S rN   )rO   rP   r   rW   rX   r   )ra   rO   rP   r   rW   rX   r   s          r    rb   zGemmSm80LauncherConfig.__init__  s0    "	
"


r   N)r   r   r   rb   r   r   r    r1  r1    s#            r   r1  c            	      4   d} t           j        t           j        g}t          j        t          j        g}g d}g d}t          ||||          }t                      }|D ]:\  }}}	}
t          t          j
        | |||	|
          }|                    |           ;|S )Nr   ))r   r   r   )r   r   r   )r   r   r   )r   r   r   )r   r   r   )rd   r   r:   )r	   r   r   r   r   r   r   r   r1  r   r   r   )rP   r   r   r   rZ   r   r   r   rW   r   r   items               r    +generate_sm80_fused_grouped_gemm_operationsr5    s    D hm4++H  N YYF+X~vNNLJ0<    ,wu%dE7M5
 
 	$r   c                 "    t                      }|S rN   )r5  r  s     r    generate_sm80_operationsr7    s    <>>Jr   c                 >   |                     d          t          j                            |           } d}d}d}d}t          j        df|gt          j        df|gt          j        df|gt          j        df|gt          j        d	f|gt          j        d
f|gi}fd}g }|t           |d	          p
 |d                    z  }|t           |d                    z  }|t           |d          p
 |d                    z  }|t           |d                    z  }|t           |d
          p
 |d                    z  }d }	d }
d}t                      }|D ]} |	|          r|j        |j        |j        d         |j        dk    o|j        t"          k    p|j         |
|          f}|                    |g           }t)          |          dk    st)          |d                   |k    r|                    |g           n|d                             |           |||<   g }|                                D ]\  }}|\  }}}}}t/          |          D ]\  }}t          j                            | t2          |         t5          |          dt2          |          d| d| |rdnd |rdnd d| d          }|r|gn||d d                  }t7          |||           |                    |           t9          | t;          |                     d S )N;zWtensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inlzTtensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inlz`tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inlzXtensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inlrg   rh   r   r  r   c                 8    |  v p|  dv p|  dv p|  dv S )Nz-realzf-realr   r   )smarchess    r    has_archz*generate_gemm_operations.<locals>.has_arch  sM    Gv "|||v%"}}}&" xxx6!		
r   y   Y   c                     dS NFr   r   s    r    should_skipz-generate_gemm_operations.<locals>.should_skip  s    ur   c                     t          | t                    rdS | j        | j        k    o9| j        t
          j        k    o$| j        t          j        k    p| j        t          k    S rA  )
r;   r1  rQ   rR   rO   r   r   r	   r   rA   r   s    r    is_mixed_dtype_groupedz8generate_gemm_operations.<locals>.is_mixed_dtype_grouped  s^    b011 	5 [BN* I!11I-G41G	
r   r   r   ry   cutlass_kernel_file__sm_M_BSr)   _Mixed_groupz.generated.curd   )r   r   r   abspathr   r   r   r#  r,  r/  r  r7  dictrO   rP   rX   rR   rA   rB   r   lenr   items	enumerater   r   r   r   r   set)r   architecturesfpA_intB_inlmoe_gemm_inlmoe_mixed_gemm_inlsm80_moe_gemm_inlinl_mapr=  r   rB  rD  
GROUP_SIZE	op_groupsr   dict_keyop_group	file_listkeyvaluerO   rP   mblock_scaleis_mixediop_sub_groupout_fileinl_filer<  s                               @r    generate_gemm_operationsre    s     %%F,,JlLiL{r 
l^		2		3,		3,		3,		2!2 3G
 
 
 
 
 J+HHSMM,JXXc]]KKKJ+HHSMM:::J+HHSMM,JXXc]]KKKJ*88B<<888J*88B<<+G88B<<HHHJ  
 
 
 JI ' ';r?? 	 LGLOGsNG$ 6 F",""2&&
 ==2..x==AXb\!2!2j!@!@OORD!!!!RL###&	(Ioo'' ' '
U471	4K(// 		' 		'OA|w||i(D		 b}Y'?  b  bD  b  bA  bXcOkuuik  b  zB  nJmumu  HJ  b  b  RS  b  b  b	 H 08M*++WS!W=MHxx888X&&&&		' S^^44444r   rN   )Nenumr   	itertoolsr   r   cutlass_libraryr   r   r   r	   r
   r   r   r   r   r   r   r   r   cpp_extr   Enumr   r"   r   r   r   r   rl   r~   r#   r$   r   rm   r-   r.   r/   r0   r1   rk   r}   r8   rA   r=   rD   r   r   r   r  r  r   r|   rL   rt   r   r   r   r   r   r   r   r   r   r   r   r	  r  r  r  r!  r#  r)  r,  r/  r1  r5  r7  re  r   r   r    <module>rk     s    				 $ $ $ $ $ $ $ $                              0 / / / / /
# # # # # # # #+ + + + +DI + + + *D'''	 *,a')[')b')d	
 .0x2  5A	 	".0E24M     TY    ($)5.	 (*])+_.0i	
	 	 	 	 	 	 	 	 y{{  * * * * 	)M?M?L&L'M?N,K$	N N N N N N N Nd_ _ _U U Up  E E E  :% % %  8 8 8v  &  ,, , ,G G GTQ Q QhW W Wt  ) ) )W W Wt  
e e eP  
  
         8  
^5 ^5 ^5 ^5 ^5r   