
    )`ipO                     4   d Z ddlZddlmZ ddlZddlZddlmZmZ ddl	m
Z ddlmZmZmZmZmZmZ dd	lmZmZ dd
lmZmZmZ defdZdefdZdefdZdefdZdefdZdefdZdefdZ defdZ!ej"        dfdej#        de$defdZ%defdZ&defdZ'dS )a3  
Copyright (c) 2024 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)product   )ArtifactPathCheckSumHash   )env)JitSpecgen_jit_specsm90a_nvcc_flagssm100a_nvcc_flagssm100f_nvcc_flagscurrent_compilation_context)	get_cubinget_meta_hash)dtype_cutlass_mapfilename_safe_dtype_mapwrite_if_differentreturnc                  ~    t          dt          j        dz  t          j        dz  t          j        dz  gddg          S )Ngemmz
bmm_fp8.cuzgroup_gemm.cuzflashinfer_gemm_binding.cuz-lcublasz
-lcublasLt)extra_ldflags)r
   jit_envFLASHINFER_CSRC_DIR     l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/jit/gemm/core.pygen_gemm_moduler   %   sM    ',6'/9'*FF	

 "<0   r   c                  N   t           j        dz  } t          j        | d           t           j        dz  g}t          t           j        dz            5 }t          j        |                                          }ddg}g d}|D ][\  }}}|D ]R}	| d	|	 d
| d
| d
| d	z  }
|	                    |
           |
                    |	|||          }t          |
|           S\	 d d d            n# 1 swxY w Y   t          j        g d          }t          d||ddgz   dg          S )Ngen_gemm_sm100_cutlass_fp4Texist_okzfp4_gemm_cutlass.cuzfp4_gemm_cutlass.jinja__nv_bfloat16half)   @   r%   r%      r%   )r%   r%   r(   )r%   r(   r(   fp4_gemm_cutlass__.cutypecta_mcta_ncta_k
         supported_major_versionsfp4_gemm_cutlass-DENABLE_BF16-DENABLE_FP4-DFAST_BUILDextra_cuda_cflagsextra_cflagsr   FLASHINFER_GEN_SRC_DIRosmakedirsr   openjinja2Templatereadappendrenderr   r   get_nvcc_flags_listr
   gen_directorysource_pathsfkernel_inst_templ
dtype_listcta_m_n_k_listr.   r/   r0   dtype	dest_pathsource
nvcc_flagss                r   !gen_gemm_sm100_module_cutlass_fp4rT   1   s   25QQMK----#&;;L 
g),DD	E	E 6"OAFFHH55%v.

 
 
 $2 	6 	6E5%# 6 6!L%LL%LL%LL%LLLM  ##I...*11	 2   #9f55556	66 6 6 6 6 6 6 6 6 6 6 6 6 6 60 -@!-  J $

 
      BC,,C03C0c                  J   t           j        dz  } t          j        | d           t           j        dz  g}t          t           j        dz            5 }t          j        |                                          }ddg}dg}|D ][\  }}}|D ]R}	| d	|	 d
| d
| d
| d	z  }
|	                    |
           |
                    |	|||          }t          |
|           S\	 d d d            n# 1 swxY w Y   t          j        dg          }t          d||ddgz   dg          S )Ngen_gemm_sm120_cutlass_fp4Tr    zfp4_gemm_cutlass_sm120.cuzfp4_gemm_cutlass_sm120.jinjar"   r#   r%   r%   r%   r)   r*   r+   r,   r4   r5   fp4_gemm_cutlass_sm120r8   r9   r:   r;   r>   rI   s                r   !gen_gemm_sm120_module_cutlass_fp4rZ   a   s   25QQMK----#&AAL 
g),JJ	K	K 6q"OAFFHH55%v.
 
 $2 	6 	6E5%# 6 6!L%LL%LL%LL%LLLM  ##I...*11	 2   #9f55556	66 6 6 6 6 6 6 6 6 6 6 6 6 6 6, -@"$  J  $

 
   s   BC++C/2C/c                  L   t           j        dz  } t          j        | d           t           j        dz  g}t          t           j        dz            5 }t          j        |                                          }ddg}g d}|D ][\  }}}|D ]R}	| d	|	 d
| d
| d
| d	z  }
|	                    |
           |
                    |	|||          }t          |
|           S\	 d d d            n# 1 swxY w Y   t          j        g d          }t          d||dgz   dg          S )Ngen_gemm_sm100_cutlass_fp8Tr    zfp8_gemm_cutlass.cuzfp8_gemm_cutlass.jinjar"   r#   )r&   r&   r%   r&   r%   r%   r&   r(   r%   r$   rX   r'   fp8_gemm_cutlass_r*   r+   r,   r1   r5   fp8_gemm_cutlassr8   r:   r;   r>   rI   s                r   !gen_gemm_sm100_module_cutlass_fp8rb      s   25QQMK----#&;;L 
g),DD	E	E 6"OAFFHH55%v.

 
 
 $2 	6 	6E5%# 6 6!L%LL%LL%LL%LLLM  ##I...*11	 2   #9f55556	66 6 6 6 6 6 6 6 6 6 6 6 6 6 64 -@!-  J $


 

 
 
 
rU   c                  L   t           j        dz  } t          j        | d           t           j        dz  g}t          t           j        dz            5 }t          j        |                                          }ddg}g d}|D ][\  }}}|D ]R}	| d	|	 d
| d
| d
| d	z  }
|	                    |
           |
                    |	|||          }t          |
|           S\	 d d d            n# 1 swxY w Y   t          j        g d          }t          d||dgz   dg          S )Ngen_gemm_sm100_cutlass_bf16Tr    zbf16_gemm_cutlass.cuzbf16_gemm_cutlass.jinjar"   r#   )r]   r^   r_   r$   rX   bf16_gemm_cutlass_r*   r+   r,   r1   r5   bf16_gemm_cutlassr8   r:   r;   r>   rI   s                r   "gen_gemm_sm100_module_cutlass_bf16rg      s   25RRMK----#&<<L 
g),EE	F	F 6!"OAFFHH55%v.

 
 
 $2 	6 	6E5%# 6 6!M5MM5MM5MM5MMMN  ##I...*11	 2   #9f55556	66 6 6 6 6 6 6 6 6 6 6 6 6 6 62 -@!-  J $'88
	   rU   c                  n   t           j        dz  } t          j        | d           g }dD ]4}t	          t           j        | dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          j
        t          j        g}t          j        t          j        g}ddg}dd	g}t          ||||          D ]\  }	}
}}t          |	         }t          |
         }| | d
| d
| d| d| d
z  }|                    |           |                    t$          |	         t$          |
         ||          }t'          ||           6d}t	          t           j        | dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          j
        t          j        g}t          j        t          j        g}dd	g}ddg}t          ||||          D ]\  }}}}t          |         }t          |         }| | d
| d
| d| d| d
z  }|                    |           |                    t$          |         dt$          |         ||          }t'          ||           dD ]w}t           j        |z  }| |z  }|                    |           t	          |d          5 }|                                }d d d            n# 1 swxY w Y   t'          ||           xt)          j        g d          }t-          d||          S )Ngen_gemm_sm100Tr    )gemm_groupwisegroup_gemm_fp8_groupwisez_sm100_kernel_inst.jinjatruefalse   r   r*   _major_mmaz	_sm100.cu)dtype_in	dtype_outscale_major_kmma_smgroup_gemm_mxfp4_groupwise_swapzcutlass::float_e2m1_t)dtype_adtype_bdtype_drt   swap_ab)zgemm_groupwise_sm100.cuz!group_gemm_fp8_groupwise_sm100.cuz#group_gemm_mxfp4_groupwise_sm100.cuzgemm_sm100_binding.cuzgroup_gemm_sm100_binding.curr1   r5   
gemm_sm100r<   )r   r?   r@   rA   rB   r   rC   rD   rE   torchfloat8_e4m3fnfloat8_e5m2float16bfloat16r   r   rF   rG   r   r   r   rH   r
   )rJ   rK   prefixrL   rM   dtype_in_listdtype_out_listscale_major_k_listmma_sm_listrq   rr   rs   rt   name_dtype_inname_dtype_outrQ   rR   dtype_a_listdtype_d_listswap_ab_listrw   ry   rz   name_dtype_aname_dtype_dfilenamesrc_pathrS   s                               r   gen_gemm_sm100_moduler      st   25EEMK----L@ 2 2'V*M*M*MM
 
 	: & 9 9	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: ,e.?@-8$g.!f:A>+={;
 ;
 	2 	26Hi 4H=M4Y?NhhmhhnhhMhhW]hhhi  	***&--*84+I6+	 .  F y&1111!	2" *F	g)v,O,O,OO	P	P 6TU"OAFFHH556 6 6 6 6 6 6 6 6 6 6 6 6 6 6'):;LM5>2La&KG$L-4lK. . . .)&' /w7.w7ZZ,ZZZZ6ZZZZZ[ 	 	I&&&"))%g.+%g. * 
 
 	9f---- . . .9!H,	I&&&(C   	AVVXXF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	9f----,@!-  J $   s6   'A??B	B	'F44F8;F8K22K6	9K6	c            
         t           j        dz  } |                     dd           g }d}t          j        t          j        g}t          j        t          j        g}ddg}t          t           j	        | dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          |||          D ]\  }}	}
t          |         }t          |	         }| | d| d| d	|
 d
z  }|                    |           |                    t"          |         t"          |	         |
          }t%          ||           d}t          t           j	        | dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          |||          D ]\  }}	}
t          |         }t          |	         }| | d| d| d	|
 d
z  }|                    |           |                    t"          |         t"          |	         |
          }t%          ||           dD ]w}t           j	        |z  }| |z  }|                    |           t          |d          5 }|                                }d d d            n# 1 swxY w Y   t%          ||           xt'          j        dg          }t+          d||          S )Ngen_gemm_sm120T)parentsr!   rj   rl   rm   z_sm120_kernel_inst.jinjar*   ro   z	_sm120.cu)rq   rr   rs   rk   )zgemm_groupwise_sm120.cuz!group_gemm_fp8_groupwise_sm120.cuzgemm_sm120_binding.cuzgroup_gemm_sm120_binding.cur{   r4   r5   
gemm_sm120r}   )r   r?   mkdirr~   r   r   r   r   rB   r   rC   rD   rE   r   r   rF   rG   r   r   r   rH   r
   )rJ   rK   r   r   r   r   rL   rM   rq   rr   rs   r   r   rQ   rR   r   r   rS   s                     r   gen_gemm_sm120_moduler   >  s   25EEMt444L F(%*;<MmU^4N '* 
g)v,O,O,OO	P	P 6TU"OAFFHH556 6 6 6 6 6 6 6 6 6 6 6 6 6 6 /6/ / . .*)]
 090;XX-XX.XXXXXY 	 	I&&&"))&x0'	2' * 
 

 	9f---- (F	g)v,O,O,OO	P	P 6TU"OAFFHH556 6 6 6 6 6 6 6 6 6 6 6 6 6 6 /6/ / . .*)]
 090;XX-XX.XXXXXY 	 	I&&&"))&x0'	2' * 
 

 	9f---- . . .9!H,	I&&&(C   	AVVXXF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	9f----,@"
  J $   s6   >'B11B58B53'F&&F*-F*J%%J)	,J)	c            
         t           j         d} d}t           j         d}t          |t          j                  }|sJ d|             t	          |          }t          |  d| d|          }|sJ | d            t          dt          j        d	z  gd
dddt           j         dgt          z   t          j	        | z  g          S )N/includeflashinferMetaInfo/checksums.txt!Failed to get checksums.txt from /.h.h not foundtrtllm_gemmztrtllm_gemm_runner.cu-DTLLM_GEN_EXPORT_INTERFACE-DTLLM_GEN_EXPORT_FLASHINFER-DTLLM_ENABLE_CUDA-DTLLM_GEN_GEMM_CUBIN_PATH=\"\"r<   extra_include_paths
r   TRTLLM_GEN_GEMMr   r   r   r
   r   r   r   FLASHINFER_CUBIN_DIRinclude_pathheader_namechecksum_pathchecksum	meta_hashmetainfos         r   gen_trtllm_gen_gemm_moduler     s    #2<<<L&K $3CCCM(DEEHHHHHHHH8h''I ))+))) H
 11111118'*AA	
 ** N\-INNN	
  %9LHI   r   FrP   use_sm_100fc                    | t           j        t           j        fvrt          d|  d          | t           j        k    rdnd}d| }t          j        d| z  }t          j        |d           t          j        d	z  g}t          t          j        d
z            5 }t          j        |                                          }ddd           n# 1 swxY w Y   g d}|D ]V\  }	}
}|d| d|	 d|
 d| d	z  }|                    |           |                    |	|
||          }t          ||           Wt!          |||rddgt"          z   nt$          t          j        t          j        g          S )a  
    Generate TGV GEMM module for SM100 architecture.

    Args:
        dtype: Data type for the GEMM operation (torch.bfloat16 or torch.float16)
        use_sm_100f: Whether to compile with SM100f flags (default: False), which makes the compiled kernel
            compatible with both B200 and B300 GPUs. However, it's only available with CUDA 12.9+.

    Returns:
        JitSpec for the TGV GEMM module
    zUnsupported dtype z*. Only bfloat16 and float16 are supported.bf16fp16	tgv_gemm_gen_tgv_gemm_Tr    ztgv_gemm.cuztgv_gemm.jinjaN))r&         )r&   r   r   )r&   r   r2   )r&   r   r4   )r&      r   )r&   r   r   )r&   r   r2   )r&       r   )r&   r   r   )r&   r&   r   )r%   r   r   r*   xr+   )r.   r/   	dma_stagerP   z--expt-relaxed-constexprz -DCUTLASS_ENABLE_GDC_FOR_SM100=1r   )r~   r   r   
ValueErrorr   r?   r@   rA   r   rB   rC   rD   rE   rF   rG   r   r
   r   r   FLASHINFER_INCLUDE_DIR)rP   r   	dtype_strmodule_namerJ   rK   rL   rM   cta_m_n_dma_listr.   r/   r   rQ   rR   s                 r   gen_tgv_gemm_sm10x_moduler     s    U^U]333RRRR
 
 	
  5>11vI)i))K25PY5P5PPMK----#m3L
 
g),<<	=	= 6"OAFFHH556 6 6 6 6 6 6 6 6 6 6 6 6 6 6   $4 . .uiR	RRERRERRIRRRR 	 	I&&&"))u	 * 
 
 	9f---- &.
 	  *'
   s   'CCCc                  :   t           j        dz  } t          j        | d           g }t	          t           j        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          j
        t          j
        ft          j        t          j        ft          j        t          j
        ft          j        t          j
        ft          j        t          j        ft          j        t          j        ffD ]}\  }}t          |         }t          |         }| d| d| dz  }|                    |           |                    t"          |         t"          |                   }	t%          ||	           ~d	D ]w}
t           j        |
z  }| |
z  }|                    |           t	          |d
          5 }|                                }	d d d            n# 1 swxY w Y   t%          ||	           xt'          d|t(                    S )Ngen_gemm_sm90Tr    z!group_gemm_sm90_kernel_inst.jinjagroup_gemm_r*   z_sm90.cu)rq   rr   )zgroup_gemm_sm90.cuzflashinfer_gemm_sm90_binding.cur{   	gemm_sm90r}   )r   r?   r@   rA   rB   r   rC   rD   rE   r~   r   r   r   r   r   rF   rG   r   r   r
   r   )rJ   rK   rL   rM   rq   rr   r   r   rQ   rR   r   r   s               r   gen_gemm_sm90_moduler     s{   2_DMK----L	g),OO	P	P 6TU"OAFFHH556 6 6 6 6 6 6 6 6 6 6 6 6 6 6 
&	(		em,		EM*		en-		EN+  . .) 090;R-RR.RRRR 	 	I&&&"))&x0'	2 * 
 
 	9f---- 	. 	. .9!H,	I&&&(C   	AVVXXF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	9f----*   s$   'A77A;>A;	G**G.	1G.	c            
         t           j         d} d}t           j         d}t          |t          j                  }|sJ d|             t	          |          }t          |  d| d|          }|sJ | d            t          dt          j        d	z  gd
dddt           j         dgt          z   t          j	        | z  gdg          S )Nr   r   r   r   r   r   r   trtllm_low_latency_gemmz!trtllm_low_latency_gemm_runner.cur   r   r   r   r   z-lcuda)r<   r   r   r   r   s         r   "gen_trtllm_low_latency_gemm_moduler   -  s   "2<<<L&K $3CCCM(DEEHHHHHHHH8h''I ))+))) H
 11111118!'*MM	
 ** N\-INNN	
  %9LHIj   r   )(__doc__r@   	itertoolsr   rC   r~   	artifactsr   r    r   r   corer	   r
   r   r   r   r   cubin_loaderr   r   utilsr   r   r   r   rT   rZ   rb   rg   r   r   r   r   rP   boolr   r   r   r   r   r   <module>r      s_     
			         3 3 3 3 3 3 3 3                      4 3 3 3 3 3 3 3 R R R R R R R R R R	 	 	 	 	-7 - - - -`+7 + + + +\/7 / / / /d+G + + + +\Lw L L L L^Rw R R R Rj"G " " " "L UH H;H6:HH H H HV'g ' ' ' 'T G            r   