
    )`i              +          d Z ddlZddlmZ ddlZddlZddlmZ ddl	m
Z
mZmZmZmZ ddlmZmZ dd	lmZmZmZmZmZ d
dlmZ d
dlmZ dej        dej        dej        dedededededefdZdej        dej        dej        dej        dedededededefdZ dedej        dej        dej        dej        dedededefdZ!dedej        dej        dej        dej        dededede
fdZ"dej        dej        dej        dej        dededededefd Z#dej        dej        dej        dej        d!ed"ededed#ede
fd$Z$dedej        dej        dej        dededededed%edefd&Z%dej        dej        dej        d!ed'ed(ed)ed%edej        d*ed+ed,edefd-Z&dedej        dej        dej        dej        dededededed%edefd.Z'dedej        dej        dej        dej        dededededefd/Z(dej        dej        dej        dej        dedededededefd0Z)dej        dej        dej        dededededede
fd1Z*dedej        dej        dej        dededededed%ede
fd2Z+dej        dej        dej        d!ed'ed(ed)ed%edej        d*ed+ed,ede
fd3Z,dej        dej        dej        d!ed'ed(ed)ed%edej        d*ed+ed,ede
fd4Z-	 	 	 	 	 	 	 dRd6edej        dej        dej        dej        d!ed7ee         d8ee         d9ee         d:ee         d;ed<ed=ed'ed(ed)ed*ed+ed,ed%ede
f*d>Z.	 	 	 	 	 	 	 dRd6edej        dej        dej        dej        d!ed7ee         d8ee         d9ee         d:ee         d;ed<ed=ed'ed(ed)ed*ed+ed,ed%ede
f*d?Z/dej        dej        dej        dej        dededededede
fd@Z0dedej        dej        dej        dej        dededededed%ede
fdAZ1dedej        dej        dej        dej        dedededede
fdBZ2dej        dej        dej        dej        dededededefdCZ3	 	 	 dSd6edej        dej        dej        deded7ee         d8ee         d9ee         d:ee         dDed=edededede
f dEZ4	 	 	 	 	 dTded6edej        dej        dej        deded7ee         d8ee         d9ee         d:ee         dDed=edededed%edFede
f&dGZ5	 	 	 dSd6edej        dej        dej        dHej        deded7ee         d8ee         d9ee         d:ee         dDed=edededede
f"dIZ6	 	 	 	 	 dTded6edej        dej        dej        dHej        deded7ee         d8ee         d9ee         d:ee         dDed=edededed%edFede
f(dJZ7dej        dej        dej        dej        dedededededefdKZ8dej        dej        dej        dej        dededededede
fdLZ9dM Z:	 	 	 dSd6edej        dej        dej        dHej        deded7ee         d8ee         d9ee         d:ee         dDed=edededef dNZ;dO Z<dP Z=de
fdQZ>dS )Ua3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)List   )env)JitSpecgen_jit_specloggersm90a_nvcc_flagscurrent_compilation_context   )	get_cubinget_meta_hash)	dtype_mapfilename_safe_dtype_mapmask_mode_literalpos_encoding_mode_literalwrite_if_different   )generate_additional_params)enumerate_kernelsdtype_qdtype_kvdtype_ohead_dim_qkhead_dim_vopos_encoding_modeuse_sliding_windowuse_logits_soft_capreturnc                 x    dt           |           dt           |          dt           |          d| d| d| d| d| S )	N$single_decode_with_kv_cache_dtype_q_
_dtype_kv_	_dtype_o__head_dim_qk__head_dim_vo__posenc_	_use_swa__use_logits_cap_r   )r   r   r   r   r   r   r   r   s           t/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/jit/attention/modules.pyget_single_decode_urir*   +   s    	0/Fw/O 	0 	0+H5	0 	0*73	0 	0 #	0 	0 #		0 	0
 $	0 	0 &	0 	0 .	0 	0	    	dtype_idxc	                     dt           |           dt           |          dt           |          dt           |          d| d| d| d| d	| S )
N#batch_decode_with_kv_cache_dtype_q_r!   r"   _dtype_idx_r#   r$   r%   r&   r'   r(   	r   r   r   r,   r   r   r   r   r   s	            r)   get_batch_decode_urir1   A   s    	0.Eg.N 	0 	0+H5	0 	0*73	0 	0 -Y7	0 	0 #		0 	0
 #	0 	0 $	0 	0 &	0 	0 .	0 	0
r+   backendhead_dim_ckvhead_dim_kpeuse_profilerc                     dt           |          dt           |          dt           |          dt           |          d| d| d| | dk    rd	nd
z   S )Nbatch_mla_attention_dtype_q_r!   r"   r/   _head_dim_ckv__head_dim_kpe_
_profiler_fa3_sm90 r(   )r2   r   r   r   r,   r3   r4   r5   s           r)   get_batch_mla_urir>   Y   s    	#'>w'G 	# 	#+H5	# 	#*73	# 	# -Y7	# 	# %		# 	#
 %	# 	# !	# 	# u$$". .r+   c                     | dk    rt          d          t          | |||||||          }t          j        |z  }	t	          j        |	d           | dk    r2t          t          j        dz            5 }
t          j	        |

                                          }d d d            n# 1 swxY w Y   |	dz  }t          ||                    t          |         t          |         t          |         t          |         ||                     g }d	D ]w}t          j        |z  }|	|z  }|                    |           t          |d
          5 }
|

                                }d d d            n# 1 swxY w Y   t          ||           xnJ| dk    r1t          t          j        dz            5 }
t          j	        |

                                          }d d d            n# 1 swxY w Y   |	dz  }t          ||                    t          |         t          |         t          |         t          |         ||                     g }dD ]w}t          j        |z  }|	|z  }|                    |           t          |d
          5 }
|

                                }d d d            n# 1 swxY w Y   t          ||           xnt          d|            g }| dk    r
|t          z  }|r|dgz  }t!          |||          S )Nauto4backend should not be auto when jit_args is providedTexist_okfa2zbatch_mla_config.jinjazbatch_mla_config.inc)r   r   r   r,   r3   r4   )zbatch_mla_plan.cuzbatch_mla_run.cuzbatch_mla_binding.curr;   zbatch_mla_sm90_config.inc)zbatch_mla_sm90_plan.cuzbatch_mla_sm90_run.cuzbatch_mla_sm90_binding.cuzUnsupported backend: -DFLASHINFER_ENABLE_PROFILERextra_cuda_cflags)
ValueErrorr>   jit_envFLASHINFER_GEN_SRC_DIRosmakedirsopenFLASHINFER_CSRC_DIRjinja2Templatereadr   renderr   appendr	   r   )r2   r   r   r   r,   r3   r4   r5   urigen_directoryfconfig_templgenerated_config_pathsource_pathsfilenamesrc_path	dest_pathsourcerH   s                      r)   gen_batch_mla_moduler_   n   sD    &OPPP
	 	C 2S8MK----%'-0HHII 	5Q!?1668844L	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 -0F F!!'*"8,!'*#I.))    
	
 
	
 
	
 
 
	2 
	2H
 2X=H%0I	***h$$ "" " " " " " " " " " " " " " "y&1111
	2 
E		'-0HHII 	5Q!?1668844L	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 -0K K!!'*"8,!'*#I.))    
	
 
	
 
	
 
 
	2 
	2H
 2X=H%0I	***h$$ "" " " " " " " " " " " " " " "y&1111
	2 :::;;;%-- ><==+   sH   4'B''B+.B+E,,E0	3E0	-'G  G$'G$J%%J)	,J)	arcc                     dt           |           dt           |          dt           |          dt           |          d| d| d| d| S )	N'batch_decode_mla_with_kv_cache_dtype_q_r!   r"   r/   _head_dim_ckvr&   r'   _arc_r(   )r   r   r   r,   r3   r   r   r`   s           r)   get_batch_decode_mla_urire      s    	2I'2R 	 	+H5	 	*73	 	 -Y7	 	 $		 	
 &	 	 .	 	 	 		r+   head_dimnum_qo_headsuse_tensor_coresc	                    t           j                            d          j        }	|	dk    rd}
nd}
|rV|	dk    rP||
z  dk    rG| t           j        k    r7|t           j        k    r'|t           j        k    rt          j        d           d}nt          j        d           d	}t          | |||||||          }t          j	        |z  }t          j        |d
           t          t          j        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   |dz  }t#          ||                    t&          |          t&          |         t&          |         t&          |         ||dz  |
t)          |                                          t)          |                                          	  	                   g }|dk    rddg}ng d}g }|D ]w}t          j        |z  }||z  }|                    |           t          |d          5 }|                                }d d d            n# 1 swxY w Y   t#          ||           xt/          ||          S )Nr   	      @      z2Use tensor-core SM80 version of MLA decode kernel.sm80z4Fall back to cuda-core version of MLA decode kernel.	cuda_coreTrB   zbatch_decode_mla_config.jinjazmla_config.inc)	r   r   r   r,   r3   r4   qo_tile_lenr   r   zbatch_decode_mla_cute_sm80.cubatch_decode_mla_binding.cu)zbatch_decode_mla_plan.cuzbatch_decode_mla_run.curq   rE   )torchcudaget_device_propertiesmajorfloat16r   infore   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   r   rS   r   strlowerrT   r   )r   r   r   r,   rf   rg   r   r   rh   cuda_arch_majorrp   r`   rU   rV   rW   rX   rY   	filenamesrZ   r[   r\   r]   r^   s                          r)   gen_batch_decode_mla_moduler|      s    j66q99?O! 	q  ;&!++u}$$%%u}$$HIIIJKKK
"	 	C 2S8MK----	g),KK	L	L 1PQqvvxx001 1 1 1 1 1 1 1 1 1 1 1 1 1 1),<<g&x(g&	*!!Q#"#566<<>> #$7 8 8 > > @ @ 	 
	
 
	
   I
f}}+)
		

 
 
	 L . ..9!H,	I&&&(C   	AVVXXF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	9f----\***s$   5'D((D,/D,!II		I	use_fp16_qk_reductionc
                     dt           |          dt           |          dt           |          d| d| d| d| d| d	|	 | d
k    rdndz   S )N%single_prefill_with_kv_cache_dtype_q_r!   r"   r#   r$   r%   r&   r'   _f16qk_r;   r<   r=   r(   )
r2   r   r   r   r   r   r   r   r   r}   s
             r)   get_single_prefill_urir   <  s    	)0G0P 	) 	)+H5	) 	)*73	) 	) #	) 	) #		) 	)
 $	) 	) &	) 	) .	) 	) '	) 	) 8?%7G7GGGR	Q
r+   pos_encoding_mode_puse_sliding_window_puse_logits_soft_cap_ppos_encoding_mode_duse_sliding_window_duse_logits_soft_cap_dc                     dt           |           dt           |          dt           |          d| d| d| d| d|	 d	|
 d
| dt           |          d| S )Npod_with_kv_cache_dtype_q_r!   r"   
_head_dim_
_posenc_p__use_swa_p__use_logits_cap_p_
_posenc_d__use_swa_d__use_logits_cap_d_r/   r   r(   )r   r   r   rf   r   r   r   r}   r,   r   r   r   s               r)   get_pod_urir   U  s    	)%<W%E 	) 	)+H5	) 	)*73	) 	) 	) 	) (		) 	)
 *	) 	) 2	) 	) (	) 	) *	) 	) 2	) 	) -Y7	) 	) '	) 	)r+   c                     dt           |          dt           |          dt           |          dt           |          d| d| d| d| d	|	 d
|
 | dk    rdndz   S )N$batch_prefill_with_kv_cache_dtype_q_r!   r"   r/   r#   r$   r%   r&   r'   r   r;   r<   r=   r(   )r2   r   r   r   r,   r   r   r   r   r   r}   s              r)   get_batch_prefill_urir   s  s    		)/Fw/O 		) 		)+H5		) 		)*73		) 		) -Y7		) 		) #			) 		)
 #		) 		) $		) 		) &		) 		) .		) 		) '		) 		) 8?%7G7GGGR		Qr+   c	                     dt           |          dt           |          dt           |          dt           |          d| d| d| d| d	k    rd
ndz   S )N3batch_prefill_with_attention_sink_kv_cache_dtype_q_r!   r"   r/   r#   r$   r&   _r;   r<   r=   r(   )	r2   r   r   r   r,   r   r   r   r   s	            r)   $get_batch_prefill_attention_sink_urir     s    	)>UV]>^ 	) 	)+H5	) 	)*73	) 	) -Y7	) 	) #		) 	)
 #	) 	) &	) 	) 	) 8?%7G7GGGR	Qr+   c	                    dt           |           dt           |          dt           |          dt           |          d| d| d| dt          |                                           d	t          |                                           S )
N&batch_attention_with_kv_cache_dtype_q_r!   r"   r/   r#   r$   r%   _use_logits_soft_cap__use_profiler_)r   rx   ry   )	r   r   r   r,   r   r   r   r   r5   s	            r)   get_batch_attention_urir     s    	41H1Q 	4 	4+H5	4 	4*73	4 	4 -Y7	4 	4 #		4 	4
 #	4 	4 $	4 	4  ##677==??	4 	4 L))//11	4 	4
r+   c                 L   t          | |||||||          }t          || ||||dgdgg dg ddt          |                                           dt          |                                           dt          |dk                                               dd	|||
          S Nmaybe_alibi_slopesfloatlogits_soft_capsm_scalerope_rcp_scalerope_rcp_thetadoubler   r   r   zDefaultAttention<false, , r   >+#include<flashinfer/attention/variants.cuh>)r   r   r   )r*   "gen_customize_single_decode_modulerx   ry   )	r   r   r   r   r   r   r   r   rU   s	            r)   gen_single_decode_moduler     s#     	 	C .				
 	
 	
 	100 	Q3'9#:#:#@#@#B#B  	Q  	QcJ]F^F^FdFdFfFf  	Q  	Qjmn  DE  oE  kF  kF  kL  kL  kN  kN  	Q  	Q  	Q5+-/)   r+   c
                 d   t          | |||||||||	
  
        }
|t          j        t          j        fv }| dk    r|r
J d            ddg}ddg}g d}g d}d	t	          |                                           d
t	          |                                           d
t	          |dk                                               d}d}nL|s6dg}dg}g d}g d}dt	          |                                           d}d}ng d}g d}g d}g d}d}d}t          | |
|||||||||||||||	|          S )NrD   /fp8 tensor core is not supported in fa2 backendmaybe_custom_maskr   uint8_tr   r   r   "DefaultAttention<use_custom_mask, r   r   r   r   maybe_scale_v)r   r   scale_v_scalar)r   r   r   DefaultAttention<2#include<flashinfer/attention/hopper/variants.cuh>maybe_scale_qmaybe_scale_kr   r   r   r   r   scale_q_scalarscale_k_scalarr   DefaultFP8Attentionr   r   r   r}   fp8_enabled)r   rr   float8_e4m3fnfloat8_e5m2rx   ry   #gen_customize_single_prefill_module)r2   r   r   r   r   r   r   r   r   r}   rU   r   additional_tensor_namesadditional_tensor_dtypesadditional_scalar_namesadditional_scalar_dtypesvariant_namevariant_decls                     r)   gen_single_prefill_moduler     sD    ! C  e153DEEK%QQ QQQ#68L"M$-w#7 #
 #
 #
 $L#K#K  jC@R<S<S<Y<Y<[<[  j  j_bcv_w_w_}_}__  j  j  DG  HY  ]^  H^  D_  D_  De  De  Dg  Dg  j  j  jD 	P'6&7#(/y$&W&W&W#'E'E'E$Rs3F/G/G/M/M/O/ORRRLOLL' ' '#
 (C'B'B$' ' '# (P'O'O$0LOL.  +-/3%   r+   c                 T   t          | |||||||||	|
|          }ddg}ddg}g d}g d}dt          |                                           dt          |                                           dt          |d	k                                               d
}dt          |
                                           dt          |                                           dt          |	d	k                                               d
}d}t          || |||||||||||||||	|
||          S )Nr   r   r   r   r   r   r   r   r   $DefaultAttention<use_custom_mask_p, r   r   r   $DefaultAttention<use_custom_mask_d, r   r   r   r   r   r   r   r}   )r   rx   ry   gen_customize_pod_moduler   r   r   rf   r   r   r   r}   r,   r   r   r   rU   r   r   r   r   variant_name_pvariant_name_dr   s                       r)   gen_pod_moduler   B  s7     C  34HI )73    DCC pC@T<U<U<[<[<]<]  p  padeza{a{  bB  bB  bD  bD  p  p  HK  L_  cd  Ld  He  He  Hk  Hk  Hm  Hm  p  p  pN pC@T<U<U<[<[<]<]  p  padeza{a{  bB  bB  bD  bD  p  p  HK  L_  cd  Ld  He  He  Hk  Hk  Hm  Hm  p  p  pN@L#  /13/133)   r+   c                 Z   dt          | |||||||||	|
|          z   }ddg}ddg}g d}g d}dt          |                                           d	t          |                                           d	t          |d
k                                               d}dt          |
                                           d	t          |                                           d	t          |	d
k                                               d}d}t          || |||||||||||||||	|
||          S )Nbatch_r   r   r   r   r   r   r   r   r   r   r   r   r   )r   rx   ry   gen_customize_batch_pod_moduler   s                       r)   gen_batch_pod_moduler     s>    [  C  34HI )73    DCC pC@T<U<U<[<[<]<]  p  padeza{a{  bB  bB  bD  bD  p  p  HK  L_  cd  Ld  He  He  Hk  Hk  Hm  Hm  p  p  pN pC@T<U<U<[<[<]<]  p  padeza{a{  bB  bB  bD  bD  p  p  HK  L_  cd  Ld  He  He  Hk  Hk  Hm  Hm  p  p  pN@L)  /13/133)   r+   FrU   r   r   r   r   r   r   r   c                 >   t           j        | z  }t          ||||	          \  }}}t          t           j        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          t           j        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   i d|d|d|d|d|
d|d	t          |         d
t          |         dt          |         dt          |         d|d|dt          |         dt          |         dt          |                                          dt          |                                          dt          |                                          t          |                                          t          |                                          d} |j        d i |}t          j        |d           |dz  }t          ||           g }dD ]g}dD ]b} t           |         |d<   t           |          |d<   d| d|  d}!||!z  }"|                    |"            |j        d i |}#t          |"|#           chdD ]w}!t           j        |!z  }$||!z  }"|                    |"           t          |$d          5 }|                                }#d d d            n# 1 swxY w Y   t          |"|#           xt%          | |          S )!Nzpod_customize_config.jinjazpod_kernel_inst.jinjaadditional_func_paramsadditional_params_decladditional_params_setterr   r   r   r   r   r   idtyper   r   r   r   r   r   r   r   r}   TrB   zpod_config.incr   r   r   r   mask_mode_pmask_mode_dpod_kernel_mask_p_d.cu)zpod.cuzpod_jit_binding.curE    rJ   rK   r   rN   rO   rP   rQ   rR   r   r   rx   ry   rS   rL   rM   r   r   rT   r   %rU   r   r   r   r,   rf   r   r   r   r   r   r   r   r   r   r   r   r   r   r}   rV   r   r   r   rW   rX   kernel_inst_templkwargsgenerated_inc_strrY   rZ   r   r   r[   r]   r^   r\   s%                                        r)   r   r     s   , 2S8M 	#  		 		  
g),HH	I	I 1Qqvvxx001 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
g),CC	D	D 6"OAFFHH556 6 6 6 6 6 6 6 6 6 6 6 6 6 6 "8 "8 	#$< 		
 	. 	. 	9W% 	Ih' 	9W% 	)I& 	x 	x 	89LM 	89LM 	$8 9 9 ? ? A A  	 %:!;!;!A!A!C!C!" 	$8 9 9 ? ? A A#$ "%%:!;!;!A!A!C!C!$%:!;!;!A!A!C!C'  F, ,+  
  K----),<<,.?@@@L# 2 2' 
	2 
	2K$5k$BF=!$5k$BF=!J+JJJJJH%0I	***-&-   F y&1111
	2 	. 	. .9!H,	I&&&(C   	AVVXXF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	9f----\***6   'A55A9<A9'CCCK33K7	:K7	c                 >   t           j        | z  }t          ||||	          \  }}}t          t           j        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          t           j        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   i d|d|d|d|d|
d|d	t          |         d
t          |         dt          |         dt          |         d|d|dt          |         dt          |         dt          |                                          dt          |                                          dt          |                                          t          |                                          t          |                                          d} |j        d i |}t          j        |d           |dz  }t          ||           g }dD ]g}dD ]b} t           |         |d<   t           |          |d<   d| d|  d}!||!z  }"|                    |"            |j        d i |}#t          |"|#           chdD ]w}!t           j        |!z  }$||!z  }"|                    |"           t          |$d          5 }|                                }#d d d            n# 1 swxY w Y   t          |"|#           xt%          | |          S )!Nz batch_pod_customize_config.jinjazbatch_pod_kernel_inst.jinjar   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   TrB   zbatch_pod_config.incr   r   r   batch_pod_kernel_mask_r   r   )zbatch_pod.cuzbatch_pod_jit_binding.curE   r   r   r   s%                                        r)   r   r   (  s   , 2S8M 	#  		 		  
g),NN	O	O 1STqvvxx001 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
g),II	J	J 6a"OAFFHH556 6 6 6 6 6 6 6 6 6 6 6 6 6 6 "8 "8 	#$< 		
 	. 	. 	9W% 	Ih' 	9W% 	)I& 	x 	x 	89LM 	89LM 	$8 9 9 ? ? A A  	 %:!;!;!A!A!C!C!" 	$8 9 9 ? ? A A#$ "%%:!;!;!A!A!C!C!$%:!;!;!A!A!C!C'  F, ,+  
  K----),BB,.?@@@L# 2 2' 
	2 
	2K$5k$BF=!$5k$BF=!PPP{PPPH%0I	***-&-   F y&1111
	2 	. 	. .9!H,	I&&&(C   	AVVXXF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	9f----\***r   c	                 P   t          | ||||||||	  	        }	t          |	| |||||dgdgg dg ddt          |                                           dt          |                                           dt          |dk                                               dd	|||
          S r   )r1   !gen_customize_batch_decode_modulerx   ry   )
r   r   r   r,   r   r   r   r   r   rU   s
             r)   gen_batch_decode_moduler     s)    
 
C -				
 	
 	
 	100 	Q3'9#:#:#@#@#B#B  	Q  	QcJ]F^F^FdFdFfFf  	Q  	Qjmn  DE  oE  kF  kF  kL  kL  kN  kN  	Q  	Q  	Q5+-/+   r+   c                    t          | |||||||||	|
          }|t          j        t          j        fv }| dv sJ d|              |t          j        t          j        fvs
J d            | dk    r|r
J d            g d}g d}g d}g d	}d
t	          |                                           dt	          |	                                           dt	          |dk                                               d}d}nN|s8g d}g d}g d}g d}dt	          |	                                           d}d}ng d}g d}g d}g d}d}d}t          | ||||||||||||||||	|
|          S )N)rD   r;   z?backend must be fa2 or fa3 in gen_batch_prefill_module(), got: z3FP8 output is not supported in fa2/fa3 backends yetrD   r   )r   maybe_mask_indptrr   maybe_prefix_len_ptrmaybe_token_pos_in_items_ptrmaybe_max_item_len_ptr)r   int32_tr   uint32_tuint16_tr   )r   r   r   r   token_pos_in_items_len)r   r   r   r   int64_tr   r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   )r   rr   r   r   rx   ry   "gen_customize_batch_prefill_module)r2   r   r   r   r,   r   r   r   r   r   r}   rU   r   r   r   r   r   r   r   s                      r)   gen_batch_prefill_moduler     s      C" e153DEEKn$$$S'SS %$$ 5.0ABBBB= CBB %QQ QQQ#
 #
 #
$
 $
 $
 #
 #
 #
 $W#V#V  jC@R<S<S<Y<Y<[<[  j  j_bcv_w_w_}_}__  j  j  DG  HY  ]^  H^  D_  D_  De  De  Dg  Dg  j  j  jD  	P' ' '# (U'T'T$' ' '# (Q'P'P$Rs3F/G/G/M/M/O/ORRRLOLL' ' '#
 (C'B'B$' ' '# (P'O'O$0LOL-  +-/3'   r+   c	                     ddl m}	 t          | ||||||||	  	        }
t          | |
||||||dgdgdgdgd|	|          ||ddd	          S )
Nr   )attention_sink_declsinkr   r   r   AttentionSinkFr   )!flashinfer.jit.attention.variantsr   r   r   )r2   r   r   r   r,   r   r   r   r   r   rU   s              r)   'gen_batch_prefill_attention_sink_moduler  8  s     FEEEEE
.
 
C .					
G$+-!#'   r+   c	                     t          | ||||||||	  	        }	g }
g }g }g }dt          |                                           d}d}t          |	| ||||||
||||||||          S )NzStandardAttention<r   r   )r   r   r5   )r   rx   ry   $gen_customize_batch_attention_module)r   r   r   r,   r   r   r   r   r5   rU   r   r   r   r   r   r   s                   r)   gen_batch_attention_moduler  h  s     "
 
C *,*,)+*,K,?(@(@(F(F(H(HKKKL@L/  +/!!   r+   r   c                    t           j        | z  }t          ||||	          \  }}}t          t           j        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          t           j        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   |||||
t          |         t          |         t          |         ||t          |         t          |                                          t          |                                          d} |j        d
i |}t          j        |d           g }|dz  }|                    |            |j        d
i |}t!          ||           dD ]w}t           j        |z  }||z  }|                    |           t          |d          5 }|                                }d d d            n# 1 swxY w Y   t!          ||           x|d	z  }t!          ||           t#          | |          S )Nz$single_decode_customize_config.jinjazsingle_decode_kernel_inst.jinja)r   r   r   r   r   r   r   r   r   r   r   r   r   TrB   zsingle_decode_kernel.cu)zsingle_decode.cuzsingle_decode_jit_binding.curE   zsingle_decode_config.incr   )rJ   rK   r   rN   rO   rP   rQ   rR   r   r   rx   ry   rS   rL   rM   rT   r   r   )rU   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rV   r   r   r   rW   rX   r   r   r   rZ   r]   r^   r[   r\   rY   s                                 r)   r   r     s;   " 2S8M 	#  		 		  
#&LL
 
 1	
qvvxx001 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 
g),MM	N	N 6RS"OAFFHH556 6 6 6 6 6 6 6 6 6 6 6 6 6 6 #9"8$<$$W%h'W%""67HI!"455;;=="#677==?? F  ,+  
  K----L 99I	"""%%  
 F y&))) 	. 	. .9!H,	I&&&(C   	AVVXXF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	9f----),FF,.?@@@\***s6   'A55A9<A9'CCC-HH	H	r   c                 	   ||t           |         t           |         t           |         ||t          |         t          |                                          t          |                                          t          |                                          d}| dk    rt	          d          | dk    rt
          j        |z  }t          |||	|
          \  }}}t          t
          j	        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          t
          j	        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   ||||dz  } |j        di |}t          j        |d	           g }d
D ]L}d| d}||z  }|                    |            |j        ddt"          |         i|}t%          ||           MdD ]w}t
          j	        |z  } ||z  }|                    |           t          | d          5 }|                                }d d d            n# 1 swxY w Y   t%          ||           x|dz  }!t%          |!|           t'          ||          S | dk    rt
          j        |z  }t          |||	|
d          \  }}}d}"|rd}#d}$nd}#d}$t          t
          j	        |"z            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          t
          j	        |#z            5 }t          j        |                                          }d d d            n# 1 swxY w Y   ||||dz  } |j        di |}t          j        |d	           g }d
D ]L}d| d}||z  }|                    |            |j        ddt"          |         i|}t%          ||           M|$dfD ]w}t
          j	        |z  } ||z  }|                    |           t          | d          5 }|                                }d d d            n# 1 swxY w Y   t%          ||           x|dz  }!t%          |!|           t'          ||t(                    S t	          d|            )N)r   r   r   r   r   r   r   r   r   r   r}   r@   rA   rD   z%single_prefill_customize_config.jinjaz single_prefill_kernel_inst.jinja)r   r   r   TrB   r   single_prefill_kernel_mask_.cu	mask_mode)zsingle_prefill.cuzsingle_prefill_jit_binding.curE   zsingle_prefill_config.incr;   is_sm90_templatez*single_prefill_sm90_customize_config.jinjaz)single_prefill_fp8_sm90_kernel_inst.jinjazsingle_prefill_fp8_sm90.cuz%single_prefill_sm90_kernel_inst.jinjazsingle_prefill_sm90.cu single_prefill_sm90_kernel_mask_z"single_prefill_sm90_jit_binding.cuzsingle_prefill_sm90_config.incrG   Invalid backend: r   r   r   rx   ry   rI   rJ   rK   r   rN   rO   rP   rQ   rR   rS   rL   rM   rT   r   r   r   r	   )%r2   rU   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r}   r   r   rV   r   r   r   rW   rX   r   r   rZ   r
  r[   r]   r^   r\   rY   _file_config_file_kernel_inst
_file_csrcs%                                        r)   r   r     s3   * %$W%h'W%""67HI!"455;;=="#677==??!$%:!;!;!A!A!C!C F &OPPP	E		6<&'('(	  	Q 68P '*QQ
 
 	5!?1668844L	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5
 '*LL
 
 	: & 9 9	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	:
 	&<&<(@
 
 	
 0L/ 
 

 
 	MD1111% 	2 	2ICYCCCH%0I	***-&-  +I6 F y&1111
 		2 		2H 2X=H%0I	***h$$ "" " " " " " " " " " " " " " "y&1111 -0K K02CDDDC...	E		6< ''('(!%   	S	!79Q D 	2 K5JJ G1J'-<== 	5!?1668844L	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 '-0AABB 	:a & 9 9	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	: 	&<&<(@
 
 	
 0L/ 
 

 
 	MD1111% 	2 	2IH)HHHH%0I	***-&-  +I6 F y&1111 0
 		2 		2H 2X=H%0I	***h$$ "" " " " " " " " " " " " " " "y&1111 -0P P02CDDD.
 
 
 	
 6W66777sl   5'D((D,/D,'FF	FI((I,	/I,	 'L33L7:L7'NNNQ55Q9	<Q9	r   c                    t           j        | z  }t          |||	|
          \  }}}|||||t          |         t          |         t          |         t          |         ||t          |         t          |                                          t          |                                          d}t          t           j        dz            5 }t          j
        |                                          }d d d            n# 1 swxY w Y   t          t           j        dz            5 }t          j
        |                                          }d d d            n# 1 swxY w Y    |j        di |}g }|dz  }|                    |            |j        di |}t          ||           dD ]w}t           j        |z  }||z  }|                    |           t          |d          5 }|                                }d d d            n# 1 swxY w Y   t          ||           x|dz  }t          ||           t          | |          S )	N)r   r   r   r   r   r   r   r   r   r   r   r   r   r   z#batch_decode_customize_config.jinjazbatch_decode_kernel_inst.jinjazbatch_decode_kernel.cu)zbatch_decode.cuzbatch_decode_jit_binding.curE   zbatch_decode_config.incr   )rJ   rK   r   r   r   rx   ry   rN   rO   rP   rQ   rR   rS   rT   r   r   )rU   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rV   r   r   r   r   rW   rX   r   r   rZ   r]   r^   r[   r\   rY   s                                  r)   r   r     s(   $ 2S8M"#$#$		
 	
 O35M #9"8$<$$W%h'W%F#""67HI!"455;;=="#677==?? F" 
g),QQ	R	R 1VWqvvxx001 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
g),LL	M	M 6QR"OAFFHH556 6 6 6 6 6 6 6 6 6 6 6 6 6 6 ,+  
  L 88I	"""%%  
 F y&))) 	. 	. .9!H,	I&&&(C   	AVVXXF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	9f----),EE,.?@@@\***s6   'C;;C?C?"'EEE#HH	H	c                    ||t           |         t           |         t           |         t           |         ||t          |         t          |                                          t          |                                          t          |                                          d}| dk    rt	          d          | dk    rt
          j        |z  }t          ||	|
|          \  }}}t          t
          j	        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          t
          j	        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          t
          j	        dz            5 }t          j        |                                          }d d d            n# 1 swxY w Y   ||||dz  } |j        d"i |}t          j        |d	
           g }dD ]}|d| dz  }|                    |            |j        d"dt"          |         i|} t%          ||            |d| dz  }|                    |            |j        d"dt"          |         i|} t%          ||            dD ]w}!t
          j	        |!z  }"||!z  }|                    |           t          |"d          5 }|                                } d d d            n# 1 swxY w Y   t%          ||            x|dz  }#t%          |#|           t'          ||          S | dk    rt
          j        |z  }t          ||	|
|d	          \  }}}d}$|rd}%d}&d}'nd}%d}&d}'t          t
          j	        |$z            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          t
          j	        |%z            5 }t          j        |                                          }d d d            n# 1 swxY w Y   t          t
          j	        |&z            5 }t          j        |                                          }d d d            n# 1 swxY w Y   ||||dz  } |j        d"i |}g }dD ]}d| d}!||!z  }|                    |            |j        d"dt"          |         i|} t%          ||            d| d}!||!z  }|                    |            |j        d"dt"          |         i|} t%          ||            |'dfD ]w}!t
          j	        |!z  }"||!z  }|                    |           t          |"d          5 }|                                } d d d            n# 1 swxY w Y   t%          ||            x|dz  }#t%          |#|           t'          ||t(                     S t	          d!|            )#N)r   r   r   r   r   r   r   r   r   r   r   r}   r@   rA   rD   z$batch_prefill_customize_config.jinjaz%batch_prefill_paged_kernel_inst.jinjaz&batch_prefill_ragged_kernel_inst.jinjar   r   r   TrB   r    batch_prefill_paged_kernel_mask_r	  r
  !batch_prefill_ragged_kernel_mask_)zbatch_prefill.cuzbatch_prefill_jit_binding.curE   zbatch_prefill_config.incr;   r  z)batch_prefill_sm90_customize_config.jinjaz.batch_prefill_fp8_paged_sm90_kernel_inst.jinjaz/batch_prefill_fp8_ragged_sm90_kernel_inst.jinjazbatch_prefill_fp8_sm90.cuz*batch_prefill_paged_sm90_kernel_inst.jinjaz+batch_prefill_ragged_sm90_kernel_inst.jinjazbatch_prefill_sm90.cu%batch_prefill_paged_sm90_kernel_mask_&batch_prefill_ragged_sm90_kernel_mask_z!batch_prefill_sm90_jit_binding.cuzbatch_prefill_sm90_config.incrG   r  r   r  )(r2   rU   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r}   r   r   rV   r   r   r   rW   rX   paged_kernel_inst_templragged_kernel_inst_templr   rZ   r
  r]   r^   r[   r\   rY   r  _file_paged_kernel_inst_file_ragged_kernel_instr  s(                                           r)   r   r     sS	   , %$W%h'W%F#""67HI!"455;;=="#677==??!$%:!;!;!A!A!C!C F &OPPP	E		6<&'('(	  	S	!79Q '*PP
 
 	5!?1668844L	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5
 '*QQ
 
 	@&,oaffhh&?&?#	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@
 '*RR
 
 	A'-qvvxx'@'@$	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A
 	&<&<(@
 
 	
 0L/ 
 

 
 	MD1111% 	2 	2I Q9 Q Q QQ  	***3,3  +I6 F y&111  RI R R RR  	***4-4  +I6 F y&1111
 		2 		2H 2X=H%0I	***h$$ "" " " " " " " " " " " " " " "y&1111 -0J J02CDDDC...	E		6<&'('(!%   	S	!79Q C 	1&V#'X$4JJ&R#'T$0J'-<== 	5!?1668844L	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 '-0GGHH 	@A&,oaffhh&?&?#	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ '-0HHII 	AQ'-qvvxx'@'@$	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	A 	&<&<(@
 
 	

 0L/99&99% 	2 	2IMyMMMH%0I	***3,3  +I6 F y&111N	NNNH%0I	***4-4  +I6 F y&1111 /
 		2 		2H 2X=H%0I	***h$$ "" " " " " " " " " " " " " " "y&1111 -0O O02CDDD.
 
 
 	
 6W66777s   'D44D8;D8'FFF5'G((G,/G,3LL	L	0'O##O'*O'
'P==QQ$'RRRV33V7	:V7	c	                     dS )Nfmha_cutlass_sm100ar   r0   s	            r)   get_fmha_cutlass_sm100a_urir     s
     ! r+   c	                     t          | ||||||||	  	        }	t          j        dz  t          j        dz  t          j        dz  g}
t          j        g d          }t          |	|
|          S )Nzfmha_cutlass_sm100.cuzfmha_cutlass_sm100_binding.cuzblackwell_fmha_plan.cu)
         supported_major_versionsrG   )r   rJ   rO   r
   get_nvcc_flags_listr   )r   r   r   r,   r   r   r   r   r   rU   rZ   
nvcc_flagss               r)   gen_fmha_cutlass_sm100a_moduler)    s     &
 
C 	#&==#&EE#&>>L -@!-  J $   r+   c            	         ddl m} m} | j         d}d}| j         d}t	          ||j                  }|sJ d|             t          |          }t	          | d| d|          }|sJ | d	            t          d
t          j        dz  t          j        dz  gt          j	        |z  gd| j         dd| dg          S )Nr   )ArtifactPathCheckSumHashz/includeflashInferMetaInfoz/checksums.txtz!Failed to get checksums.txt from /z.hz.h not foundfmha_genztrtllm_fmha_kernel_launcher.cuzfmhaReduction.cuz-DTLLM_GEN_FMHA_CUBIN_PATH=\"\"z -DTLLM_GEN_FMHA_METAINFO_HASH=\")extra_include_pathsrH   )
	artifactsr+  r,  TRTLLM_GEN_FMHAr   r   r   rJ   rO   FLASHINFER_CUBIN_DIR)r+  r,  include_pathheader_namechecksum_pathchecksum	meta_hashmetainfos           r)   gen_trtllm_gen_fmha_moduler;    s-   77777777"2<<<L&K $3CCCM(DEEHHHHHHHH8h''I))+))) H 11111118'*JJ'*<<	

 %9LHIN\-INNN>	>>>
   r+   c                    ||t           |         t           |         t           |         t           |         ||t          |         t          |                                          d
}t          j        | z  }t          |||	|
          \  }}}t          t          j        dz            5 }t          j
        |                                          }d d d            n# 1 swxY w Y   t          t          j        dz            5 }t          j
        |                                          }d d d            n# 1 swxY w Y   ||||dz  } |j        di |}t          j        |d           g }dD ]J}|d| d	z  }|                    |            |j        dd
t           |         i|}t#          ||           KdD ]w}t          j        |z  }||z  }|                    |           t          |d          5 }|                                }d d d            n# 1 swxY w Y   t#          ||           x|dz  }t#          ||           t%          | ||rdgng           S )N)
r   r   r   r   r   r   r   r   r   r   z&batch_attention_customize_config.jinjaz'batch_attention_paged_kernel_inst.jinjar  TrB   r   "batch_attention_paged_kernel_mask_r	  r
  )zbatch_attention.cuzbatch_attention_jit_binding.curE   zbatch_attention_config.incrF   rG   r   )r   r   rx   ry   rJ   rK   r   rN   rO   rP   rQ   rR   rS   rL   rM   rT   r   r   r   ) rU   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r5   r   rV   r   r   r   rW   rX   r  r   rZ   r
  r]   r^   r[   r\   rY   s                                    r)   r  r    s   & %$W%h'W%F#""67HI"#677==?? F 2S8M"#$#$		
 	
 O35M 
#&NN
 
 1	
qvvxx001 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 
#&OO
 
 <	
"(/!&&((";";< < < < < < < < < < < < < < <
 "8"8$<  F ,+  
  K----L! . .	!$W$W$W$WW	I&&&/(/ 
 
'	2

 
 	9f---- 	. 	. .9!H,	I&&&(C   	AVVXXF	 	 	 	 	 	 	 	 	 	 	 	 	 	 	9f----),HH,.?@@@>JR9::PR   s6   %'CCC?'D22D69D65HH	H	c                  b    ddl m}  t          dt          j        dz  gd| j         dg          S )Nr   r+  fmha_cudnn_genzcudnn_sdpa_kernel_launcher.cuz-DCUDNN_SDPA_CUBIN_PATH=\"r0  rG   )r2  r+  r   rJ   rO   
CUDNN_SDPAr?  s    r)   gen_cudnn_fmha_modulerB  b  sS    ))))))		$'F	FGF,*AFFF
   r+   c                  F    t                                                      } | S )N)gen_trtllm_fmha_v2_modulebuild_and_load)modules    r)   get_trtllm_fmha_v2_modulerG  n  s    &((7799FMr+   c                     d} t           j        | z  }|                    dd           t           j        dz  }t	          ||           g d}d |D             }t           j        dz  }||gz   }t          j        dg	          }|                    d
t           j        dz              |                    d           t          | ||          S )Ntrtllm_fmha_v2T)parentsrC   fmha_v2)z<fmha_v2_flash_attention_bf16_64_128_S_q_k_v_192x128_sm120.cuzLfmha_v2_flash_attention_e4m3_fp32_64_64_S_q_k_v_192x128_output_bf16_sm120.cuz@fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_k_v_192x128_sm120.cuc                 8    g | ]}t           j        d z  dz  |z  S )rI  	generated)rJ   FLASHINFER_JIT_DIR).0kernels     r)   
<listcomp>z-gen_trtllm_fmha_v2_module.<locals>.<listcomp>  s;        	"%55CfL  r+   ztrtllm_fmha_v2_binding.cur$  r%  z-Iz-Wno-deprecated-gpu-targetsrG   )	rJ   rN  mkdirrO   r   r
   r'  rT   r   )rU   
cached_opsfmha_v2_src_dirkernelskernel_pathsbinding_source_pathrZ   r(  s           r)   rD  rD  s  s   
C+c1JTD1111I=O oz222  G   L "58SS#6"77L,@"$  J D76BDDEEE3444$   r+   )r   FFr   FFF)r   FF)r   FFFF)?__doc__rL   typingr   rP   rr   r=   r   rJ   corer   r   r   r	   r
   jit.cubin_loaderr   r   utilsr   r   r   r   r   r   fmha_v2.generate_kernelsr   dtypeintboolrx   r*   r1   r>   r_   re   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   r)  r;  r  rB  rG  rD  r   r+   r)   <module>ra     s     
			                            9 8 8 8 8 8 8 8              . - - - - - 7 7 7 7 7 7[k [ 	
     	   ,[k [ {	
      	   0..[. k. [	.
 {. . . . 	. . . .*]][] k] [	]
 {] ] ] ] ] ] ] ]@[k [ {	
    
 	   ,U+[U+kU+ [U+ {	U+
 U+ U+ U+ U+ U+ U+ U+ U+ U+p[ k [	
        	   2[k [ 	
       {     	   <[ k [	
 {        	   6[ k [	
 {     	   ,[k [ {	
      	   0)[)k) [) 	)
 ) ) ) ) ) ) ) )XWW[W kW [	W
 W W W W W  W W W W Wt>[>k> [> 	>
 > >  >  > {> > >  > > > > >B>[>k> [> 	>
 > >  >  > {> > >  > > > > >^  !!&"' !&"'"')a+ a+	a+[a+ ka+ [	a+
 {a+ a+ "#Ya+ #3ia+ "#Ya+ #3ia+ a+ a+ a+ a+ a+   !a+" #a+$ %a+&  'a+(  )a+* +a+ a+ a+ a+d  !!&"' !&"'"')a+ a+	a+[a+ ka+ [	a+
 {a+ a+ "#Ya+ #3ia+ "#Ya+ #3ia+ a+ a+ a+ a+ a+   !a+" #a+$ %a+&  'a+(  )a+* +a+ a+ a+ a+H,[,k, [, {	,
 , , , , , , , , ,^zz[z kz [	z
 {z z z z z z  z z z z zz--[- k- [	-
 {- - - - - - - - -`/[/k/ [/ {	/
 / / / / / / / /~ $ %S+ S+	S+[S+ kS+ [	S+
 S+ S+ "#YS+ #3iS+ "#YS+ #3iS+ S+ S+ S+ S+ S+  !S+ S+ S+ S+H $ %"'%a8 a8a8	a8 [a8 k	a8
 [a8 a8 a8 "#Ya8 #3ia8 "#Ya8 #3ia8 a8 a8 a8 a8  !a8"  #a8$ %a8& 'a8 a8 a8 a8d $ %!M+ M+	M+[M+ kM+ [	M+
 KM+ M+ M+ "#YM+ #3iM+ "#YM+ #3iM+ M+ M+ M+ M+  !M+" #M+ M+ M+ M+~ $ %"''{8 {8{8	{8 [{8 k	{8
 [{8 K{8 {8 {8 "#Y{8 #3i{8 "#Y{8 #3i{8 {8 {8 {8  !{8" #{8$  %{8& '{8( ){8 {8 {8 {8|![!k! [! {	!
 ! ! ! ! ! 	! ! ! !4$[$k$ [$ {	$
 $ $ $ $ $ $ $ $ $N! ! !d  %!X X	X[X kX [	X
 KX X X "#YX #3iX "#YX #3iX X X X X  !X X X Xv	 	 	  
!7 ! ! ! ! ! !r+   