
    )`ib6             0          d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZmZmZ ddlZddlmZ ddlmZmZmZmZmZmZmZmZmZ ddlmZ dd	lmZ dd
l m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z< d Z=ej>        	 ddej?        dej?        dej?        dej?        de@de@de@deAdeAdejB        deAfd            ZCd ZDeD	 	 	 	 	 ddeEdeEdej?        dej?        dej?        dej?        de@de@de	eE         de	eE         d e	eE         d!e	eE         d"eEd#eEde@deAdeAdeAd$eAf&d%            ZFej>        d&             ZGej>        d'             ZHej>        d(             ZIej>        d)eEd*efd+            ZJed,e&jK        jL        d-dd.d*ed/ejM        d0ejM        d1ejM        d2eEd3e@d4e@d5eAd6eejM        eejM        ejM        f         f         fd7            ZNe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd/ejM        d0ejM        d1ejM        d:eejM                 d;eejM                 d<eejM                 d=eej?                 d>eejM                 d?eejM                 d@eAd2eEdeEdeAdAeeO         d4e@dBeeO         dCeeO         dDeeO         deEd5e
d         d6ejM        f*dE            ZPe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd/ejM        d0ejM        d1ejM        d:eejM                 d;eejM                 d<eejM                 d=eej?                 d>eejM                 d?eejM                 d@eAd2eEdeEdeAdAeeO         d4e@dBeeO         dCeeO         dDeeO         deEd5e
dF         d6eejM        ejM        f         f*dG            ZPe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd/ejM        d0ejM        d1ejM        d:eejM                 d;eejM                 d<eejM                 d=eej?                 d>eejM                 d?eejM                 d@eAd2eEdeEdeAdAeeO         d4e@dBeeO         dCeeO         dDeeO         deEd5eAd6eejM        eejM        ejM        f         f         f*dH            ZP ejQ        ePdFI          ZRdJejM        dKejM        dLejM        dMe@d6ejM        f
dNZS G dO dP          ZTdJejM        dQejM        d6ejM        fdRZU G dS dT          ZVdUejM        dVejM        dWe@d@eAfdXZWe	 	 	 	 	 	 	 	 	 	 	 dd/ejM        d0ejM        d1ejM        dUejM        dVejM        dYee	ejM                          dZee@         d[eejM                 d\eejM                 d@eAdAeeO         d]eeO         d^eeO         d_eeO         d`eeO         d5e
d         d6ejM        f"da            ZXe	 	 	 	 	 	 	 	 	 	 	 dd/ejM        d0ejM        d1ejM        dUejM        dVejM        dYee	ejM                          dZee@         d[eejM                 d\eejM                 d@eAdAeeO         d]eeO         d^eeO         d_eeO         d`eeO         d5e
dF         d6eejM        ejM        f         f"db            ZX	 	 	 	 	 	 	 	 	 	 	 dd/ejM        d0ejM        d1ejM        dUejM        dVejM        dYee	ejM                          dZee@         d[eejM                 d\eejM                 d@eAdAeeO         d]eeO         d^eeO         d_eeO         d`eeO         d5eAd6eejM        eejM        ejM        f         f         f"dcZXej>        dd             ZYe	 	 	 ddeejM        dfejM        dgejM        dhejM        diejM        dje@dke@dleeOejM        f         dmeeOejM        f         dneOdoe@d4e@dpejM        dqejM        dreAdseAd5eAdteejM                 d[eejM                 d\eejM                 d6eejM        eejM        ejM        f         f         f*du            ZZe	 	 	 	 	 	 	 	 ddeejM        dweejM        eejM        ejM        f         f         dhejM        dxejM        diejM        dje@dke@dleeOejM        f         dmeeOejM        f         doe@dpejM        dqejM        d4e@d[eeejM        e%f                  dyeeej?        eEf                  dneeO         dzee@         d2eEdreeA         d{ee	ejM                          d6eejM        e%f         f*d|            Z[e	 	 	 	 ddeejM        dfejM        dgejM        d[ejM        d}e@d~e@de@deOdeeO         deeO         d5eAd\eejM                 d6eejM        eejM        ejM        f         f         fd            Z\dS )a3  
Copyright (c) 2023 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)SimpleNamespace)AnyDictListLiteralOptionalTupleUnionoverload   )flashinfer_api)	gen_batch_prefill_module"gen_customize_batch_prefill_modulegen_fmha_cutlass_sm100a_modulegen_single_prefill_moduleget_batch_prefill_uriget_single_prefill_urisetup_cubin_loadergen_trtllm_gen_fmha_moduleget_trtllm_fmha_v2_module)!cudnn_batch_prefill_with_kv_cache)get_seq_lens)packbitssegment_packbits)log2e	FP4TensorMaskModePosEncodingModeTensorLayout_check_cached_qkv_data_type_check_kv_layout_check_pos_encoding_modecheck_shape_dtype_device_get_cache_alibi_slopes_buf_get_cache_buf_unpack_paged_kv_cachecanonicalize_torch_dtypedetermine_attention_backenddevice_support_pdlget_device_sm_count	is_float8is_sm100a_supportedis_sm110a_supportedis_sm120a_supportedis_sm121a_supportedregister_custom_opregister_fake_opceil_divround_upc                 h    | dS t          | t          j                  r| dfS dt          |           fS )aZ  Split scale parameter into tensor and scalar components.

    Args:
        scale: Can be a torch.Tensor (per-head), a scalar float, or None.

    Returns:
        tuple: (tensor_ptr, scalar_val) where:
            - If scale is tensor: (scale, 1.0)
            - If scale is scalar: (None, scale)
            - If scale is None: (None, 1.0)
    N)N      ?r5   )
isinstancetorchTensorfloat)scales    f/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/prefill.py_split_scale_paramr<   E   s=     }y	E5<	(	( "czU5\\!!    Fdtype_qdtype_kvdtype_o	dtype_idxhead_dim_qkhead_dim_vopos_encoding_modeuse_sliding_windowuse_logits_soft_capdeviceuse_fp16_qk_reductionc                     t          |	          s-t          |	          st          |	          st          |	          r)t	          | ||||||||	  	                                        S t          d          )Nz&SM100A is not supported on this device)r,   r-   r.   r/   r   build_and_load
ValueError)r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   s              r;   get_fmha_modulerL   Y   s     	F##Cv&&C v&&C v&&	C .

 

 .


	 ABBBr=   c                 v     t           j         fd            t          j                   fd            }|S )z
    Decorator that converts unhashable arguments (like lists) to hashable ones (tuples)
    before applying functools.cache.
    c                       | i |S N )argskwargsfuncs     r;   cached_wrapperz+make_hashable_cache.<locals>.cached_wrapper   s    tT$V$$$r=   c                  N   g }| D ]O}t          |t                    r#|                    t          |                     :|                    |           Pi }|                                D ]2\  }}t          |t                    rt          |          ||<   -|||<   3 |i |S rO   )r6   listappendtupleitems)rQ   rR   hashable_argsarghashable_kwargskeyvaluerT   s          r;   wrapperz$make_hashable_cache.<locals>.wrapper   s      	* 	*C#t$$ *$$U3ZZ0000$$S)))) ,,.. 	- 	-JC%&& -',U||$$',$$~}@@@@r=   )	functoolscachewraps)rS   r_   rT   s   ` @r;   make_hashable_cacherc   |   sg     _% % % % _% _TA A A A A$ Nr=   backenduriidtypeadditional_tensor_namesadditional_tensor_dtypesadditional_scalar_namesadditional_scalar_dtypesvariant_namevariant_declfp8_enabledc                 h    t          | |||||||||	|
||||||||                                          S rO   )r   rJ   )rd   re   r>   r?   r@   rf   rB   rC   rg   rh   ri   rj   rk   rl   rD   rE   rF   rH   rm   s                      r;   "get_customize_batch_prefill_modulero      s\    , .  ' ( n)r=   c            '         t                      } |                                 t          |                                            	 	 	 ddt          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          d	t          d
t          t          t          j        f         dt          t          t          j        f         dt          dt          j        dt          j        dt          dt          dt          dt          t          j                 dt          t          j                 dt          j        f&fd}d }d }t          |||          S )Nqueryk_cachev_cacheworkspace_bufferblock_tablesseq_lens	max_q_len
max_kv_len
bmm1_scale
bmm2_scale
batch_sizecum_seq_lens_qcum_seq_lens_kv
enable_pdlworkspace_sizewindow_leftoutsinksreturnc                    t          | j                  }|t          j        |           }t	          |t          j                  r!|j        t          j        k    sJ |t          z  }t	          |	t          j                  r|	j        t          j        k    sJ 	                    |d | |||||||||	ddd|
|||||||           |S )Nrq   r   )
r*   rG   r7   
empty_liker6   r8   dtypefloat32r   trtllm_paged_attention_context)rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   sm_countops                      r;   
_paged_runz1get_trtllm_gen_prefill_module.<locals>._paged_run   s    ( 'u|44;"5))Cj%,// 	,#u}4444#e+Jj%,// 	5#u}4444
))/	
 	
 	
2 
r=   c                       t          d          )Nz>Variable length is not implemented for trtllm-gen backend yet.)NotImplementedErrorrQ   rR   s     r;   _ragged_runz2get_trtllm_gen_prefill_module.<locals>._ragged_run  s     "L
 
 	
r=   c                      d S rO   rP   r   s     r;   _planz,get_trtllm_gen_prefill_module.<locals>._plan  s    r=   )	paged_run
ragged_runplan)rq   NN)r   rJ   r   get_library_pathr7   r8   intr
   r9   boolr   r   )modr   r   r   r   s       @r;   get_trtllm_gen_prefill_moduler      s   
$
&
&C					Bs++--...$ &*(,%5 5|55 5  ,	5
 l5 ,5 5 5 %-.5 %-.5 5 5 5 5 5  !5" el##5$ %%5& 
'5 5 5 5 5 5n
 
 
      r=   c           '      d    t           g|R  }t           g|R                                  }|j        t	          d| dd          dt
          j        dt
          j        dt
          j        dt
          j        d	t
          j        d
t          t
          j                 dt          dt          dt          dt          t
          j                 dt          t
          j                 dt          dt          dt          t
          j                 dt          t
          j                 dt          t
          j                 dt          dt          dd f& fd            }t          d| d          dt
          j        dt
          j        dt
          j        dt
          j        d	t
          j        d
t          t
          j                 dt          dt          dt          dt          t
          j                 dt          t
          j                 dt          dt          dt          dt          dd f d            }t          |          S )Nflashinfer::_run)tmpo	maybe_lsemutates_argsqkvr   r   r   	mask_modelayoutr   maybe_packed_custom_maskmaybe_alibi_slopeslogits_soft_capsm_scalescale_qscale_kscale_v
rope_scale
rope_thetar   c                 B   dk    rxt          |          \  }}t          |           s | ||||||||||||           n^t          |          \  }}t          |          \  }} | |||||||||||||||           n | |||||||||	|
||d|z  d|z             |S )Nfa3r5   )r<   r+   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   scale_v_tensorscale_v_scalarscale_q_tensorscale_q_scalarscale_k_tensorscale_k_scalarrd   run_funcs                           r;   run_single_prefillz5get_single_prefill_module.<locals>.run_single_prefill  s4   . e-?-H-H*NNQ<< %"#"   " 2DG1L1L.1CG1L1L.""""""!   & H("j j   " r=   c                     d S rO   rP   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s                  r;   _fake_run_single_prefillz;get_single_prefill_module.<locals>._fake_run_single_prefillr  s	    $ 	r=   )run)r   r   rJ   r   r0   r7   r8   r   r   r9   r1   r   )rd   rQ   re   moduler   r   r   s   `     @r;   get_single_prefill_moduler     s   
 
04
0
0
0C&w6666EEGGFzH  s   /H  N<N<N <N \	N
 <N EL)N N N N #+5<"8N %U\2N N N %,'N %,'N  %,'!N" #N$ %N& 
'N N N N N N N` .S...//<< < \	
 < EL)    #+5<"8 %U\2      
!   0/( 12222r=   c           X         	
  dk    r&d}t                      }|j        }|j        
|j        	nAt	           g|R  }t           g|R                                  }|j        }|j        
|j        	t          d| dd          	 	 	 d<dt          j	        dt          j	        d	t          t                   d
t          j	        dt          j	        dt          j	        dt          j	        dt          j	        dt          j	        dt          t          j	                 dt          dt          dt          dt          dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          dt          dt          dt          dt          d t          t          j	                 d!t          t          j	                 d"t          t          j	                 d#d f: 
fd$            }t          d| d          dt          j	        dt          j	        d	t          t                   d
t          j	        dt          j	        dt          j	        dt          j	        dt          j	        dt          j	        dt          t          j	                 dt          dt          dt          dt          dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          dt          dt          dt          dt          d#d f4d%            }t          d| d&d'          	 	 	 	 	 	 	 	 	 	 	 d=dt          j	        dt          j	        d	t          t                   d
t          j	        d(t          j	        d)t          j	        dt          j	        d*t          j	        d+t          j	        d,t          j	        dt          j	        dt          t          j	                 dt          dt          dt          dt          dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          dt          d t          t          j	                 d!t          t          j	                 d"t          t          j	                 dt          dt          dt          d-t          d.t          t                   d/t          t                   d0t          t          j	                 d1t          t          j	                 d2t          t                   d3t          t                   d4t          t                   d5t          t                   d6t          t          j	                 d7t          t          j	                 d8t          t          j	                 d#d fV 	fd9            }t          d| d&          	 	 	 	 	 	 	 	 	 	 d>dt          j	        dt          j	        d	t          t                   d
t          j	        d(t          j	        d)t          j	        dt          j	        d*t          j	        d+t          j	        d,t          j	        dt          j	        dt          t          j	                 dt          dt          dt          dt          dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          t          j	                 dt          dt          dt          dt          dt          d-t          d.t          t                   d/t          t                   d0t          t          j	                 d1t          t          j	                 d2t          t                   d3t          t                   d4t          t                   d5t          t                   d6t          t          j	                 d7t          t          j	                 d#d fNd:            }t!          |||;          S )?N
trtllm-gentrtllm_gen_contextr   r   float_workspace_bufferint_workspace_bufferr   r   r   r   r   plan_info_vecr   r   r   	qo_indptr	kv_indptrr   r   r   r   r   r   maybe_custom_maskmaybe_mask_indptrr   maybe_prefix_len_ptrmaybe_token_pos_in_items_ptrmaybe_max_item_len_ptrr   r   r   r   token_pos_in_items_lenr   r   r   r   c                    |d u}#dk    r* $| |||||||||	|
|||||||||||d|z  d|z  |           n|rVt          |          \  }}t          |          \  }} t          |          \  }!}" $| |||||||||	|
||||||!||| |"           n2t          |          \  }!}" $| |||||||||	|
|||||||!|||"|           |S )Nfa2r5   )r<   )%r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   is_fp8r   r   r   r   r   r   rd   ragged_run_funcs%                                      r;   r   z,get_batch_prefill_module.<locals>.ragged_run  s   P $eO&$!!"$,&j j &3   6  7	-?-H-H*NN-?-H-H*NN-?-H-H*NNO&$+   4 .@-H-H*NNO&$$,&&-  2 r=   c                     d S rO   rP   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s                            r;   _fake_ragged_runz2get_batch_prefill_module.<locals>._fake_ragged_run  s	    8 	r=   r   r   r   paged_k_cachepaged_v_cacher   r   r   r   paged_kv_indptrpaged_kv_indicespaged_kv_last_page_lenr   num_qo_headsnum_kv_headsrv   kv_lens_buffer	page_sizerx   ry   r|   r}   r~   r   c*                    0dk    rl|J |J | J |!J |"J |#J |%J |&J |'J |(J |J |dk    s
J d             1|                                 ||||!|"|$|%|d|&|'|(||||
|)          }
n̉0dk    r=t          |          rJ  1| |||||||||	|
|||||||||||||d|z  d|z  |           nt          |          \  }*}+t          |          s# 1| |||||||||	|
|||||||||*|||+|           nEt          |          \  },}-t          |          \  }.}/ 1| |||||||||	|
||||||,|.|*||-|/|+           |
S )Nr   r   z%workspace_size must be greater than 0r5   )r   r   r   )
contiguousr+   r<   )2r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rv   r   r   rx   ry   r|   r}   r~   r   r   r   r   r   r   r   rd   paged_run_funcs2                                                   r;   r   z+get_batch_prefill_module.<locals>.paged_run<  s   n l""$$$+++++++++!---((())))))!---"...)))!A%%%'N%%%$%  AA(  ||###N&$ &!!"$,&j j &7   < .@-H-H*NNQ<< 6*(!!!#$*(0*"#"*1   6 2DG1L1L.1CG1L1L.*(!!!#$*""""""/  2 r=   c&                     d S rO   rP   )&r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rv   r   r   rx   ry   r|   r}   r~   s&                                         r;   _fake_paged_runz1get_batch_prefill_module.<locals>._fake_paged_run  s
    R 	r=   r   r   r   NNN)NNNNNNNNNNN)
NNNNNNNNNN)r   r   r   r   r   r   rJ   r0   r7   r8   r   r   r   r   r9   r1   r   )rd   rQ   re   r   	plan_funcr   r   r   r   r   r   s   `        @@r;   get_batch_prefill_moduler     s	   ,".00K	 +)#G3d333)'9D999HHJJK	 +) 's'''
  F +/*.*.9v v %v#lv Cyv <	v
 <v <v <v <v <v EL)v v v v v $EL1v  $EL1!v" %U\2#v$ 'u|4%v& '/u|&<'v( !) 6)v* +v, -v. /v0 1v2 !$3v4 %,'5v6 %,'7v8 %,'9v: 
;v v v v v v vp 5S55566 %#l Cy <	
 < < < < < EL)     $EL1  $EL1!" %U\2#$ 'u|4%& '/u|&<'( !) 6)* +, -. /0 12 !$34 
5   76> &s&&&

 
 
V '+&*/315#'#'$($(1526(,Ue e %e#le Cye <	e
 |e |e <e e  ,e !&e <e EL)e e e e  !e" $EL1#e$ $EL1%e& %U\2'e( 'u|4)e* '/u|&<+e, !) 6-e. /e0 1e2 %,'3e4 %,'5e6 %,'7e8 9e: ;e< !$=e> ?e@ smAeB smCeD u|,EeF !.GeH C=IeJ C=KeL SMMeN SMOeP !.QeR "%,/SeT %UeV 
We e e e e e
 
eN 4S44455< '+&*/315#'#'$($(1526M( ( %(#l( Cy( <	(
 |( |( <( (  ,( !&( <( EL)( ( ( (  !(" $EL1#($ $EL1%(& %U\2'(( 'u|4)(* '/u|&<+(, !) 6-(. /(0 1(2 3(4 5(6 !$7(8 9(: sm;(< sm=(> u|,?(@ !.A(B C=C(D C=E(F SMG(H SMI(J !.K(L "%,/M(N 
O( ( ( 65(\    r=   module_name
jit_modulec           !         |j         }|j        |j        t          d|  dd          dt          j        dt          j        dt          t                   dt          j        d	t          j        d
t          j        dt          j        dt          j        dt          j        dt          t          j                 dt          dt          dt          dd ffd            }t          d|  d          dt          j        dt          j        dt          t                   dt          j        d	t          j        d
t          j        dt          j        dt          j        dt          j        dt          t          j                 dt          dt          dt          dd fd            }t          d|  dd          dt          j        dt          j        dt          t                   dt          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          t          j                 dt          dt          dt          dd f fd            }t          d|  d          dt          j        dt          j        dt          t                   dt          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          j        dt          t          j                 dt          dt          dt          dd f d            }t          |||          S )Nr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   c                 2     | |||||||||	|
||g|R   d S rO   rP   )r   r   r   r   r   r   r   r   r   r   r   r   r   rQ   r   s                 r;   r   z0get_batch_prefill_jit_module.<locals>.ragged_run+  sY    2 	" 	
 	
 	
 	
 	
 	
 	
r=   c                     d S rO   rP   )r   r   r   r   r   r   r   r   r   r   r   r   r   rQ   s                 r;   r   z6get_batch_prefill_jit_module.<locals>._fake_ragged_runU  s	    " 	r=   r   r   r   r   r   r   r   c                 6     | |||||||||	|
||||g|R   d S rO   rP   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rQ   r   s                   r;   r   z/get_batch_prefill_jit_module.<locals>.paged_runi  s_    : 	" "	
  !	
 	
 	
 	
 	
 	
r=   c                     d S rO   rP   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rQ   s                   r;   r   z5get_batch_prefill_jit_module.<locals>._fake_paged_run  s	    & 	r=   r   )r   r   r   r0   r7   r8   r   r   r   r1   r   )	r   r   r   r   r   r   r   r   r   s	          @@r;   get_batch_prefill_jit_moduler   $  s   I +O)N /{///
  
 %
#l
 Cy
 <	

 <
 <
 <
 <
 <
 EL)
 
 
 
 

 
 
 
 
 
B =[===>> %#l Cy <	
 < < < < < EL)    
   ?>& .{...

 
 
#
 %#
#l#
 Cy#
 <	#

 |#
 |#
 <#
 #
  ,#
 !&#
 <#
 EL)#
 #
 #
 #
" 
##
 #
 #
 #
 #

 
#
J <[<<<== %#l Cy <	
 | | <   , !& < EL)   " 
#   >=0    r=   NHDrq   )	kv_layoutr   r   
return_lser   r   r   r   r   r   r   r   c                   |j         }	t          dd|	          }
t          j        |j        d d         |j        dd          z   |j        |	          }d }|rIt          j        |                    d          |                    d          ft          j        |	          } | j        ||||
|||t          |         j
        |g	|R   |r||fn|S )N single_prefill_with_kv_cache_tmp   )rG   rq   r   rG   r   r   )rG   r%   r7   emptyshaper   sizer   r   r   r^   )r   r   r   r   r   r   r   r   rQ   rG   r   r   lses                r;   ,single_prefill_with_kv_cache_with_jit_moduler     s     XF
*,<V  C 	AGCRCL17233</qwvNNNA
C Vk166!99affQii0fUUUJN				Y% 
    "(As88q(r=   NONEautor   r   r   o_dtypecustom_maskpacked_custom_maskcausalr   r   r   r   c                     d S rO   rP   r   r   r   r   r   r   r   r   r   r  r   rD   rH   r   r   r   r   r   rd   r   s                       r;   single_prefill_with_kv_cacher    s	    , 3r=   Tc                     d S rO   rP   r  s                       r;   r  r    s	    , ),r=   c                    t          |           t          |
           t          dd| j                  }|d}|*dt	          j        |                     d                    z  }|d}|d}|8|6t          |                                	                    d          d	          }|t          j        j        }n%|	rt          j        j        }nt          j        j        }d}|rNt          j        |                     d
          |                     d          ft          j        | j                  }t%          |           r|dk    sJ | j        |j        cxk    r|j        k    sn J | j        d         |j        d         cxk    r|j        d         k    sn J |1t          j        | j        d         t          j        | j                  }|1t          j        |j        d         t          j        | j                  }|1t          j        |j        d         t          j        | j                  }|dk    r5t-          | j        t.          |         j        ||du| j        |j                  }|| j        }t          j        | j        dd         |j        dd         z   || j                  }t1          || j        |j        |j        | j        d         |j        d         t.          |         j        |d
k    |d
k    |
  
        }|                    | ||||||t4          |
         j        ||t7          | j        d         | j                  |||||||           |r||fn|S )a  Prefill/Append attention with KV cache for single request, return the attention
    output.

    Parameters
    ----------
    q : torch.Tensor
        The query tensor, shape: ``[qo_len, num_qo_heads, head_dim_qk]``.
    k : torch.Tensor
        The key tensor, shape: ``[kv_len, num_kv_heads, head_dim_qk]`` if :attr:`kv_layout`
        is ``NHD``, or ``[num_kv_heads, kv_len, head_dim_qk]`` if :attr:`kv_layout` is
        ``HND``.
    v : torch.Tensor
        The key tensor, shape: ``[kv_len, num_kv_heads, head_dim_vo]`` if :attr:`kv_layout`
        is ``NHD``, ``[num_kv_heads, kv_len, head_dim_vo]`` if :attr:`kv_layout` is
        ``HND``.
    scale_q : Optional[torch.Tensor]
        The scale tensor for query, per-head quantization with shape: ``[num_qo_heads]``.
        Used with FP8 Quantization. If not provided, will be set to ``1.0``.
    scale_k : Optional[torch.Tensor]
        The scale tensor for key, per-head quantization with shape: ``[num_kv_heads]``.
        Used with FP8 Quantization. If not provided, will be set to ``1.0``.
    scale_v : Optional[torch.Tensor]
        The scale tensor for value, per-head quantization with shape: ``[num_kv_heads]``.
        Used with FP8 Quantization. If not provided, will be set to ``1.0``.
    o_dtype : Optional[torch.dtype]
        The output tensor data type, if not provided, will be set to the same as the q.
        This is necessary as output dtype cannot be automatically inferred in quant.
    custom_mask : Optional[torch.Tensor]
        The custom boolean mask tensor, shape: ``[qo_len, kv_len]``.
        The elements in the mask tensor should be either ``True`` or ``False``,
        where ``False`` means the corresponding element in the attention matrix will be
        masked out.

        When :attr:`custom_mask` is provided, and :attr:`packed_custom_mask` is not, the
        function will pack the custom mask tensor into a 1D packed mask tensor, which introduces
        additional overhead.
    packed_custom_mask : Optional[torch.Tensor]
        The 1D packed uint8 mask tensor, if provided, the :attr:`custom_mask` will be ignored.
        The packed mask tensor is generated by :func:`flashinfer.quantization.packbits`.
    causal : bool
        Whether to apply causal mask to the attention matrix.
        This is only effective when :attr:`custom_mask` is not provided.
    kv_layout : str
        The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
    pos_encoding_mode : str
        The position encoding applied inside attention kernels, could be
        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
        Default is ``NONE``.
    use_fp16_qk_reduction : bool
        Whether to use f16 for qk reduction (faster at the cost of slight precision
        loss).
    window_left : int
        The left (inclusive) window size for the attention window, when set to ``-1``, the window
        size will be set to the full length of the sequence. Defaults to ``-1``.
    logits_soft_cap : Optional[float]
        The attention logits soft capping value (used in Gemini, Grok and Gemma-2, etc.), if not
        provided, will be set to ``0``. If greater than 0, the logits will be capped according to
        formula:
        :math:`\texttt{logits_soft_cap} \times \mathrm{tanh}(x / \texttt{logits_soft_cap})`,
        where :math:`x` is the input logits.
    sm_scale : Optional[float]
        The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim_qk)``.
    rope_scale : Optional[float]
        The scale used in RoPE interpolation, if not provided, will be set to 1.0.
    rope_theta : Optional[float]
        The theta used in RoPE, if not provided, will be set to 1e4.
    backend : str
        The implementation backend, could be ``auto``/``fa2`` or ``fa3``. Defaults to ``auto``.
        If set to ``auto``, the function will automatically choose the backend based on the
        device architecture and kernel availability.
    return_lse : bool
        Whether to return the log sum exp value of the attention logits.

    Returns
    -------
    Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        If :attr:`return_lse` is ``False``, the attention output, shape: ``[qo_len, num_qo_heads, head_dim_vo]``.
        If :attr:`return_lse` is ``True``, a tuple of two tensors:

        * The attention output, shape: ``[qo_len, num_qo_heads, head_dim_vo]``.
        * The log sum exp value, shape: ``[qo_len, num_qo_heads]``.

    Examples
    --------

    >>> import torch
    >>> import flashinfer
    >>> qo_len = 128
    >>> kv_len = 4096
    >>> num_qo_heads = 32
    >>> num_kv_heads = 4
    >>> head_dim = 128
    >>> q = torch.randn(qo_len, num_qo_heads, head_dim).half().to("cuda:0")
    >>> k = torch.randn(kv_len, num_kv_heads, head_dim).half().to("cuda:0")
    >>> v = torch.randn(kv_len, num_kv_heads, head_dim).half().to("cuda:0")
    >>> o = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True,
            use_fp16_qk_reduction=True)
    >>> o.shape
    torch.Size([128, 32, 128])
    >>> mask = torch.tril(
    >>>     torch.full((qo_len, kv_len), True, device="cuda:0"),
    >>>     diagonal=(kv_len - qo_len),
    >>> )
    >>> mask
    tensor([[ True,  True,  True,  ..., False, False, False],
            [ True,  True,  True,  ..., False, False, False],
            [ True,  True,  True,  ..., False, False, False],
            ...,
            [ True,  True,  True,  ...,  True, False, False],
            [ True,  True,  True,  ...,  True,  True, False],
            [ True,  True,  True,  ...,  True,  True,  True]], device='cuda:0')
    >>> o_custom = flashinfer.single_prefill_with_kv_cache(q, k, v, custom_mask=mask)
    >>> torch.allclose(o, o_custom, rtol=1e-3, atol=1e-3)
    True

    Note
    ----
    The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads`` is
    not equal to ``num_kv_heads``, the function will use
    `grouped query attention <https://arxiv.org/abs/2305.13245>`_.
    r   r   N        r5   rq        @littlebitorderr   r   r   r   )r"   r!   r%   rG   mathsqrtr   r   r   viewr   CUSTOMr^   CAUSAL
NON_CAUSALr7   r   r   r+   r   r   onesr(   r   r   r   r   r$   )r   r   r   r   r   r   r   r   r   r  r   rD   rH   r   r   r   r   r   rd   r   r   r   r   r   r   s                            r;   r  r    sq   ` .///Y
;=Mqx
X
XC166"::...

#5#=%""$$))"--
 
 
 %O)		 	2 -II +1I
C Xk166!99affQii0ahWWW|| S b    w!',,,,QW,,,,,,wr{agbk8888QWR[888888?j5=RRRG?j5=RRRG?j5=RRRG&-H-.4!d*GG
 
 '
+agcrclQWRSS\1
R
R
RC&					)*0q! F JJ			Y%#AGAJ99%  * $,C::,r=   r   r   r   r   r   c                 .   t          |           t          |          k    rt          d          t          j        |           }d|d<   t          j        | dd          | d d         z
  |dd          |d d         z
  dz
  |z  |z   z  d          |dd <   |S )Nz?The length of qo_indptr and paged_kv_indptr should be the same.r   r   rq   lenrK   r7   r   cumsum)r   r   r   r   mask_indptrs        r;   _compute_page_mask_indptrr    s     9~~_----M
 
 	
 "9--KKNl	1223B3	'QRR ?3B3#77!;yH$%	

 	
 KO r=   c            H       	   e Zd ZdZe	 	 	 	 	 	 	 	 	 	 	 dRdej        deded	e	ej                 d
e	ej                 de	ej                 de	ej                 de	ej                 de	ej                 dede	e
e                  de	eeef                  ddfd            Zedefd            Zdej        dej        ddfdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dSdej        dej        dej        dej        d ed!ed"ed#ed$e	e         d%e	ej                 d&e	ej                 d'ed(ed)ed*e	e         d+ed,e	e         d-e	e         d.e	e         d/eeej        f         d0e	eeej        f                  d1e	eeej        f                  d2ed3e	ej                 d4e	ej                 d5ed6e	ej                 d7e	ej                 d8e	ej                 d9e	ej                 d:e	e         d;e	e         d<e	e         d=eddfFd>            ZeZ	 	 	 	 	 	 	 	 	 	 dTd?ej        d@eej        eej        ej        f         f         d'ed(ed)edAe	e         dBe	e         d+ed,e	e         d*e	e         d-e	e         d.e	e         dej        fdCZeddddddddDd?ej        d@eej        eej        ej        f         f         dAe	e         dBe	e         dEe	ej                 dFe	ej                 dGed         dHe	e         d+e	e         dej        fdI            ZeddddddddDd?ej        d@eej        eej        ej        f         f         dAe	e         dBe	e         dEe	ej                 dFe	ej                 dGed         dHe	e         d+e	e         deej        ej        f         fdJ            ZeddddddddddK	d?ej        d@eej        eej        ej        f         f         dLe	eeej        f                  dAe	eeej        f                  dBe	eeej        f                  dEe	ej                 dFe	ej                 dGedHe	e         d+e	e         dMe	ej                 deej        eej        ej        f         f         fdN            Z ej        edO          Z	 	 	 	 	 	 	 	 	 	 dTd?ej        d@eej        eej        ej        f         f         d'ed(ed)edAe	e         dBe	e         d+ed,e	e         d*e	e         d-e	e         d.e	e         deej        ej        f         fdPZdUdQZ dS )V#BatchPrefillWithPagedKVCacheWrappera  Wrapper class for prefill/append attention with paged kv-cache for batch of
    requests.

    Check :ref:`our tutorial <kv-layout>` for page table layout.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> num_layers = 32
    >>> num_qo_heads = 64
    >>> num_kv_heads = 16
    >>> head_dim = 128
    >>> max_num_pages = 128
    >>> page_size = 16
    >>> # allocate 128MB workspace buffer
    >>> workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
    >>> prefill_wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
    ...     workspace_buffer, "NHD"
    ... )
    >>> batch_size = 7
    >>> nnz_qo = 100
    >>> qo_indptr = torch.tensor(
    ...     [0, 33, 44, 55, 66, 77, 88, nnz_qo], dtype=torch.int32, device="cuda:0"
    ... )
    >>> paged_kv_indices = torch.arange(max_num_pages).int().to("cuda:0")
    >>> paged_kv_indptr = torch.tensor(
    ...     [0, 17, 29, 44, 48, 66, 100, 128], dtype=torch.int32, device="cuda:0"
    ... )
    >>> # 1 <= paged_kv_last_page_len <= page_size
    >>> paged_kv_last_page_len = torch.tensor(
    ...     [1, 7, 14, 4, 3, 1, 16], dtype=torch.int32, device="cuda:0"
    ... )
    >>> q_at_layer = torch.randn(num_layers, nnz_qo, num_qo_heads, head_dim).half().to("cuda:0")
    >>> kv_cache_at_layer = torch.randn(
    ...     num_layers, max_num_pages, 2, page_size, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0"
    ... )
    >>> # create auxiliary data structures for batch prefill attention
    >>> prefill_wrapper.plan(
    ...     qo_indptr,
    ...     paged_kv_indptr,
    ...     paged_kv_indices,
    ...     paged_kv_last_page_len,
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ...     page_size,
    ...     causal=True,
    ... )
    >>> outputs = []
    >>> for i in range(num_layers):
    ...     q = q_at_layer[i]
    ...     kv_cache = kv_cache_at_layer[i]
    ...     # compute batch prefill attention, reuse auxiliary data structures
    ...     o = prefill_wrapper.run(q, kv_cache)
    ...     outputs.append(o)
    ...
    >>> outputs[0].shape
    torch.Size([100, 64, 128])
    >>>
    >>> # below is another example of creating custom mask for batch prefill attention
    >>> mask_arr = []
    >>> qo_len = (qo_indptr[1:] - qo_indptr[:-1]).cpu().tolist()
    >>> kv_len = (page_size * (paged_kv_indptr[1:] - paged_kv_indptr[:-1] - 1) + paged_kv_last_page_len).cpu().tolist()
    >>> for i in range(batch_size):
    ...     mask_i = torch.tril(
    ...         torch.full((qo_len[i], kv_len[i]), True, device="cuda:0"),
    ...         diagonal=(kv_len[i] - qo_len[i]),
    ...     )
    ...     mask_arr.append(mask_i.flatten())
    ...
    >>> mask = torch.cat(mask_arr, dim=0)
    >>> prefill_wrapper.plan(
    ...     qo_indptr,
    ...     paged_kv_indptr,
    ...     paged_kv_indices,
    ...     paged_kv_last_page_len,
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ...     page_size,
    ...     custom_mask=mask,
    ... )
    >>> for i in range(num_layers):
    ...     q = q_at_layer[i]
    ...     kv_cache = kv_cache_at_layer[i]
    ...     # compute batch prefill attention, reuse auxiliary data structures
    ...     o_custom = prefill_wrapper.run(q, kv_cache)
    ...     assert torch.allclose(o_custom, outputs[i], rtol=1e-3, atol=1e-3)
    ...



    Note
    ----
    To accelerate computation, FlashInfer's batch prefill/append attention operators
    create some auxiliary data structures, these data structures can be reused across
    multiple prefill/append attention calls (e.g. different Transformer layers). This
    wrapper class manages the lifecycle of these data structures.
    r   FNr   r   r   use_cuda_graphqo_indptr_bufpaged_kv_indptr_bufpaged_kv_indices_bufpaged_kv_last_page_len_bufcustom_mask_bufmask_indptr_bufrd   jit_args
jit_kwargsr   c                 0   t          |           |.|i }t          |d         t          |
g|R i |          | _        nd| _        || _        |
dk    r|dk    s
J d            || _        | j                                        | j                                        z  | _        |j	        | _	        t          j        dt          j        | j	                  | _        t          j        dt          j        | j	                  | _        t          j        | j        j        | j        j        d	d
          | _        || _        |rt          j        |          st+          d          t          j        |          st+          d          t          j        |          st+          d          t          j        |          st+          d          t-          |          dz
  | _        t-          |          | j        dz   k    rt+          d          t-          |          | j        k    rt+          d          nd| _        || _        || _        || _        || _        || _        |	| _        d| _        |
| _        d| _         d| _!        d| _"        d| _#        d| _$        dS )am  Constructor of :class:`BatchPrefillWithPagedKVCacheWrapper`.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The user reserved workspace buffer used to store intermediate attention results in
            split-k algorithm. The recommended size is 128MB, the device of the workspace buffer
            should be the same as the device of the input tensors.

        kv_layout : str
            The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.

        use_cuda_graph : bool
            Whether to enable CUDA graph capture for the prefill kernels, if enabled, the
            auxiliary data structures will be stored in provided buffers. The ``batch_size``
            cannot change during the lifecycle of this wrapper when CUDAGraph is enabled.

        qo_indptr_buf : Optional[torch.Tensor]
            The user reserved buffer to store the ``qo_indptr`` array, the size of the buffer
            should be ``[batch_size + 1]``.
            This argument is only effective when ``use_cuda_graph`` is ``True``.

        paged_kv_indptr_buf : Optional[torch.Tensor]
            The user reserved buffer to store the ``paged_kv_indptr`` array, the size of this
            buffer should be ``[batch_size + 1]``.
            This argument is only effective when ``use_cuda_graph`` is ``True``.

        paged_kv_indices_buf : Optional[torch.Tensor]
            The user reserved buffer to store the ``paged_kv_indices`` array, should be large
            enough to store the maximum possible size of the ``paged_kv_indices`` array during
            the lifetime of the wrapper. This argument is only effective when ``use_cuda_graph``
            is ``True``.

        paged_kv_last_page_len_buf : Optional[torch.Tensor]
            The user reserved buffer to store the ``paged_kv_last_page_len`` array, the size of
            the buffer should be ``[batch_size]``.
            This argument is only effective when ``use_cuda_graph`` is ``True``.

        custom_mask_buf : Optional[torch.Tensor]
            The user reserved buffer to store the custom mask tensor, should be large enough to
            store the maximum possible size of the packed custom mask tensor during the lifetime of
            the wrapper. This argument is only effective when ``use_cuda_graph`` is set to ``True``
            and the custom mask will be used in attention computation.

        mask_indptr_buf : Optional[torch.Tensor]
            The user reserved buffer to store the ``mask_indptr`` array, the size of the buffer
            should be ``[batch_size + 1]``.
            This argument is only effective when ``use_cuda_graph`` is ``True`` and the custom
            mask will be used in attention computation.

        backend : str
            The implementation backend, could be ``auto``/``fa2``/``fa3``/``cudnn`` or ``trtllm-gen``.
            Defaults to ``auto``.
            If set to ``auto``, the wrapper will automatically choose the backend based on the
            device architecture and kernel availability.

        jit_args : Optional[List[Any]]
            If provided, the wrapper will use the provided arguments to create the JIT module,
            otherwise, the wrapper will use default attention implementation.

        jit_kwargs : Optional[Dict[str, Any]]
            The keyword arguments to create the JIT module, defaults to None.
        Nr   cudnnr   z&CUDNN backend only supports NHD layout)i   r   i   cpuTr   rG   
pin_memoryz9qo_indptr_buf should be a torch.Tensor in CUDA graph modez?paged_kv_indptr_buf should be a torch.Tensor in CUDA graph modez@paged_kv_indices_buf should be a torch.Tensor in CUDA graph modezFpaged_kv_last_page_len_buf should be a torch.Tensor in CUDA graph moder   z;The length of paged_kv_indptr_buf should be batch_size + 1.z>The length of paged_kv_last_page_len_buf should be batch_size.)%r!   r   ro   _jit_module
_kv_layout_float_workspace_buffernumelelement_size_workspace_sizerG   r7   r   int32_kv_lens_bufferuint8_int_workspace_bufferr   r    _pin_memory_int_workspace_buffer_use_cuda_graph	is_tensorrK   r  _fixed_batch_size_qo_indptr_buf_paged_kv_indptr_buf_paged_kv_indices_buf_paged_kv_last_page_len_buf_custom_mask_buf_mask_indptr_buf_max_total_num_rows_backend
_plan_info_cached_module_seq_lens_kv_seq_lens_q_block_tables)selfr   r   r  r  r  r  r   r!  r"  rd   r#  r$  s                r;   __init__z,BatchPrefillWithPagedKVCacheWrapper.__init__  s   ^ 	###!
;27TXTTTTT   D
  $D#g%%%'O%%%'=$(..00*7799: 	 -3${EK 
  
  
 &+[ek$+&
 &
 &
" 16&,,2	1
 1
 1
-  . 	'?=11  O   ?#677  U   ?#788  V   ?#=>>  \   &)%7%7!%;D"&''4+AA+EEE Q   -..$2HHH T   I &'D"+$7!%9"+E( / /26 " !r=   c                     | j         S rO   r6  rF  s    r;   is_cuda_graph_enabledz9BatchPrefillWithPagedKVCacheWrapper.is_cuda_graph_enabled      ##r=   r   c                     || _         || _        t          j        | j        j        | j        j        dd          | _        dS a  Reset the workspace buffer.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The new float workspace buffer, the device of the new float workspace buffer should
            be the same as the device of the input tensors.

        int_workspace_buffer : torch.Tensor
            The new int workspace buffer, the device of the new int workspace buffer should
            be the same as the device of the input tensors.
        r(  Tr)  Nr-  r4  r7   r   r   r   r5  rF  r   r   s      r;   reset_workspace_bufferz:BatchPrefillWithPagedKVCacheWrapper.reset_workspace_buffer"  K     (>$%9"05&,,2	1
 1
 1
---r=   r   rq   float16Tr   r   r   r   r   r   r   rB   r   rC   r   r   r  rD   rH   r   r   r   r   r   q_data_typekv_data_typeo_data_typenon_blockingprefix_len_ptrtoken_pos_in_items_ptrr   max_item_len_ptrrw   
seq_lens_qrv   max_token_per_sequencemax_sequence_kvfixed_split_sizedisable_split_kvc#                 t   t          |          }||}t          |          }||}t          |          }|d}|	|}	|!d}!t          |          dz
  }#|#| _        || _        || _        |
|t          |||          }$|<|
:t          |
                                                    d          |$d          \  }}$|| _	        || _
        || _        || _        ||| _        nc|                    d          }%t          |%dd         |%dd         z
                                            | _        t#          |%d                   }&| | | _        n|                    d          }'|                    d          }(|t'          |'|(          })n&|                                                                })| j        dt          |)                                       |)|           t          |)                                          | _        | j        r| j        |&| _        n3|&| j        k    r(t5          d	                    |&| j                            |#| j        k    r(t5          d
                    |#| j                            t          |          t          | j                  k    rt5          d          | j                            ||           | j                            ||           | j                             ||           | j        dt          |                                       ||j!        | j!        k    o|           |tE          j#        | j$                  st5          d          tE          j#        | j%                  st5          d          | j$        dt          |                                       ||j!        | j!        k    o|           | j%                            |$|           n|                    | j!        |          | _        |                    | j!        |          | _        |                    | j!        |          | _        |                    | j!        |          | _         |C|                    | j!        |          | _$        |$                    | j!        |          | _%        nd| _$        d| _%        || _&        || _'        || _(        | j)        | j)        | _*        n| j+        dk    r5tY          | j!        tZ          |         j.        || j$        du||          | _+        | j+        dk    r@||||j/        ||	tZ          |         j.        |dk    |dk    |f
}*ta          | j+        g|*R  | _*        || _1        | j+        dk    r|ste          d          |dk    sJ | j1        fd|)D             }+t          |+          },tE          j3        |#|,ftD          j        | j!                  | _1        |'d         }-ti          |#          D ]9}.|+|.         }/| j1        
J d            ||-|-|/z            | j1        |.d|/f<   |-|/z  }-:| j*        | j5        | j6        | j7        |%|'|)| j        p|&|#||| j        ||	||g}0| j+        dk    rA|08                    |!pd           |08                    |"           |08                    d            | j*        j9        |0 | _:        || _;        || _<        || _=        || _>        || _?        || _@        || _A        || _B        || _C        ||n|| _D        dS )a  Plan batch prefill/append attention on Paged KV-Cache for given problem specification.

        Parameters
        ----------
        qo_indptr : torch.Tensor
            The indptr of the query/output tensor, shape: ``[batch_size + 1]``.
        paged_kv_indptr : torch.Tensor
            The indptr of the paged kv-cache, shape: ``[batch_size + 1]``.
        paged_kv_indices : torch.Tensor
            The page indices of the paged kv-cache, shape: ``[paged_kv_indptr[-1]]``.
        paged_kv_last_page_len : torch.Tensor
            The number of entries in the last page of each request in the paged
            kv-cache, shape: ``[batch_size]``.
        num_qo_heads : int
            The number of query/output heads.
        num_kv_heads : int
            The number of key/value heads.
        head_dim_qk : int
            The dimension of the query/key heads.
        page_size : int
            The size of each page in the paged kv-cache.
        head_dim_vo : Optional[int]
            The dimension of the value/output heads, if not provided, will be set to
            ``head_dim_qk``.
        custom_mask : Optional[torch.Tensor]
            The flattened boolean mask tensor, shape: ``(sum(q_len[i] * k_len[i] for i in range(batch_size))``.
            The elements in the mask tensor should be either ``True`` or ``False``,
            where ``False`` means the corresponding element in the attention matrix will be
            masked out.

            Please refer to the :ref:`mask layout <mask-layout>` for more details about flattened
            layout of mask tensor.

            When :attr:`custom_mask` is provided, and :attr:`packed_custom_mask` is not, the
            function will pack the custom mask tensor into a 1D packed mask tensor, which introduces
            additional overhead.
        packed_custom_mask : Optional[torch.Tensor]
            The 1D packed uint8 mask tensor, if provided, the :attr:`custom_mask` will be ignored.
            The packed mask tensor is generated by :func:`flashinfer.quantization.packbits`.
        causal : bool
            Whether to apply causal mask to the attention matrix.
            This is only effective when :attr:`custom_mask` is not provided in
            :meth:`plan`.
        pos_encoding_mode : str
            The position encoding applied inside attention kernels, could be
            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
            Default is ``NONE``.
        use_fp16_qk_reduction : bool
            Whether to use f16 for qk reduction (faster at the cost of slight precision
            loss).
        window_left : int
            The left (inclusive) window size for the attention window, when set to ``-1``, the window
            size will be set to the full length of the sequence. Defaults to ``-1``.
        logits_soft_cap : Optional[float]
            The attention logits soft capping value (used in Gemini, Grok and Gemma-2, etc.), if not
            provided, will be set to ``0``. If greater than 0, the logits will be capped according to
            formula:
            :math:`\texttt{logits_soft_cap} \times \mathrm{tanh}(x / \texttt{logits_soft_cap})`,
            where :math:`x` is the input logits.
        sm_scale : Optional[float]
            The scale used in softmax, if not provided, will be set to
            ``1.0 / sqrt(head_dim)``.
        rope_scale : Optional[float]
            The scale used in RoPE interpolation, if not provided, will be set to
            ``1.0``.
        rope_theta : Optional[float]
            The theta used in RoPE, if not provided, will be set to ``1e4``.
        q_data_type : Union[str, torch.dtype]
            The data type of the query tensor, defaults torch.float16.
        kv_data_type : Optional[Union[str, torch.dtype]]
            The data type of the key/value tensor. If None, will be set to :attr:`q_data_type`.
        o_data_type : Optional[Union[str, torch.dtype]]
            The data type of the output tensor. If None, will be set to :attr:`q_data_type`.
            For FP8 inputs, this should typically be set to torch.float16 or torch.bfloat16.
        non_blocking : bool
            Whether to copy the input tensors to the device asynchronously, defaults to ``True``.
        prefix_len_ptr :Optional[torch.Tensor]
            prefix length. A uint32 1D tensor indicating the prefix length of each prompt. The tensor size is equal to the batch size.
        token_pos_in_items_ptr : Optional[torch.Tensor]
            A uint16 1D tensor (it will be converted to uint16 in flashinfer) indicating the token position of each item and started from 0 (delimiter)
            for each item. E.g., if we have 3 items of length 3, 2, 4 respectively for this member. This vector will be looking like
            `[0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0]` with 4 delimiters indexed as 0. For batch size > 1,
            we will concat them as 1D with zero paddings to make sure each has the same length, the padding length is defined by
            `token_pos_in_items_len` - length of the raw `token_pos_in_items_ptr` for each prompt.
        token_pos_in_items_len : int
            zero padding length for `token_pos_in_items_ptr` to better handle the bsz > 1 case. Still using the above 3,2,4 example.
            If we set `token_pos_in_items_len` to be 20, it will be  `[0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0]`
            with 7 padded zeros. (note there're 8 zeros in the end where the first one is the delimiter token 0 in the end of the prompt)
        max_item_len_ptr : Optional[torch.Tensor]
            a uint16 vector contains the max token length of all items for each prompt
        seq_lens: Optional[torch.Tensor]
            A uint32 1D tensor indicating the kv sequence length of each prompt. shape: ``[batch_size]``.
        seq_lens_q: Optional[torch.Tensor]
            A uint32 1D tensor indicating the q sequence length of each prompt. shape: ``[batch_size]``.
            If not provided, will be set to the same value as ``seq_lens``.
        block_tables: Optional[torch.Tensor]
            A uint32 2D tensor indicating the block table of each prompt. shape: ``[batch_size, max_num_blocks_per_seq]``.
        max_token_per_sequence: Optional[int],
            Required for cudnn backend. This is the scalar max token length of each sequence.
        max_sequence_kv: Optional[int],
            Required for cudnn backend. This is the scalar max sequence length of each sequence in kv cache.
        fixed_split_size : Optional[int],
            The fixed split size for FA2 split-kv prefill/decode in pages. Recommend setting to the average sequence length of your workload.
            When enabled, will lead to deterministic softmax score reduction in the merge_states kernel, and therefore
            batch-size invariant outputs. See https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/
            Note that compatibility with CUDA graph is NOT guaranteed, as even when bs is fixed, kv seq len can change
            and lead to a varied number of launched CTAs.
        disable_split_kv : bool,
            Whether to disable the split-kv for determinism in CUDA Graph, defaults to ``False``.
        Note
        ----
        The :meth:`plan` method should be called before any :meth:`run` or
        :meth:`run_return_lse` calls, auxiliary data structures will be created
        during this call and cached for multiple kernel runs.

        The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads``
        is not equal to ``num_kv_heads``, the function will use
        `grouped query attention <https://arxiv.org/abs/2305.13245>`_.

        The :meth:`plan` method cannot be used in Cuda Graph or in ``torch.compile``.
        Nr  rq   r   r	  r
  r(  rW  zThe total number of rows in qo_indptr {} in cuda graph mode cannot exceed the number of rows set during initialization {}.zThe batch size should be fixed during the lifecycle of the wrapper in cuda graph mode, the runtime batch size {} mismatches the batch size {}  set during initialization.zAThe length of paged_kv_indices exceeds the allocated buffer size.zcustom_mask_buf must be initialized with a torch.Tensor in cuda graph mode if we use custom mask in attention computation.zzmask_indptr_buf must be initialized with a torch.Tensor in cuda graph mode if we use custom mask in attention computation.r   r&  r   r   zNon-causal attention is not supported for trtllm-gen backend with paged KV cache. Please use causal=True or choose a different backend (e.g., fa2, fa3, cudnn).c                 &    g | ]}|z   d z
  z  S )r   rP   ).0seq_lenr   s     r;   
<listcomp>z<BatchPrefillWithPagedKVCacheWrapper.plan.<locals>.<listcomp>  s8     " " " y(1,:" " "r=   r   zblock_tables is not initializedr   )Er'   r  _batch_size_num_qo_heads_num_kv_headsr  r   r   r  _prefix_len_ptr_token_pos_in_items_ptr_token_pos_in_items_len_max_item_len_ptr
_max_q_lentomaxitemr   _max_kv_lenr   r(  flattenr2  copy_rK  r?  rK   formatr8  r;  r9  r:  r<  rG   r7   r7  r=  r>  _cached_q_data_type_cached_kv_data_type_cached_o_data_typer+  rB  r@  r(   r   r^   r   r   rE  r   zerosranger-  r4  r5  rW   r   rA  _causal_pos_encoding_mode_use_fp16_qk_reduction_window_left_logits_soft_cap	_sm_scale_rope_scale_rope_thetarC  rD  )1rF  r   r   r   r   r   r   rB   r   rC   r   r   r  rD   rH   r   r   r   r   r   rT  rU  rV  rW  rX  rY  r   rZ  rw   r[  rv   r\  r]  r^  r_  r|   r  qo_indptr_hosttotal_num_rowspaged_kv_indptr_hostpaged_kv_last_page_len_hostkv_lens_arr_hostget_module_argsblocks_per_seqmax_num_blocks_per_seqblock_idinum_blocks_neededrQ   s1           `                                        r;   r   z(BatchPrefillWithPagedKVCacheWrapper.plan:  s   ~ /{;;&L/==%K.{;;"!O%K#!^^a'
%))"&8&D3&	 K %+*A.>&&((--b11!/ / /+  .'=$'=$!1 "-4DOO&\\%00N!."4~crc7J"JKKPPRRDO !344N&.D#2#5#5e#<#< *@*C*CE*J*J'#/(*Ey$ $   $,<<>>#9#9#;#;  !83'7#8#8!89?? | @     ##34499;;D% F	-'/+9(($"::: NNTf&(@O O   T333 228&"D$:3 3   #$$s4+E'F'FFF W   %%il%KKK%++O,+WWW,22&\ 3    &'>-=)>)>'>?EE .5DV, F   
 "-t'<== $ U   t'<== $ U   %&?,>(?(?&?@FF&"4";t{"J "%$ G    %++Kl+SSS"+,,t{,"V"VD(7(:(:, ); ) )D% *:)<)<, *= * *D& 0F/H/H, 0I 0 0D, "-(:(=(=Kl )> ) )% )4Kl )7 ) )%% )-%(,%#. $0!#. '"&"2D}&& ;K#$56<))5 ! ! }'' #)#$56<1$#a')# '?M'$3' ' '# *=L(( )d   #c))))!)" " " "#3" " " *-^)<)<&%*[!78);& & &"
 02z** 2 2A(6q(9%-999 :99 AQ 8.?#??AD&q*<+<*<'<=  11HH*,*5$ (:N*!D$ }%%,2333,---A6d16DO "3&;#' /!%%$)3)?::Xr=   r   paged_kv_cachek_scalev_scalec                     || _         || _        || _        || _        |	| _        |
| _        || _        || _        |                     ||||          S )EWarning: This function is deprecated, please use :meth:`run` instead.r  r  	r|  r}  r~  r  r  r  r  r  r   rF  r   r  r  rD   rH   r  r  r   r   r   r   r   s                r;   forwardz+BatchPrefillWithPagedKVCacheWrapper.forward  s\      "3&;#' /!%%xx>7GxLLLr=   )r  r  r   r   r   r   r   r   r   r   r   c                    d S rO   rP   rF  r   r  r  r  r   r   r   r   r   rQ   s              r;   r   z'BatchPrefillWithPagedKVCacheWrapper.run  s	     sr=   c                    d S rO   rP   r  s              r;   r   z'BatchPrefillWithPagedKVCacheWrapper.run  s	     -0Cr=   )	q_scaler  r  r   r   r   r   r   r   r  r   c       	            |	t          |j                  }	t          || j                  \  }}t	          ||| j        | j                   | j        }|&|j        |k    rt          d|j         d| d          | j        dk    r|j
        d         }n|j
        d         }|
| j        n|
}
| j        dk    r|
| j        k    sJ | j        }| j        }| j        }| j        }|d	}|*d
t#          j        |                    d                    z  }| j        dk    r|||z  }|||z  }|d
}|d}|r|Ot)          j        |                    d          |                    d          ft(          j        |j                  }nJt/          ||                    d          |                    d          ft(          j        |j        d           |Rt1          | dd          p|j        }t)          j        |j
        dd         |j
        dd         z   ||j                  }nMt1          | dd          p|j        }t/          ||j
        dd         |j
        dd         z   ||j        d           | j        dk    r7| j        dk    r,|                    dd          }|                    dd          }| j        t8          j        j        }n*| j        rt8          j         j        }nt8          j!        j        }| j"        t8          j#        j        }| j        dk    r| j$        D| j$        %                                dk    r'| j$        &                    | j'        ddd          | _$        | j(        D| j(        %                                dk    r'| j(        &                    | j'        ddd          | _(        tS          |||| j        | j*        f| j$        | j(        | j+        | j,        | j-        | j        ||||| j.        | j.        |||d n| j        dk    r| j/        
J d            | j*        | j0        | j/        |||| j.        | j1        | j2        | j3        |||th          | j                 j        |
|	g}| j5        #|6                    to          |                     nd}d}d}tq          |          r+ts          |          dk    r|d         }|d         }|d         }|| j        | j:        tw          |j
        d         |j                  | j"        | j<        | j=        |||||||| j>        | j?        | j@        | jA        | j-        | jB        || j+        | j,        | j'        | j.        | j1        |gz  }| jC        
J d             | jC        jD        |  t          |t                    o|d
k    }|Q|sOtq          |          r;|G                    t(          j                  |z  G                    |j                  }n||z  }|r||fn|S )a
  Compute batch prefill/append attention between query and paged kv-cache.

        Parameters
        ----------
        q : torch.Tensor
            The query tensor, shape: ``[qo_indptr[-1], num_qo_heads, head_dim]``
        paged_kv_cache : Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            The paged KV-Cache stored as a tuple of tensors or a single tensor:

            * a tuple ``(k_cache, v_cache)`` of 4-D tensors, each with shape:
              ``[max_num_pages, page_size, num_kv_heads, head_dim]`` if :attr:`kv_layout` is ``NHD``,
              and ``[max_num_pages, num_kv_heads, page_size, head_dim]`` if :attr:`kv_layout` is ``HND``.

            * a single 5-D tensor with shape:
              ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
              :attr:`kv_layout` is ``NHD``, and
              ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
              :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
              ``paged_kv_cache[:, 1]`` is the value-cache.

        *args
            Additional arguments for custom kernels.
        q_scale : Optional[Union[float, torch.Tensor]]
            The calibration scale of query for fp8 input, if not provided, will be set to ``1.0``.
        k_scale : Optional[Union[float, torch.Tensor]]
            The calibration scale of key for fp8 input, if not provided, will be set to ``1.0``.
        v_scale : Optional[Union[float, torch.Tensor]]
            The calibration scale of value for fp8 input, if not provided, will be set to ``1.0``.
        out : Optional[torch.Tensor]
            The output tensor, if not provided, will be allocated internally.
        lse : Optional[torch.Tensor]
            The log-sum-exp of attention logits, if not provided, will be allocated internally.
        return_lse : bool
            Whether to return the logsumexp of attention output
        enable_pdl : bool
            Whether to enable Programmatic Dependent Launch (PDL). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#programmatic-dependent-launch-and-synchronization
            Only supported for >= sm90, and currently only for FA2 and CUDA core decode.
        Returns
        -------
        Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            If :attr:`return_lse` is ``False``, the attention output, shape: ``[qo_indptr[-1], num_qo_heads, head_dim]``.
            If :attr:`return_lse` is ``True``, a tuple of two tensors:

            * The attention output, shape: ``[qo_indptr[-1], num_qo_heads, head_dim]``.
            * The logsumexp of attention output, shape: ``[qo_indptr[-1], num_qo_heads]``.
        NzThe dtype of out z  does not match the o_data_type z specified in plan function.r   r      r   r  r5   rq   r&  r  r   r   r   ry  r   )actual_seq_lens_qactual_seq_lens_kvr\  r]  rv   r  r   r  r  r  batch_offsets_qbatch_offsets_or   r   rV  zplan info is not initialized    cached module is not initialized)Hr)   rG   r&   r,  r    rw  rx  ry  r   rK   r   r  r@  r  r  r  r  r  r  r   r7   r   r   r#   getattrrz  	transposer=  r   r  r^   r|  r  r  rk  MULTIITEMSCORINGrD  dimreshaperh  rC  r   r-  ro  rs  rE  r9  rA  r4  r:  r;  r<  r   r+  extendrV   r+   r  r>  r$   rl  rn  rm  r0  ri  rj  r2  rB  r   r6   r9   rp  )rF  r   r  r  r  r  r   r   r   r   r   r   rQ   rs   rt   r   r   r   r   r   r   	out_dtyper   run_argsfp8_scale_qfp8_scale_kfp8_scale_vis_float_ones                               r;   r   z'BatchPrefillWithPagedKVCacheWrapper.run   s   | +AH55J1.$/RR#w0$2K	
 	
 	
 *?syG33tCIttwttt   ?e##a(IIa(I+6+>d''K=L(( $"33333/>%
%
"!OTYqvvbzz222H=G##"G#"G#JJ 	{kVVAYYq		*%-   )!&&))QVVAYY/%   ;&;TBBMagI+w}RSS1118  CC  &;TBBMagI$QWSbS\GM"##$66	18U  
 =L((T_-E-E''B//G''B//G , -II| 6$O1		$/5	+ 17I=G##+0@0D0D0F0F!0K0K#'#3#;#;D<LaQRTU#V#V  ,1B1F1F1H1HA1M1M$($5$=$=d>NPQSTVW$X$X!-, #'"2#'#4'+ $ 0!/|% $ 3 $ 3%)    . },,224R222,*#)*0T_-3!H$ +T

++++ #""Q<< *CIINN"&q'K"&q'K"&q'K))/
AHEE(0*#0(&&&(O$$'-5 : &224V222)D)844%gu55H'S.L"<"S>> #66%-007:>>syIICC7NC'0SzzS0r=   r  c                     || _         || _        || _        || _        |	| _        |
| _        || _        || _        |                     ||||          S )PWarning: This function is deprecated, please use :meth:`run_return_lse` instead.r  	r|  r}  r~  r  r  r  r  r  run_return_lser  s                r;   forward_return_lsez6BatchPrefillWithPagedKVCacheWrapper.forward_return_lse  s_      "3&;#' /!%%""1ngw"WWWr=   c                     dS z7Warning: this function is deprecated and has no effect.NrP   rJ  s    r;   end_forwardz/BatchPrefillWithPagedKVCacheWrapper.end_forward	      r=   )r   FNNNNNNr   NN)NNNFr   FNrq   NNNrS  NNTNNr   NNNNNNNF)
Fr   FNNrq   NNNNr   N)!__name__
__module____qualname____doc__r   r7   r8   strr   r   r   r   r   rG  propertyrK  rQ  r   r9   r
   r   r   begin_forwardr	   r  r   r   r   r`   partialmethodr  r  r  rP   r=   r;   r  r    s	       c cJ  $046:7;=A2626(,/3\" \" %\" \" 	\"
  -\" &el3\" 'u|4\" %-U\$:\" "%,/\" "%,/\" \" 49%\" T#s(^,\" 
\" \" \" ^\"| $t $ $ $ X$
&+l
JO,
	
 
 
 
0  &*.259!'&+$(+/&*&*/8:>9=!159=&'37+/-1/304)-*.!&GIN IN<IN IN  ,	IN
 !&IN IN IN IN IN c]IN el+IN %U\2IN IN IN  $IN  5/!IN" #IN$ "%%IN& UO'IN( UO)IN* 3+,+IN, uS%+%567-IN. eC$456/IN0 1IN2 !.3IN4 !) 65IN6 !$7IN8 #5<09IN: 5<(;IN< U\*=IN> u|,?IN@ !)AINB "#CIND #3-EINF GINH 
IIN IN IN ^INV M !'&+#'#'+/$(&*&*M M<M elE%,2L,MMNM 	M
 M  $M %M %M M "%M 5/M UOM UOM 
M M M M4  $(#'&*&*%*%)%)  < elE%,2L,MMN
 % % el# el# EN TN c] 
   X  $(#'&*&*$(%)%)0 0 0<0 elE%,2L,MMN0
 %0 %0 el#0 el#0 DM0 TN0 c]0 
u|U\)	*0 0 0 X0  9=8<8<&*&* %)%)(,o1 o1 o1<o1 elE%,2L,MMNo1
 %u| 345o1 %u| 345o1 %u| 345o1 el#o1 el#o1 o1 TNo1 c]o1 %o1 
u|U5<#=>>	?o1 o1 o1 ^o1b -Y,STBBBN !'&+#'#'+/$(&*&*X X<X elE%,2L,MMNX 	X
 X  $X %X %X X "%X 5/X UOX UOX 
u|U\)	*X X X X4     r=   r  r   c                    t          |           t          |          k    rt          d          t          j        |           }d|d<   t          j        | dd          | d d         z
  |dd          |d d         z
  z  d          |dd <   |S )Nz9The length of qo_indptr and kv_indptr should be the same.r   r   rq   r  )r   r   r  s      r;   _compute_mask_indptrr  	  s     9~~Y''TUUU"9--KKNl	1223B3	'IabbMIcrcN,JK	 KO r=   c            8       >   e Zd ZdZe	 	 	 	 	 	 	 	 	 dHdej        deded	e	ej                 d
e	ej                 de	ej                 de	ej                 dede	e
e                  de	eeef                  ddfd            Zedefd            Zdej        ddfdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dIdej        dej        dededede	e         de	ej                 d e	ej                 d!ed"ed#ed$ed%e	e         d&e	e         d'e	e         d(e	e         d)eeej        f         d*e	eeej        f                  d+e	eeej        f                  d,ed-e	ej                 d.e	ej                 d/ed0e	ej                 d1e	e         d2eddf6d3            ZeZ	 	 	 	 	 	 	 	 dJd4ej        d5ej        d6ej        d!ed"ed#ed$ed%e	e         d&e	e         d'e	e         d(e	e         dej        fd7Zeddddd8d4ej        d5ej        d6ej        d9e	ej                 d:e	ej                 d;ed         d<e	e         dej        fd=            Zeddddd8d4ej        d5ej        d6ej        d9e	ej                 d:e	ej                 d;ed         d<e	e         deej        ej        f         fd>            Zeddddddddd?d4ej        d5ej        d6ej        d@e	e         dAe	e         dBe	e         dCe	e         d9e	ej                 d:e	ej                 d;ed<e	e         deej        eej        ej        f         f         fdD            Z ej        edE          Z	 	 	 	 	 	 	 	 dJd4ej        d5ej        d6ej        d!ed"ed#ed$ed%e	e         d&e	e         d'e	e         d(e	e         deej        ej        f         fdFZdKdGZ dS )L$BatchPrefillWithRaggedKVCacheWrappera  Wrapper class for prefill/append attention with ragged (tensor) kv-cache for
    batch of requests.

    Check :ref:`our tutorial <kv-layout>` for ragged kv-cache layout.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> num_layers = 32
    >>> num_qo_heads = 64
    >>> num_kv_heads = 16
    >>> head_dim = 128
    >>> # allocate 128MB workspace buffer
    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
    >>> prefill_wrapper = flashinfer.BatchPrefillWithRaggedKVCacheWrapper(
    ...     workspace_buffer, "NHD"
    ... )
    >>> batch_size = 7
    >>> nnz_kv = 100
    >>> nnz_qo = 100
    >>> qo_indptr = torch.tensor(
    ...     [0, 33, 44, 55, 66, 77, 88, nnz_qo], dtype=torch.int32, device="cuda:0"
    ... )
    >>> kv_indptr = qo_indptr.clone()
    >>> q_at_layer = torch.randn(num_layers, nnz_qo, num_qo_heads, head_dim).half().to("cuda:0")
    >>> k_at_layer = torch.randn(num_layers, nnz_kv, num_kv_heads, head_dim).half().to("cuda:0")
    >>> v_at_layer = torch.randn(num_layers, nnz_kv, num_kv_heads, head_dim).half().to("cuda:0")
    >>> # create auxiliary data structures for batch prefill attention
    >>> prefill_wrapper.plan(
    ...     qo_indptr,
    ...     kv_indptr,
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ...     causal=True,
    ... )
    >>> outputs = []
    >>> for i in range(num_layers):
    ...     q = q_at_layer[i]
    ...     k = k_at_layer[i]
    ...     v = v_at_layer[i]
    ...     # compute batch prefill attention, reuse auxiliary data structures
    ...     o = prefill_wrapper.run(q, k, v)
    ...     outputs.append(o)
    ...
    >>> outputs[0].shape
    torch.Size([100, 64, 128])
    >>>
    >>> # below is another example of creating custom mask for batch prefill attention
    >>> mask_arr = []
    >>> qo_len = (qo_indptr[1:] - qo_indptr[:-1]).cpu().tolist()
    >>> kv_len = (kv_indptr[1:] - kv_indptr[:-1]).cpu().tolist()
    >>> for i in range(batch_size):
    ...     mask_i = torch.tril(
    ...         torch.full((qo_len[i], kv_len[i]), True, device="cuda:0"),
    ...         diagonal=(kv_len[i] - qo_len[i]),
    ...     )
    ...     mask_arr.append(mask_i.flatten())
    ...
    >>> mask = torch.cat(mask_arr, dim=0)
    >>> prefill_wrapper.plan(
    ...     qo_indptr,
    ...     kv_indptr,
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ...     custom_mask=mask
    ... )
    >>> outputs_custom_mask = []
    >>> for i in range(num_layers):
    ...     q = q_at_layer[i]
    ...     k = k_at_layer[i]
    ...     v = v_at_layer[i]
    ...     # compute batch prefill attention, reuse auxiliary data structures
    ...     o_custom = prefill_wrapper.run(q, k, v)
    ...     assert torch.allclose(o_custom, outputs[i], rtol=1e-3, atol=1e-3)
    ...
    >>> outputs_custom_mask[0].shape
    torch.Size([100, 64, 128])


    Note
    ----
    To accelerate computation, FlashInfer's batch prefill/append attention operators
    create some auxiliary data structures, these data structures can be reused across
    multiple prefill/append attention calls (e.g. different Transformer layers). This
    wrapper class manages the lifecycle of these data structures.
    r   FNr   r   r   r  r  kv_indptr_bufr!  r"  rd   r#  r$  r   c                 L   t          |           |	.|
i }
t          |	d         t          |g|	R i |
          | _        nd| _        || _        || _        |j        | _        t          j        dt          j	        | j                  | _
        t          j        | j
        j        t          j	        dd          | _        || _        |rt          j        |          st          d          t          j        |          st          d	          t!          |          d
z
  | _        t!          |          | j        d
z   k    r5t          d                    t!          |          | j                            || _        || _        || _        || _        d| _        || _        d| _        dS )ac
  Constructor of :class:`BatchPrefillWithRaggedKVCacheWrapper`.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The user reserved float workspace buffer used to store intermediate attention results
            in the split-k algorithm. The recommended size is 128MB, the device of the workspace
            buffer should be the same as the device of the input tensors.

        kv_layout : str
            The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.

        use_cuda_graph : bool
            Whether to enable CUDA graph capture for the prefill kernels, if enabled, the
            auxiliary data structures will be stored as the provided buffers.

        qo_indptr_buf : Optional[torch.Tensor]
            The user reserved GPU buffer to store the ``qo_indptr`` array, the size of the buffer
            should be ``[batch_size + 1]``.
            This argument is only effective when ``use_cuda_graph`` is ``True``.

        kv_indptr_buf : Optional[torch.Tensor]
            The user reserved GPU buffer to store the ``kv_indptr`` array, the size of the buffer
            should be ``[batch_size + 1]``.
            This argument is only effective when ``use_cuda_graph`` is ``True``.

        custom_mask_buf : Optional[torch.Tensor]
            The user reserved GPU buffer to store the custom mask tensor, should be large
            enough to store the maximum possible size of the packed custom mask tensor during the
            lifetime of the wrapper. This argument is only effective when ``use_cuda_graph``
            is ``True`` and custom mask will be used in attention computation.

        mask_indptr_buf : Optional[torch.Tensor]
            The user reserved GPU buffer to store the ``mask_indptr`` array, the size of the buffer
            should be ``[batch_size]``.
            This argument is only effective when ``use_cuda_graph`` is ``True`` and custom mask
            will be used in attention computation.

        backend : str
            The implementation backend, could be ``auto``/``fa2``/``fa3`` or ``cutlass``.
            Defaults to ``auto``.
            If set to ``auto``, the wrapper will automatically choose the backend based on the
            device architecture and kernel availability.

        jit_args : Optional[List[Any]]
            If provided, the wrapper will use the provided arguments to create the JIT module,
            otherwise, the wrapper will use default attention implementation.

        jit_kwargs : Optional[Dict[str, Any]]
            The keyword arguments to create the JIT module, defaults to None.
        Nr   r'  r   Tr(  )r   r*  rG   z9qo_indptr_buf should be a torch.Tensor in cuda graph modez9kv_indptr_buf should be a torch.Tensor in cuda graph moder   zJThe length of kv_indptr_buf ({}) should be the same as qo_indptr_buf ({}).)r!   r   ro   r+  r,  r-  rG   r7   r   r3  r4  r   r5  r6  r7  rK   r  r8  rv  r9  _kv_indptr_bufr=  r>  r?  r@  rB  )rF  r   r   r  r  r  r!  r"  rd   r#  r$  s              r;   rG  z-BatchPrefillWithRaggedKVCacheWrapper.__init__|	  s   B 	###!
;27TXTTTTT   D
  $D#'=$,3%*[ek$+&
 &
 &
" 16&,+	1
 1
 1
-  . 	?=11  O   ?=11  O   &)%7%7!%;D"=!!T%;a%??? `ggM**D,B    ,+ / /26 "r=   c                     | j         S rO   rI  rJ  s    r;   rK  z:BatchPrefillWithRaggedKVCacheWrapper.is_cuda_graph_enabled	  rL  r=   c                     || _         || _        t          j        | j        j        | j        j        dd          | _        dS rN  rO  rP  s      r;   rQ  z;BatchPrefillWithRaggedKVCacheWrapper.reset_workspace_buffer	  rR  r=   r   rq   rS  Tr   r   r   r   r   rB   rC   r   r   r  rD   rH   r   r   r   r   r   rT  rU  rV  rW  rX  rY  r   rZ  r^  r_  c                 <
   t          |          }||}t          |          }||}t          |          }||}|d}|d}t          |          dz
  }t          |          |dz   k    rt          d          ||t          ||          }|<|:t	          |                                                    d          |d          \  }}|                    d          }|                    d          }t          |d                   }| j	        r5| j
        || _
        n3|| j
        k    r(t          d	                    || j
                            || j        k    r(t          d
                    || j                            | j                            ||           | j                            ||           |t!          j        | j                  st          d          t!          j        | j                  st          d          || j        dt          |          <   | j                            ||           n|                    | j        |          | _        |                    | j        |          | _        |B|                    | j        |          | _        |                    | j        |          | _        || _        || _        || _        |dd         |dd         z
  } || _        || _        || _        || _        | j        | j        | _        n| j        dk    r5t?          | j        t@          |
         j!        || j        du||          | _        ||||j"        ||t@          |
         j!        |dk    |dk    |f
}!| j        dk    r-|!dd         |j        fz   |!dd         z   }"tG          |" | _        ntI          | j        g|!R  | _        | j        dk    r\tK          | j        ||||	          | _&        t!          j'        |dd         |dd         z
            (                                | _)        n| j        
J d            | j*        | j+        | j,        ||| | j
        p||||d| j	        |||	|g}#| j        dk    rA|#-                    |pd           |#-                    |           |#-                    d            | j        j.        |# | _&        |	| _/        |
| _0        || _1        || _2        || _3        || _4        || _5        || _6        dS )a  Plan batch prefill/append attention on Ragged KV-Cache for given problem specification.

        Parameters
        ----------
        qo_indptr : torch.Tensor
            The indptr of the query/output tensor, shape: ``[batch_size + 1]``.
        kv_indptr : torch.Tensor
            The indptr of the key/value tensor, shape: ``[batch_size + 1]``.
        num_qo_heads : int
            The number of query/output heads.
        num_kv_heads : int
            The number of key/value heads.
        head_dim_qk : int
            The dimension of the heads on query/key tensor.
        head_dim_vo : Optional[int]
            The dimension of the heads on value/output tensor.
            If not provided, will be set to ``head_dim_qk``.
        custom_mask : Optional[torch.Tensor]
            The flattened boolean mask tensor, shape: ``(sum(q_len[i] * k_len[i] for i in range(batch_size))``.
            The elements in the mask tensor should be either ``True`` or ``False``,
            where ``False`` means the corresponding element in the attention matrix will be
            masked out.

            Please refer to the :ref:`mask layout <mask-layout>` for more details about flattened
            layout of mask tensor.

            When :attr:`custom_mask` is provided, and :attr:`packed_custom_mask` is not, the
            function will pack the custom mask tensor into a 1D packed mask tensor, which introduces
            additional overhead.
        packed_custom_mask : Optional[torch.Tensor]
            The 1D packed uint8 mask tensor, if provided, the :attr:`custom_mask` will be ignored.
            The packed mask tensor is generated by :func:`flashinfer.quantization.packbits`.

            If provided, the custom mask will be added to the attention matrix before softmax
            and after scaling. The mask tensor should be in the same device as the input tensors.
        causal : bool
            Whether to apply causal mask to the attention matrix.
            This argument is ignored if ``mask`` is provided in :meth:`plan`.
        pos_encoding_mode : str
            The position encoding applied inside attention kernels, could be
            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
            Default is ``NONE``.
        use_fp16_qk_reduction : bool
            Whether to use f16 for qk reduction (faster at the cost of slight precision
            loss).
        window_left : int
            The left (inclusive) window size for the attention window, when set to ``-1``, the window
            size will be set to the full length of the sequence. Defaults to ``-1``.
        logits_soft_cap : Optional[float]
            The attention logits soft capping value (used in Gemini, Grok and Gemma-2, etc.), if not
            provided, will be set to ``0``. If greater than 0, the logits will be capped according to
            formula:
            :math:`\texttt{logits_soft_cap} \times \mathrm{tanh}(x / \texttt{logits_soft_cap})`,
            where :math:`x` is the input logits.
        sm_scale : Optional[float]
            The scale used in softmax, if not provided, will be set to
            ``1.0 / sqrt(head_dim_qk)``.
        rope_scale : Optional[float]
            The scale used in RoPE interpolation, if not provided, will be set to
            ``1.0``.
        rope_theta : Optional[float]
            The theta used in RoPE, if not provided, will be set to ``1e4``.
        q_data_type : Union[str, torch.dtype]
            The data type of the query tensor, defaults to torch.float16.
        kv_data_type : Optional[Union[str, torch.dtype]]
            The data type of the key/value tensor. If None, will be set to :attr:`q_data_type`.
        o_data_type : Optional[Union[str, torch.dtype]]
            The data type of the output tensor. If None, will be set to :attr:`q_data_type`.
            For FP8 inputs, this should typically be set to torch.float16 or torch.bfloat16.
        non_blocking : bool
            Whether to copy the input tensors to the device asynchronously, defaults to ``True``.
        prefix_len_ptr :Optional[torch.Tensor]
            prefix length. A uint32 1D tensor indicating the prefix length of each prompt. The tensor size is equal to the batch size.
        token_pos_in_items_ptr : Optional[torch.Tensor]
            A uint16 1D tensor (it will be converted to uint16 in flashinfer) indicating the token position of each item and started from 0 (delimiter)
            for each item. E.g., if we have 3 items of length 3, 2, 4 respectively for this member. This vector will be looking like
            `[0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0]` with 4 delimiters indexed as 0. For batch size > 1,
            we will concat them as 1D with zero paddings to make sure each has the same length, the padding length is defined by
            `token_pos_in_items_len` - length of the raw `token_pos_in_items_ptr` for each prompt.
        token_pos_in_items_len : int
            zero padding length for `token_pos_in_items_ptr` to better handle the bsz > 1 case. Still using the above 3,2,4 example.
            If we set `token_pos_in_items_len` to be 20, it will be  `[0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0]`
            with 7 padded zeros. (note there're 8 zeros in the end where the first one is the delimiter token 0 in the end of the prompt)
        max_item_len_ptr : Optional[torch.Tensor]
            a uint16 vector contains the max token length of all items for each prompt
        fixed_split_size : Optional[int],
            The fixed split size for split-kv FA2 prefill/decode, in pages. Recommend setting to the average sequence length of your workload.
            When enabled, will lead to deterministic softmax score reduction in the merge_states kernel, and therefore
            batch-size invariant outputs. See https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/
            Note that compatibility with CUDA graph is NOT guaranteed, as even when bs is fixed, kv seq len can change
            and lead to a varied number of launched CTAs.
        disable_split_kv : bool,
            Whether to disable the split-kv for determinism in CUDA Graph, defaults to ``False``.
        Note
        ----
        The :meth:`plan` method should be called before any :meth:`run` or
        :meth:`run_return_lse` calls, auxiliary data structures will be created
        during this plan call and cached for multiple kernel runs.

        The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads``
        is not equal to ``num_kv_heads``, the function will use
        `grouped query attention <https://arxiv.org/abs/2305.13245>`_.

        The :meth:`plan` method cannot be used in Cuda Graph or in ``torch.compile``.
        Nrq   r  r   z;The kv_indptr length should be equal to mask_indptr length.r	  r
  r(  rb  zThe batch size should be fixed in cudagraph mode, the runtime batch size {}  mismatches the batch size set during initialization {}.ra  rc  z~mask_indptr_buf must be initialized with a torch.Tensor in cuda graph mode if we use custom mask in the attention computation.r   r   cutlass	   r  r   )7r'   r  rK   r  r   r   r  rp  r   rK  r?  rv  r8  r9  ru  r  r7   r7  r=  r>  rG   rw  rx  ry  rk  rl  rm  rn  r+  rB  r@  r(   r   r^   r   rL   r   fmha_varlen_planrA  rq  rr  _max_qo_lenr-  r4  r5  rW   r   r|  r}  r~  r  r  r  r  r  )$rF  r   r   r   r   rB   rC   r   r   r  rD   rH   r   r   r   r   r   rT  rU  rV  rW  rX  rY  r   rZ  r^  r_  r|   r  r  kv_indptr_hostr  
kv_len_arrr  new_get_module_argsrQ   s$                                       r;   r   z)BatchPrefillWithRaggedKVCacheWrapper.plan
  s   N /{;;&L/==%K.{;;%K#!"!O^^a'
y>>Z!^++M   "&8&D.y)DDK%+*A.>&&((--b11!/ / /+ #e,,"e,,^B/00% (	'/+9(($"::: NNTf&(@O O   T333 OOUv"D$:P P   %%il%KKK%%il%KKK!-t'<== $ U   t'<== $ Y   DV%&?,>(?(?&?@%++Kl+SSS"+,,t{,"V"VD"+,,t{,"V"VD!-(:(=(=Kl )> ) )% )4Kl )7 ) )% $/ $0!#. #ABB'."*==
-'=$'=$!1'"&"2D}&& ;K#$56<))5 ! !  128q !#%O }	)) $BQB'9+;*==PQPRPR@SS $ '67J&K##&>M'$3' ' '# =I%%.#Y	< DO  %y1223B3)GHHMMOOD&224V222,*5(:N*!D$ }%%,2333,---A6d16DO "3&;#' /!%%r=   r   r   r   c                     || _         || _        || _        || _        || _        |	| _        |
| _        || _        |                     |||          S )r  r  rF  r   r   r   r  rD   rH   r   r   r   r   r   s               r;   r  z,BatchPrefillWithRaggedKVCacheWrapper.forward:  sX     "3&;#' /!%%xx1a   r=   )r   r   r   r   r   r   r   r   c                    d S rO   rP   	rF  r   r   r   r   r   r   r   rQ   s	            r;   r   z(BatchPrefillWithRaggedKVCacheWrapper.runS  s	     sr=   c                    d S rO   rP   r  s	            r;   r   z(BatchPrefillWithRaggedKVCacheWrapper.run`  s	     -0Cr=   )r  r  r  o_scaler   r   r   r   r  r  r  r  c                   |t          |j                  }t          ||| j        | j                   | j        }| j        }| j        }| j        }| j	        }|d}|*dt          j        |                    d                    z  }|d}|d}|
r|	Ot          j        |                    d          |                    d          ft          j        |j                  }	nJt!          |	|                    d          |                    d          ft          j        |j        d	           |]|j        j        dk    rt          j        n|j        }t          j        |j        dd         |j        dd         z   ||j                  }n:t!          ||j        dd         |j        dd         z   | j        |j        d
           | j        dk    rBt/          |||| j        | j        | j        | j        |||||| j        ||	          \  }}	|
r||	fn|S t;          |          r|| j        dk    rqt=          j        d           |                     t          j!                  }|                     t          j!                  }|                     t          j!                  }| j"        tF          j$        j%        }n*| j        rtF          j&        j%        }ntF          j'        j%        }| j(        | j)        | j        |||| j        | j        ||	|tT          | j+                 j%        ||g}| j,        #|-                    t]          |                     n}|| j"        | j/        ta          |j        d         | j                  | j1        | j2        | j3        ||||| j4        gz  }t;          |          r"|-                    t]          |                     | j5        
J d             | j5        j6        |  |
r||	fn|S )aO  Compute batch prefill/append attention between query and kv-cache stored as
        ragged tensor.

        Parameters
        ----------
        q : torch.Tensor
            The query tensor, shape: ``[qo_indptr[-1], num_qo_heads, head_dim_qk]``
        k : torch.Tensor
            The key tensor, shape: ``[kv_indptr[-1], num_kv_heads, head_dim_qk]``
        v : torch.Tensor
            The value tensor, shape: ``[kv_indptr[-1], num_kv_heads, head_dim_vo]``
        *args
            Additional arguments for the custom kernel.
        q_scale: Optional[float]
            The calibration scale of fp8 query, if not provided, will be set to ``1.0``.
        k_scale: Optional[float]
            The calibration scale of fp8 key, if not provided, will be set to ``1.0``.
        v_scale: Optional[float]
            The calibration scale of fp8 value, if not provided, will be set to ``1.0``.
        o_scale: Optional[float]
            The calibration scale of output, if not provided, will be set to ``1.0``.
        out : Optional[torch.Tensor]
            The output tensor, if not provided, will be allocated internally.
        lse : Optional[torch.Tensor]
            The log-sum-exp of attention logits, if not provided, will be allocated internally.
        return_lse : bool
            Whether to return the logsumexp of attention output
        enable_pdl : bool
            Whether to enable Programmatic Dependent Launch (PDL). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#programmatic-dependent-launch-and-synchronization
            Only supported for >= sm90, and currently only for FA2 and CUDA core decode.
        Returns
        -------
        Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            If :attr:`return_lse` is ``False``, the attention output, shape: ``[qo_indptr[-1], num_qo_heads, head_dim_vo]``.
            If :attr:`return_lse` is ``True``, a tuple of two tensors:

            * The attention output, shape: ``[qo_indptr[-1], num_qo_heads, head_dim_vo]``.
            * The logsumexp of attention output, shape: ``[qo_indptr[-1], num_qo_heads]``.
        Nr  r5   rq   r  r   r   r   r   r   r  )
	plan_infor  r   r  r  r  r  
max_qo_lenr   r   r   zOur current prefill kernel implementation needs f16 input, the f8 inputs  are casted to f16, which could result in performance degradation.r  )7r)   rG   r    rw  rx  r  r  r  r  r  r  r  r   r7   r   r   r#   r   itemsizebfloat16r   ry  r@  fmha_varlenr9  r  rA  r|  r  r+   loggingwarningrp  rS  r=  r   r  r^   r  r  r-  r4  r   r,  r+  r  rV   r>  r$   rk  rl  rn  rm  rB  r   )rF  r   r   r   r  r  r  r  r   r   r   r   rQ   r   r   r   r   r   r  r   r  s                        r;   r   z(BatchPrefillWithRaggedKVCacheWrapper.runm  s   n +AH55J#q$*D,E	
 	
 	
 '/>%
%
"!OTYqvvbzz222HJJ 	{kVVAYYq		*%-   )!&&))QVVAYY/%   ;*+'*:a*?*?QWI+qwrss|+x  CC %qwrss|+(   =I%%"##/|!+  HC" ",4C::4 Q<< 	$DMU22OU   U]##AU]##AU]##A , -II| 6$O1		$/5	 (&O)/
  'OODJJ''''%%+AGAJDD$,&, H || ,T

+++"..0R...&&11'0SzzS0r=   r  c                     || _         || _        || _        || _        || _        |	| _        |
| _        || _        |                     |||          S )r  r  r  s               r;   r  z7BatchPrefillWithRaggedKVCacheWrapper.forward_return_lse!  sZ     "3&;#' /!%%""1a+++r=   c                     dS r  rP   rJ  s    r;   r  z0BatchPrefillWithRaggedKVCacheWrapper.end_forward:  r  r=   )	r   FNNNNr   NN)NNNFr   Frq   NNNNrS  NNTNNr   NNF)Fr   Frq   NNNNr  )!r  r  r  r  r   r7   r8   r  r   r   r   r   r   rG  r  rK  rQ  r   r9   r
   r   r   r  r  r   r   r   r	   r`   r  r  r  r  rP   r=   r;   r  r  !	  s       X Xt  $04042626(,/3q# q# %q# q# 	q#
  -q#  -q# "%,/q# "%,/q# q# 49%q# T#s(^,q# 
q# q# q# ^q#f $t $ $ $ X$
&+l
	
 
 
 
0  &*.259!'&++/$(&*&*/8:>9=!159=&'37*.!&7i& i&<i& <i& 	i&
 i& i& c]i& el+i& %U\2i& i& i&  $i& i& "%i& 5/i&  UO!i&" UO#i&$ 3+,%i&& uS%+%567'i&( eC$456)i&* +i&, !.-i&. !) 6/i&0 !$1i&2 #5<03i&4 #3-5i&6 7i&8 
9i& i& i& ^i&V	 M !'&++/$(&*&*! !<! <! <	!
 ! !  $! ! "%! 5/! UO! UO! 
! ! ! !2  '+&*%*%)
 
 
<
 <
 <	
 el#
 el#
 EN
 TN
 

 
 
 X
  '+&*$(%)
0 
0 
0<
0 <
0 <	
0 el#
0 el#
0 DM
0 TN
0 
u|U\)	*
0 
0 
0 X
0  $(#'#'#'&*&* %)o1 o1 o1<o1 <o1 <	o1 %o1 %o1 %o1 %o1 el#o1 el#o1 o1 TNo1 
u|U5<#=>>	?o1 o1 o1 ^o1b -Y,STBBBN !'&++/$(&*&*, ,<, <, <	,
 , ,  $, , "%, 5/, UO, UO, 
u|U\)	*, , , ,2     r=   r  qo_segment_offsetskv_segment_offsetsr   c                    t           j                            |j                  j        }t          j        |dz   |j        t           j                  }t          j        d|j        t           j                  }t          j        d|j        t           j                  }t          j        d|j        t           j                  }	|                     ||||||	d|||
  
         ||||	fS )Nr   rG   r   i      )r7   cudaget_device_propertiesrG   multi_processor_countr   r1  r   )
r   r  r  r   r  num_ctaswork_indptrqo_tile_indiceshead_indicesbatch_indicess
             r;   r  r  ?  s    z//!   +1/6ek  K k)0  O ;)0  L K)0  M KK   		 r=   r  r  r   r   r  r  r  r  c                     d S rO   rP   r   r   r   r  r  r  r  r   r   r  r   r  r  r  r  r   s                   r;   r  r  i  s	    $ 3r=   c                     d S rO   rP   r  s                   r;   r  r  ~  s	    $ ),r=   c                    t          dd| j                  }t          | j        |j        |j        t          j        | j        d         |j        d         t          j        j	        dd| j        
  
        }| j        \  }}}|j        \  }}}|	rdnd}|
dt          j        |          z  }
|d}|d}|d}|d}|}|9t	          j        |dd          |d d         z
                                            }|t          |||||	          }|\  }}}}|h| j        j        dk    rt          j        n| j        }t	          j        |t          |d	          z   ||| j        |
          t          |d	          d          }|)|r't	          j        ||| j        t          j        
          }|                    || ||||||||||||
|||||           ||fS )Nfmha_varlen_cutlass_workspacer   r  Fr   r   r5   rq      r  )r%   rG   rL   r   r7   r1  r   r   r   r^   r  r  rq  rr  r  r  r  r   r   r   )r   r   r   r  r  r  r  r   r   r  r   r  r  r  r  r   ru   r   nnz_qor   rB   nnz_kvr   rC   mask_mode_codeqo_total_lenr  r  r  r  r  s                                  r;   r  r    sK   $ &')918  				
	
"	 F )*%FL+()%FL+ 'QQaN;///LY1!""58J3B38OOPPUUWW
$&(:L&
 
	 	 {&'g&6!&;&;ENN	k3z3///8
 
 
 j#


 
 " {z{k,qxu}
 
 
 JJ			'  , 8Or=   c                      t                      } |                                 }t          |                                            |S rO   )r   rJ   r   r   )r   r   s     r;   get_trtllm_gen_fmha_moduler    s=    
$
&
&C					Bs++--...Ir=   rr   r]   r^   ru   rw   rx   ry   rz   r{   
o_sf_scaler|   r}   r~   r   	is_causalattention_sinksc                 |   | j         d         dk    r"|j         d         dk    r|j         d         dk    s
J d            |t          | j                  }t                      j        }t          | j                  }|Dt          j        | j         d         | j         d         |j         d         | j        | j                  }|r?|=t          j        | j         d         | j         d         | j        t          j	                  }t          |t          j                  r!|j        t          j	        k    sJ |t          z  }t          |t          j                  r|j        t          j	        k    sJ |                                |                                z  } ||| |||||||||	|
|||||||||           |r||fS |S )	a  
    Parameters
    ----------
    query : torch.Tensor
        query tensor with shape [num_tokens, num_heads, head_dim]
    key : torch.Tensor
        key tensor with shape [num_tokens, num_heads, head_dim]
    value : torch.Tensor
        value tensor with shape [num_tokens, num_heads, head_dim]
    workspace_buffer : torch.Tensor
        workspace buffer
    seq_lens : torch.Tensor
        sequence lengths
    max_q_len : int
        max query length
    max_kv_len : int
        max key/value length
    bmm1_scale : Union[float, torch.Tensor]
        scale for bmm1, scale_q * scale_k * 1.0 / (head_dim_qk ** 0.5)
        when using trtllm-gen backend, it can be a torch.Tensor with dtype torch.float32.
    bmm2_scale : Union[float, torch.Tensor]
        scale for bmm2, scale_v
        when using trtllm-gen backend, it can be a torch.Tensor with dtype torch.float32.
    o_sf_scale : float
        scale for output
    batch_size : int
        batch size
    window_left : int
        window left
    cum_seq_lens_q : torch.Tensor
        cumulative sequence lengths for query
    cum_seq_lens_kv : torch.Tensor
        cumulative sequence lengths for key/value
    enable_pdl : bool
        enable pdl
    is_causal : bool
        is causal
    attention_sinks : Optional[torch.Tensor]
        attention sinks
    out : Optional[torch.Tensor]
        output tensor, if not provided, will be allocated with shape [query.shape[0], query.shape[1], value.shape[2]]
    lse : Optional[torch.Tensor]
        lse tensor, if not provided, will be allocated with shape [query.shape[0], query.shape[1]]

    Returns
    -------
    out: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        output torch.Tensor or Tuple[torch.Tensor, torch.Tensor].
        If return_lse is True, the output will be a tuple of two tensors, the first is the output tensor, the second is the lse tensor.
        If return_lse is False, the output will be a single tensor.
    r     r  :currently only support deepseek r1 192 query and 128 valueNr   r   r  )r   r)   rG   r  trtllm_ragged_attentionr*   r7   r   r   r   r6   r8   r   r.  r/  )rr   r]   r^   ru   rw   rx   ry   rz   r{   r  r|   r   r}   r~   r   r  r   r   r   r   r   r   r   s                          r;    trtllm_ragged_attention_deepseekr    s   T ;q>S  SYq\S%8%8U[^s=R=R=RD >S=RR '55
)++CH"5<00H
{kKNKNKN<+
 
 
  
ckkKNKN<-	
 
 
 *el++ (5=0000%'
*el++ 15=0000%++--0@0M0M0O0OONH+  .  Cx
r=   HNDkv_cacherv   r  o_sf_vec_sizer   c                 n	   |t          | j                  }t          |t                    r|\  }}nJ|j        d         dk    r||}}n4|j        d         dk    s
J d            |                    d          \  }}|dk    r,|                    dd          }|                    dd          }t                      j        }t          | j                  }|d	k    s|Jt          |t                    r4| j        t          j        k    s
J d
            |J |dv s
J d            |pd}| j        dd         t          | j        d         d          fz   }t          |t                    rX|j        j        d         t!          | j        d         | j        d         z  |z  d          f}|j        }|j        }|j        }|pd	}n|t!          | j        d         d          t!          | j        d         | j        d         z  |z  d          f}t          j        |t          j        | j                  }d}t          j        |t          j        | j                  }nt+          d|           |d	k    sJ t          |t          j                  sJ t/          ||t          j        | j        d           t/          ||t          j        | j        d           |dk     s||j        d         z   |j        d         k    r.t+          d| d|j        d          d|j        d                    nt          |t          j                  s||J |J d}d}|||j        n| j        }||nt          j        | |          }|| j        t          j        t          j        fvrt+          d|           t/          || j        || j        d           nt+          d|           t          |t          j                  r!|j        t          j        k    sJ |t8          z  }t          |t          j                  r|j        t          j        k    sJ |                                |                                z  } |||| ||||||||||pd|pd||	||
|||||           |d	k    r|nt          |||| j                  S )a  
    Parameters
    ----------
    query : torch.Tensor
        query tensor with shape [num_tokens, num_heads, head_dim]
    kv_cache : Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        If kv_cache is a single tensor, it should be a tensor with shape [num_pages, 1 or 2, num_kv_heads, page_size, head_dim] if :attr:`kv_layout` is "HND",
        or [num_pages, 1 or 2, page_size, num_kv_heads, head_dim] if :attr:`kv_layout` is "NHD".
        If kv_cache is a tuple of two tensors, it should be a tuple of two tensors with shape [num_pages, num_kv_heads, page_size, head_dim] if :attr:`kv_layout` is "HND",
        or [num_pages, page_size, num_kv_heads, head_dim] if :attr:`kv_layout` is "NHD".
        The first tensor is the key cache, the second tensor is the value cache.
    workspace_buffer : torch.Tensor. Must be initialized to 0 for its first use.
        workspace
    block_tables : torch.Tensor
        page_table of kv cache, [batch_size, num_pages]
    seq_lens : torch.Tensor
        A uint32 1D tensor indicating the kv sequence length of each prompt. shape: ``[batch_size]``
    max_q_len : int
        max sequence length for query
    max_kv_len : int
        max sequence length for kv_cache
    bmm1_scale : Union[float, torch.Tensor]
        fused scale for bmm1 input.
        when using trtllm-gen backend, it can be a torch.Tensor with dtype torch.float32.
    bmm2_scale : Union[float, torch.Tensor]
        fused scale for bmm2 input.
        when using trtllm-gen backend, it can be a torch.Tensor with dtype torch.float32.
    batch_size : int
        batch size
    cum_seq_lens_q : torch.Tensor
        cumulative sequence length for query. shape: ``[batch_size + 1]``
    cum_seq_lens_kv : torch.Tensor
        cumulative sequence length for kv_cache. shape: ``[batch_size + 1]``
    window_left : int = -1
        The left (inclusive) window size for the attention window, when set to ``-1``, the window
        size will be set to the full length of the sequence. Defaults to ``-1``.
    out : Optional[Union[torch.Tensor, FP4Tensor]] = None
        output tensor, if not provided, will be allocated with ``out_dtype``, if ``out_dtype`` is not provided, will use the type of ``query``.
    out_dtype : Optional[Union[torch.dtype, str]] = None
        output dtype, if not provided, will use the type of ``out``. For nvfp4, use string ``nvfp4``.
    o_sf_scale : Optional[float] = None
        scale for nvfp4 output tensor scale factor.
    o_sf_vec_size : Optional[int] = None
        vector size for nvfp4 output tensor scale factor.
    enable_pdl : Optional[bool] = None
        Whether to enable Programmatic Dependent Launch (PDL). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#programmatic-dependent-launch-and-synchronization
        Defaults to ``None``, which means it will be enabled if the device supports PDL.
    kv_layout : str = "HND"
        Layout of kv-cache, can be "HND" or "NHD", default is "HND".
    sinks : Optional[List[torch.Tensor]] = None
        additional value per head in the denominator of the softmax.

    Returns
    -------
    out: Union[torch.Tensor, FP4Tensor]
        output torch.Tensor or FP4Tensor.
    Nr   r  zEWhen kv_cache is a single tensor, the second dimension must be 1 or 2)r  r   r  r  nvfp4z*query must be fp8 when out_dtype is nvfp4.)N   z$only o_sf_vec_size = 16 is supportedr  rq   r      r  r   zInvalid out: r   out_scale_factorzQo_sf_start_index is out of the valid range of out_scale_factor. o_sf_start_index=z, out.shape[0]=z, out_scale_factor.shape[0]=)r   zUnsupported out_dtype: zInvalid out_dtype: g      )r)   rG   r6   rX   r   unbindr  r  r   r*   r   r   r7   float8_e4m3fnr2   r:   r3   scale_start_indexdatar   r3  rK   r8   r#   r   rS  r  r   r   r.  r/  )rr   r  ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r  r  r  r   r   r   rs   rt   r   r   fp4_out_shapefp4_out_scale_shaper  o_sf_start_indexr   s                                r;   "trtllm_batch_context_with_kv_cacher    sk   b '55
(E"" 6#>!!!'WGG>!$)))W *))
  (155GW E##B++##B++)++JH"5<00HG	 1ji6P6P 1{e11118 211 %%%
***,R***%+CRC(HU[_a,H,H+JJc9%% 	4	"Q%+a.8MI1MM#  #y"4(C!,WII[Q--Q%+a.8MI1MM#  %{#5+>u|       !+m5;u|TTTCC2S22333G#####u|,,,,, 	!mU[%,PUVVV L	
 	
 	
 q  #)A,.1A1G1JJJI$4I IEHYq\I I-=-CA-FI I   K 
Iu{	+	+ <y/@!!!$$$%(_		%+I_cc%*:5	*R*R*RU[%-HHHByBBCCC ek9elERRRR:y::;;;*el++ (5=0000%'
*el++ 15=0000%++--0@0M0M0O0OONHd/  6  	s,.>LLr=   	num_headshead_dimrf  scale_softmax
scale_bmm1
scale_bmm2c                    t          | j                  st          d          | j        d         dk    r"|j        d         dk    r|j        d         dk    s
J d            t	                      }| j        t          j        k    }|j        t          j        k    }||n|rdnd}||nd}|	|	nd}	|	                    | ||||||||||	||           |
r||fS |S )	a  
    Parameters
    ----------
    query : torch.Tensor
        query tensor with shape [batch_size, seq_len, num_heads, head_dim]
    key : torch.Tensor
        key tensor with shape [batch_size, seq_len, num_heads, head_dim]
    value : torch.Tensor
        value tensor with shape [batch_size, seq_len, num_heads, head_dim]
    out : torch.Tensor
        output tensor with shape [batch_size, seq_len, num_heads, head_dim]
    return_lse : bool
        whether to return the log-sum-exp of attention output
    num_heads : int
        number of heads
    head_dim : int
        head dimension
    seq_len : int
        sequence length
    scale_softmax : float
        scale for softmax
    scale_bmm1 : Optional[float]
        scale for bmm1
    scale_bmm2 : Optional[float]
        scale for bmm2
    lse : Optional[torch.Tensor]
        log-sum-exp of attention output
    Returns
    -------
    out: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        output torch.Tensor or Tuple[torch.Tensor, torch.Tensor].
        If return_lse is True, the output will be a tuple of two tensors, the first is the output tensor, the second is the lse tensor.
        If return_lse is False, the output will be a single tensor.
    z9fmha_v2_prefill_deepseek is only supported on SM120 GPUs.r  r  r  r  Nr5   r  )
r.   rG   rK   r   r   r   r7   r  r  r   )rr   r]   r^   r   r  r  rf  r  r  r  r   r   r   is_e4m3is_bf16_outputs                  r;   fmha_v2_prefill_deepseekr  d  s!   b u|,, VTUUU;q>S  SYq\S%8%8U[^s=R=R=RD >S=RR '((FkU00GY%.0N&2w8OC   *53J)53J
JJ    Cx
r=   )F)r   FFFF)NNNNNNFr   r   FNrq   NNNr   F)NNNNNNFr   r   FNrq   NNNr   T)NNNNFNNNNNF)NNNNFNNNNNTr   )rq   NNNNr  NN)NNFN)]r  r`   r  r  typesr   typingr   r   r   r   r   r	   r
   r   r7   api_loggingr   jitr   r   r   r   r   r   r   r   r   r&  r   pager   quantizationr   r   utilsr   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r<   ra   r   r   r   rG   rL   rc   r  ro   r   r   r   r   r  r^   r8   r   r9   r  partial'single_prefill_with_kv_cache_return_lser  r  r  r  r  r  r  r  r  r  rP   r=   r;   <module>r(     s0           ! ! ! ! ! ! M M M M M M M M M M M M M M M M M M M M  ' ' ' ' ' '
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5 4 4 4 4 4       4 4 4 4 4 4 4 4                                                     :" " "(  #(C C[CkC [C {	C
 C C C C C LC  C C C CD  @   $ %"'') ))	) [) k	)
 [) K) ) ) "#Y) #3i) "#Y) #3i) ) ) )  !)" #)$  %)& ') ) ) )X J J JZ o3 o3 o3d V V Vr Qc Qs Q Q Q Qh  (.) ) ))|) |) |	) ) ) ) ) 5<u|U\9::;) ) ) )D 

 '+&*&*%)*.15#"' $'+"&"&!&) || | el#	
 el# el# ek" %,' !.      uo   e_!" #$ %& '( )* \+   
0 

 '+&*&*%)*.15#"' $'+"&"& $), ,|,|, |, el#	,
 el#, el#, ek", %,', !., , , ,  , uo, ,  e_!," #,$ %,& ',( ),* 5<%&+, , , 
,0 
 '+&*&*%)*.15#"' $'+"&"&)k- k-|k-|k- |k- el#	k-
 el#k- el#k- ek"k- %,'k- !.k- k- k- k-  k- uok- k-  e_!k-" #k-$ %k-& 'k-( )k-* 5<u|U\9::;+k- k- k- k-\ +<)*; T+ + + '
|\ "L 	
 \   .w w w w w w w wt|(-
\   [ [ [ [ [ [ [ [|'' ' 	'
 ' ' ' 'T 
 /3 $"&"& $####!&! || | 	
  U\*+  
%,	 
%,	  uo e_ e_ e_ e_  !" \#   
( 
 /3 $"&"& $#### $!, ,|,|, |, 	,
 , U\*+, , 
%,	, 
%,	, , uo, e_, e_, e_, e_,  !," 5<%&#, , , 
,4 /3 $"&"& $####!g g|g|g |g 	g
 g U\*+g g 
%,	g 
%,	g g uog e_g e_g e_g e_g  !g" 5<u|U\9::;#g g g gT    & /3"&"&)C C<C	C <C l	C
 lC C C eU\)*C eU\)*C C C C LC \C C  !C" #C$ el+%C& 
%,	'C( 
%,	)C* 5<u|U\9::;+C C C CL  4837"&#'!%*.)T T<TEL%el(B"CCDT lT ,	T
 lT T T eU\)*T eU\)*T T LT \T T 
%i/0	1T ek3./0T  !T" C=#T$ %T& 'T( D&')T* 5<"#+T T T Tn  #'"&"&O O<O	O <O 
	O
 O O O O O O O 
%,	O 5<u|U\9::;O O O O O Or=   