
    )`i+k              9          U d dl mZ d dlmZ d dlZddlmZ ddlmZ 	 d dl	Z	dZ
n# e$ r dZ	d	Z
Y nw xY wdai Zeej        ej        f         ed
<   dej        fdZdej        j        fdZ G d de          Zddddddd	ddddddddddej        dej        dej        dedee         dee         deej                 dej        deej                 dee         dee         dee         deej                 deej                 d eej                 d!eej                 d"eej                 d#eej                 d$eej                 d%eej                 f(d&Ze
r e	j        e	j        j        g'           e	j        e(          ddddddd	ddddddddd)dej        dej        dej        dedee         dee         deej                 deej                 deej                 dee         dee         deej                 deej                 d eej                 d!eej                 d"eej                 d#eej                 d$eej                 d%eej                 f&d*                        Z ddddddddddddd+dej        dej        dej        ded,ej        d-ededej        dej        deej                 d.eded/eej                 d0eej                 d1eej                 deej                 deej                 d eej                 d!eej                 d"eej                 d#eej                 d$eej                 d%eej                 d2e!ej        ej        f         f0d3Z"edddddddddddd	ddd4dej        dej        dej        ded,ej        d-ededej        dej        deej                 d.eded/eej                 d0eej                 d1eej                 deej                 deej                 d eej                 d!eej                 d"eej                 d#eej                 d$eej                 d5ed6ee#         d%eej                 d2e!ej        eej                 f         f4d7            Z$dS )8    )Enum)OptionalN   )flashinfer_api   )get_cudnn_fmha_gen_moduleTF_dummy_scale_tensorsdevicec                     t                               |           }|Bt          j        dg| t          j                                      dddd          }|t           | <   |S )Ng      ?r
   dtyper   )r	   gettorchtensorfloat32reshape)r
   ts     l/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/cudnn/prefill.py_get_dummy_scale_tensorr      sZ      ((AyL#vU]CCCKKAqRSUVWW'(V$H    streamc                     t           t          j                    a t          j        t           | j                   t           S )N)_cudnn_handlecudnncreate_handle
set_streamcuda_stream)r   s    r   _create_cudnn_handler      s4     +--	]F$6777r   c                   n    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )UIDsr   r   r      d   e            2   3   4   5   6   i  i                          N)__name__
__module____qualname__RESERVED_INVALID_UIDQ_UIDK_UIDV_UIDACTUAL_SEQ_LENS_Q_UIDACTUAL_SEQ_LENS_KV_UIDBLOCK_TABLES_UIDBLOCK_TABLES_K_UIDBLOCK_TABLES_V_UIDRAGGED_Q_UIDRAGGED_O_UIDRAGGED_STATS_UIDRAGGED_K_UIDRAGGED_V_UIDO_UID	STATS_UIDQ_SCALE_UIDK_SCALE_UIDV_SCALE_UIDS_SCALE_UIDS_DESCALE_UIDO_SCALE_UID
S_AMAX_UID
O_AMAX_UID r   r   r    r    )   s        EEE LLLLEIKKKKMKJJJJr   r    )max_token_seq_qmax_sequence_kvactual_seq_lens_qblock_tables	page_sizebottom_right_causal_mask
return_lsebatch_offsets_qbatch_offsets_obatch_offsets_kbatch_offsets_vbatch_offsets_statsoutlseo_data_typeqk_cachev_cachescalerP   rQ   rR   actual_seq_lens_kvrS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   c                H   |j         d         }|                                 dk    r| j         d         | j         d         }}n2|                                 dk    r| j         d         | j         d         }}|                                dk    r|j         d         |j         d         }}n2|                                dk    r|j         d         |j         d         }}||j         d         }	||                                 | j        |                                |||||||d u||
|	f}|S )Nr   r!   r   r      )shapedimr   )r_   r`   ra   rb   rP   rQ   rR   rc   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   graph_bh_qod_qkh_kvd_vokeys                             r   _sdpa_prefill_key_fnrn   K   s   .  %a(Guuww!||WQZd	
AWQZd{{}}]1%w}Q'7d	!		]1%w}Q'7dM!$	 			D  C  Jr   )
heur_modes)key_fn)rP   rQ   rR   rc   rS   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   c                <   t          t          j                            | j                            }|j        d         }|}|}t          j                                        st          d          t          j        
                    | j                  }t          j        
                    |j                  }t          j        
                    |j                  }|| j        }t          j        
                    |          }|t          j        j        k    s|t          j        j        k    r:t          j                    dk     r#t          dt          j                               t          j        |          5 \  }}|                                 dk    r| j        d         | j        d         }}nJ|                                 dk    r| j        d         | j        d         }}nt%          d	| j                   |                    d
||||f||z  |||z  df|          }|t          j        j        k    s|t          j        j        k    r|                    dddt          j        j                  } |                    dddt          j        j                  }!|                    dddt          j        j                  }"|                    dddt          j        j                  }#|                    dddt          j        j                  }$|                    dddt          j        j                  }%|                     t,          j        j                   |!                    t,          j        j                   |"                    t,          j        j                   |#                    t,          j        j                   |$                    t,          j        j                   |%                    t,          j        j                   |N|                    |          }&|&                    t,          j        j                   |                     |&           |                                dk    r'|
J d            |j        d         |j        d         }(}'nJ|                                dk    r|j        d         |j        d         }(}'nt%          d|j                   |                                dk    r|                    d||'||f|'|z  |z  |||'z  df|          })|N|                    |          }*|*                    t,          j!        j                   |)                     |*           |                    d||'||(f|'|(z  |z  |(|(|'z  df|          }+|N|                    |          },|,                    t,          j"        j                   |+                     |,           nx|                                dk    r`|                    d|j        |#                                |          })|                    d|j        |#                                |          }+|                    t,          j$        j                   |)                    t,          j%        j                   |+                    t,          j&        j                   ||'                    |j        d         d|j        d         d          }-|                    |-          }.|.                    t,          j(        j                   |                    |-          }/|/                    t,          j)        j                   |N|                    |          }0|0*                    d           |0                    t,          j+        j                   |N|                    |          }1|1*                    d           |1                    t,          j,        j                   |d uo|d u}2|t          j        j-        k    s|t          j        j.        k    rK|/                    d||)|+||0nd ||1nd |2||
|	||.nd ||/nd ||nd t          j        j                  \  }3}4ni|t          j        j        k    s|t          j        j        k    r>|0                    ||)|+| |!|"|#|$|%d||	|2||0nd ||1nd ||.nd ||/nd ||nd           \  }3}4}5}6|5                    t,          j1        j                  2                    d          3                    d          4                    d          5                    t          j        j                   |6                    t,          j6        j                  2                    d          3                    d          4                    d          5                    t          j        j                   |N|                    |          }7|7                    t,          j7        j                   |3                     |7           |N|                    |          }8|8                    t,          j8        j                   |4                     |8           |3                    t,          j9        j                  2                    d          3                    ||||(g          4                    ||(z  |z  |(|(|z  dg          5                    |           |
r|4                    t,          j:        j                  2                    |
          5                    t          j        j                  3                    |||dg          4                    ||z  d|dg           ||)|+|3g}9|
r|9;                    |4           ||9;                    |0           ||9;                    |1           ||9fcd d d            S # 1 swxY w Y   d S )Nr   ztorch is not availablei5f zKFP8 is not supported in cuDNN backend version < 9.17.1, current version is r!   r   r   re   zInvalid query tensor shape: r_   )namerg   stride	data_typeq_scale)r   r   r   r   k_scalev_scales_scale	s_descaleo_scalez+block_tables needs 4 dimensions of kv cachezInvalid kv cache tensor shape: r`   ra   rR   rc   sdpa)rr   r_   kv	seq_len_q
seq_len_kvuse_padding_mask
attn_scalegenerate_statsuse_causal_mask_bottom_rightpaged_attention_k_tablepaged_attention_v_tablepaged_attention_max_seq_len_kvcompute_data_typeT)r_   r|   r}   	descale_q	descale_k	descale_vscale_s	descale_sscale_or   r   r   r   r~   r   r   r   r   F)<r   r   cudacurrent_streamr
   rf   r   	datatypesis_torch_availableRuntimeError_torch_to_cudnn_data_typer   rt   FP8_E4M3FP8_E5M2backend_versiongraphrg   
ValueErrorr   FLOATset_uidr    rG   valuerH   rI   rJ   rK   rL   tensor_liker@   set_ragged_offsetrC   rD   rs   r8   r9   r:   r   r>   r?   set_namer;   r<   BFLOAT16HALFr{   sdpa_fp8rM   
set_outputset_dim
set_strideset_data_typerN   rA   rB   rE   rF   append):r_   r`   ra   rb   rP   rQ   rR   rc   rS   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   handlerh   
graph_s_qo
graph_s_kvcudnn_q_data_typecudnn_k_data_typecudnn_v_data_typecudnn_o_data_typeg_ri   rj   cudnn_qcudnn_q_scalecudnn_k_scalecudnn_v_scalecudnn_s_scalecudnn_s_descalecudnn_o_scaleragged_qrk   rl   cudnn_k_cacheragged_kcudnn_v_cacheragged_vnd_block_tablescudnn_k_block_tablescudnn_v_block_tablescudnn_actual_seq_lens_qcudnn_actual_seq_lens_kvpadding_maskOStatsamax_samax_oragged_oragged_statstensors_to_returns:                                                             r   _build_prefill_graphr      s   0 &ej&?&?&I&IJJ#)!,$
$
1133 	97888!OEEagNN!OEEgmTT!OEEgmTT'K!OEEkRR !999 EO$<<<#%%--w^c^s^u^uww   [   L	(FQuuww!||WQZdAWQZd !I!I!IJJJhhdJ5tT4$;:+	   G "U_%===$(@@@ !"$'#o3	 !) ! ! !""$'#o3	 !) ! ! !""$'#o3	 !) ! ! !""$'#o3	 !) ! ! #$(($$'#o3	 #+ # # !""$'#o3	 !) ! ! %%d&6&<===%%d&6&<===%%d&6&<===%%d&6&<===''(:(@AAA%%d&6&<===*==99  !2!8999))(333{{}}!!#++A ,++ %]1-w}Q/?d!##M!$M!$ 
 !!R7=!R!RSSS{{}}!! !" $
D9 4K*4dD4KK/	 !) ! ! #. }}_==H$$T%6%<===!33H=== !" $
D9 4K*4dD4KK/	 !) ! ! #. }}_==H$$T%6%<===!33H===!## !"">>++/	 !) ! ! !""">>++/	 !) ! ! OODJ,---!!$*"2333!!$*"2333'"."6"6 &q)1l.@.CQ# # ()}}_'E'E$$,,T-D-JKKK'(}}_'E'E$$,,T-D-JKKK ,*+--8I*J*J''001DEEE'//0J0PQQQ!-+,==9K+L+L((112FGGG(001L1RSSS "-P2DD2P 
 "U_%===$(<<<66## -8 0/! .9 10!%1$#-1I0<0H,,d 1=0H,,d '3&>

D&+o&;9 "  55@ "U_%===$(@@@+,::##+++)-)#'$1I%1 -8 0/! .9 10! 1=0H,,d 1=0H,,d '3&>

D? ,6 !, !,(5&&F t455@@GGOO  *\**==9N+O+O+Ot455@@GGOO  *\**==9N+O+O+O*==99  !2!8999##H---". }}-@AA$$T%:%@AAA''555IIdj&''22488@@$
D1 jd"T)4a@ m-... >dn233>> - 566wwdJ28 8*j4/D!<===!(- J 0!((/// ,!(()@AAA!-!(()ABBB''YL	( L	( L	( L	( L	( L	( L	( L	( L	( L	( L	( L	( L	( L	( L	( L	( L	( L	(s   <jppp)rS   ru   rv   rw   rW   rX   rY   rZ   r[   r\   r]   r^   workspace_buffermax_token_per_sequencecausalru   rv   rw   returnc                   t          di d| d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|\  }}t          j        j        | t          j        j        |t          j        j        |t          j        j        |i}|||t          j        j        <   |||t          j        j        <   |||t          j	        j        <   |||t          j
        j        <   |||t          j        j        <   |||t          j        j        <   |	(|	|t          j        j        <   |	|t          j        j        <   |r*||t          j        j        <   |||t          j        j        <   |dt#          | j                  }||t          j        j        <   ||t          j        j        <   ||t          j        j        <   ||t          j        j        <   |||t          j        j        <   |||t          j        j        <   t3          t4          j                            | j                            }|                    |||           |r||fS |d fS )Nr_   r`   ra   rb   rP   rQ   rR   rc   rS   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   )	workspacer   rO   )r   r    r8   r   r9   r:   rE   r;   r<   r@   rA   rC   rD   r>   r?   rF   rB   r   r
   rG   rJ   rK   rL   rH   rI   r   r   r   r   execute)r_   r`   ra   rb   r   r   rQ   rR   rc   rS   r   rV   ru   rv   rw   rW   rX   rY   rZ   r[   r\   r]   r^   r   tensorsvar_mapdummy_scale_tensorr   s                               r   _batch_prefill_with_kv_cacher     s   4 *   
!  e	
 /. ( ,+ .- "\ "( : ( ( ( (  0/!" C#$ C%&  K'NE7. 	
!
'
'
#	G $4E*01%5G+12"+:!'("+:!'("+:!'("+:!'(1='-.1='-. G(+$%*3FGD)/04QX>>*1 &'*< &',>"()*< &'*1 &'*1 &'!%*";";AH"E"EFFF	MM'%5fMEEE CxDyr   )rS   ru   rv   rw   rW   rX   rY   rZ   r[   r\   r]   is_cuda_graph_compatiblebackendr^   r   r   c                n   | j         d         }|j         d         }|                                 dk    r| j         d         | j         d         }}n2|                                 dk    r| j         d         | j         d         }}|                                dk    r|j         d         }n%|                                dk    r|j         d         }|r*|(t          j        |||| j        t          j                  }||j         |||fk    rt          d          || j        }|!|||f}t          j        || j        |          }t          rU|d	k    rOt          d'i d
| d|d|d|d|d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d |S |s
J d!            |d"k    r|	|d#k    r|	
J d$            ||}|
                    | j        d%&          }|
                    | j        d%&          } t                      j        }! |!|||| |||||||| |	|
|||dddd|           ||fS )(a3  Performs batched prefill attention with paged KV cache using cuDNN.

    Args:
        q: Query tensor of shape (Total number of tokens, num_heads_qo, head_dim)
        k_cache: Key cache tensor of shape   (total_num_pages, num_heads_kv, page_size, head_dim) if paged kv cache is enabled else (Total sequence length of kv, num_heads_kv, d_qk)
        v_cache: Value cache tensor of shape (total_num_pages, num_heads_kv, page_size, head_dim) if paged kv cache is enabled else (Total sequence length of kv, num_heads_kv, d_vo)
        scale: Scaling factor for attention scores, typically 1/sqrt(head_dim)
        workspace_buffer: Workspace buffer for cuDNN operations. Scales with batch size. 128 MB should be sufficient for most cases
        max_token_per_sequence: Maximum number of tokens per query sequence (s_qo_max)
        max_sequence_kv: Maximum number of tokens per key/value sequence (s_kv_max)
        actual_seq_lens_q:  Actual number of tokens per query sequence shape (batch_size,) on cpu or device (cpu if cuda_graph is False)
        actual_seq_lens_kv: Actual sequence lengths for key/values per batch, shape (batch_size,) on CPU or device (cpu if cuda_graph is False)
        block_tables: Page table mapping for KV cache, shape (batch_size, num_pages_per_seq) on GPU
        causal: Whether to apply causal masking
        return_lse: Whether to return log-sum-exp values (must be True)
        out: Optional pre-allocated output tensor
        lse: Optional pre-allocated tensor for log-sum-exp values if return_lse is True else returns None
        is_cuda_graph_compatible: Whether the prefill operation is compatible with CUDA graph
        q_scale: Optional scale tensor for query tensor of shape (1, 1, 1, 1) on GPU
        k_scale: Optional scale tensor for key tensor of shape (1, 1, 1, 1) on GPU
        v_scale: Optional scale tensor for value tensor of shape (1, 1, 1, 1) on GPU
        batch_offsets_q: Optional batch offsets for query tensor of shape (batch_size,) on GPU
        batch_offsets_o: Optional batch offsets for output tensor of shape (batch_size,) on GPU
        batch_offsets_k: Optional batch offsets for key tensor of shape (batch_size,) on GPU
        batch_offsets_v: Optional batch offsets for value tensor of shape (batch_size,) on GPU
        o_data_type: Optional data type for output tensor
    Returns:
        Output tensor of shape (batch_size * seq_len_q, num_heads_qo, head_dim)
        If return_lse is True, also returns log-sum-exp tensor of shape (batch_size, seq_len_q, num_heads_qo)

    Note:
        Query and KV heads can have different sizes (num_heads_qo >= num_heads_kv)
        When using cuda graph, actual_seq_lens_q and actual_seq_lens_kv must be on the same device as q
        Head dimension of query and key must be 128 or 192
        Head dimension of value and output must be 128
    r   r!   r   r   re   Nr   zAlse must have shape (num_sequences, max_token_per_sequence, h_qo)cubinr_   r`   ra   rb   r   r   rQ   rR   rc   rS   r   rV   ru   rv   rw   rW   rX   rY   rZ   r[   r\   r]   r^   z)Currently only supports return_lse = True      ziCurrently only supports if d_qk = 192 and block_tables is None or d_qk = 128 and block_tables is not NoneT)non_blockingrO   )rf   rg   r   emptyr
   r   r   r   CUDNN_AVAILABLEr   tor   prefill)"r_   r`   ra   rb   r   r   rQ   rR   rc   rS   r   rV   ru   rv   rw   rW   rX   rY   rZ   r[   r\   r]   r   r   r^   
num_tokensnum_sequencesri   rj   rl   	out_shapeactual_seq_lens_q_gpuactual_seq_lens_kv_gpurun_funcs"                                     r   !cudnn_batch_prefill_with_kv_cacher   *  s   D J%+A.Muuww!||WQZd	
AWQZd{{}}}Q	!		}Q ;+&xm  C 398NPT(UUUO
 
 	
 g
{t,	k)AHKHHH B
7g--+ 
 
 
a
G
 G
 %	

 .-
 $:#9
 ,O
 0/
  21
 &
 6
 "z
 G
 G
 G
  ,O!
" ,O#
$ ,O%
& ,O'
( !4 3)
* +
, -
. $/
 	
4 FFFFFz 4CKKL44w 54
 "4O 1 4 4QXD 4 Q Q!3!6!6qxd!6!S!S,..6"!"$-	
 	
 	
2 8Or   )%enumr   typingr   r   api_loggingr   utilsr   r   r   	Exceptionr   r	   dictr
   Tensor__annotations__r   r   Streamr   r    floatintboolr   rn   jit	heur_modeAgraph_cacher   tupler   strr   rO   r   r   <module>r      s                  ( ( ( ( ( ( , , , , , ,LLLOO   EOOO
 9; d5<56 ; ; ;EL    !2        4   P &*%)04+/#/3!&.2.2.2.226"&"&)-+6 6 6|6\6 \6 	6 c]6 c]6  -6 6 5<(6 }6 'tn6 6 el+6 el+6  el+!6" el+#6$ "%,/%6& 
%,	'6( 
%,	)6* %+&+6 6 6 6r  @(UY5?,-...U2333 *.)-4859/337%*262626266:&*&*-1)|( |( |(<|(|( |( 	|( "#|( "#|( $EL1|( %U\2|( u|,|( #+4.|( TN|( "%,/|( "%,/|( "%,/|(  "%,/!|(" &el3#|($ el#%|(& el#'|(( ek*)|( |( |( 43 /.|(T
 ,0 '+&*&*.2.2.2.226"&"&)-1` ` `|`\` \` 	`
 l`  ` ` |` ` 5<(` ` ` el#` el#`  el#!`" el+#`$ el+%`& el+'`( el+)`* "%,/+`, 
%,	-`. 
%,	/`0 %+&1`2 5<%&3` ` ` `F  ,0 '+&*&*.2.2.2.226"&"&%*!)-5i i i|i\i \i 	i
 li  i i |i i 5<(i i i el#i el#i  el#!i" el+#i$ el+%i& el+'i( el+)i* "%,/+i, 
%,	-i. 
%,	/i0 #1i2 c]3i4 %+&5i6 5<%,//07i i i i i is   & 	22