
    )`i0              !       L   d dl mZ d dlmZ d dlZddlmZ ddlmZ 	 d dl	Z	dZ
n# e$ r dZ	d	Z
Y nw xY wdad
ej        j        fdZ G d de          Zddddddddej        dej        dej        dededee         deej                 deej                 deej                 deej                 deej                 fdZe
r e	j        e	j        j        g           e	j        e          ddddddddej        dej        dej        dededee         deej                 deej                 deej                 deej                 deej                 fd                        Zddddddddddej        dej        dej        dedej        dedeej                 deej                 deej                 dee         deej                 deej                 d eej                 d!eej                 d"ej        d#ej        f d$Zeddd	dddddd%dej        dej        dej        dedej        dedeej                 deej                 d&edeej                 deej                 d eej                 d!eej                 d"eej                 d#ej        fd'            ZdS )(    )Enum)OptionalN   )flashinfer_api   )get_cudnn_fmha_gen_moduleTFstreamc                     t           t          j                    a t          j        t           | j                   t           S )N)_cudnn_handlecudnncreate_handle
set_streamcuda_stream)r	   s    k/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/cudnn/decode.py_create_cudnn_handler      s2    +--	]F$6777    c                   F    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdS )UIDsr   r   r      d   e            2   3   4   i  i  N)__name__
__module____qualname__RESERVED_INVALID_UIDQ_UIDK_UIDV_UIDACTUAL_SEQ_LENS_Q_UIDACTUAL_SEQ_LENS_KV_UIDBLOCK_TABLES_UIDBLOCK_TABLES_K_UIDBLOCK_TABLES_V_UIDRAGGED_Q_UIDRAGGED_O_UIDRAGGED_STATS_UIDO_UID	STATS_UID r   r   r   r      s[        EEE LLEIIIr   r   )
block_sizeactual_seq_lens_qactual_seq_lens_kvblock_tablesbatch_offsets_qbatch_offsets_oqk_cachev_cachescalemax_sequence_kvr0   r1   r2   r3   r4   r5   c                V    d|t          | j                  t          |j                  fS )Ndecode)tupleshape)r6   r7   r8   r9   r:   r0   r1   r2   r3   r4   r5   s              r   _sdpa_decode_key_fnr?   4   s,     	aggm	 r   )
heur_modes)key_fnc                	   t          t          j                                                  }d }	d }
t	          j        |          5 \  }}|                                 dk    r*d}| j        d         | j        d         | j        d         }}}nt|                                 dk    r8| j        d         | j        d         | j        d         | j        d         f\  }}}}n$t          d|                                            |dk    s
J d            |                                dk    s
J d            |j        d         }|	                    d	||||f||z  |||z  dft          j
        j        
          }|	N|                    |	          }|                    t          j        j                   |                    |           |                    |          }|                    |          }|                    t          j        j                   |                    t          j        j                   |                    t          j        j                   ||                    |j        d         d|j        d         d          }|                    |          }|                    t          j        j                   |                    |          }|                    t          j        j                   |9|                    |          }|                    t          j        j                   |N|                    |          }|                    t          j        j                   |                    d           |d u}|                    d|||||nd ||nd |d||||t          j
        j                  \  }}|
N|                    |
          }|                    t          j        j                   |                    |           |                    t          j        j                                      d                               ||||g          !                    ||z  |||z  dg          "                    t          j
        j                   d d d            n# 1 swxY w Y   ||||g}||#                    |           ||#                    |           ||fS )Nr   r   r   r      z#q must have 3 or 4 dimensions, got z"q must have a sequence length of 1zk_cache must have 4 dimensionsr6   )namedimstride	data_typeFsdpaT)rD   r6   kv	seq_len_q
seq_len_kvuse_padding_maskis_inference
attn_scalepaged_attention_k_tablepaged_attention_v_tablepaged_attention_max_seq_len_kvcompute_data_type)$r   torchcudacurrent_streamr   graphrE   r>   
ValueErrortensorrG   BFLOAT16tensor_likeset_uidr   r*   valueset_ragged_offsetr"   r#   r$   reshaper(   r)   r%   r&   set_is_pass_by_valuerH   FLOATr+   r-   
set_outputset_dim
set_strideset_data_typeappend) r6   r7   r8   r9   r:   r0   r1   r2   r3   r4   r5   handleg_s_qobh_qod_qkd_vocudnn_qragged_qcudnn_k_cachecudnn_v_cachend_block_tablescudnn_k_block_tablescudnn_v_block_tablescudnn_actual_seq_lens_qcudnn_actual_seq_lens_kvpadding_maskOragged_otensors_to_returns                                    r   _build_decode_graphr|   L   s     &ej&?&?&A&ABB [   W	FQuuww!|| !
AGAJ
4AGAJGAJGAJGAJ	'#4tt !!Pquuww!P!PQQQ1999B999;;==A%%%'G%%%=#DhhdD)tT4$;:/2	   G *==99  !2!8999))(333MM'22MMM'22MOODJ,---!!$*"2333!!$*"2333'"."6"6 &q)1l.@.CQ# # ()}}_'E'E$$,,T-D-JKKK'(}}_'E'E$$,,T-D-JKKK ,*+--8I*J*J''//0J0PQQQ!-+,==9K+L+L((001L1RSSS(==eDDD-T9L66/@/L++RV 1C0N,,TX!-! (<(</>"'/"7#   DAq( *==99  !2!8999##H---IIdj&''22488@@D$% j$+tTD[!<==mm(? ? ?kW	 W	 W	 W	 W	 W	 W	 W	 W	 W	 W	 W	 W	 W	 W	r %m]AF($$%<===)$$%=>>>###s   QR//R36R3)r1   r2   r3   r0   r4   r5   batch_offsets_kbatch_offsets_vworkspace_bufferr}   r~   outreturnc       
         f   t          | ||||||||	|
|
nd |
|
nd           \  }}t          t          j                                                  }t
          j        j        | t
          j        j        |t
          j	        j        |t
          j
        j        |i}|||t
          j        j        <   |||t
          j        j        <   |
|
|t
          j        j        <   |||t
          j        j        <   |(||t
          j        j        <   ||t
          j        j        <   |                    |||           |S )N)r6   r7   r8   r9   r:   r1   r2   r3   r0   r4   r5   )	workspacerg   )r|   r   rT   rU   rV   r   r"   r]   r#   r$   r-   r%   r&   r*   r+   r(   r)   execute)r6   r7   r8   r9   r   r:   r1   r2   r3   r0   r4   r5   r}   r~   r   rW   tensorshandle_var_maps                      r   _batch_decode_with_kv_cacher      s;   $ )
'+-!+:+FD+:+FD  NE7 #5:#<#<#>#>??G 	
!
'
'
#	G $4E*01%5G+12"+:!'("+:!'(1='-.1='-.	MM'%5gMFFFJr   )r2   r3   is_cuda_graph_compatibler4   r5   r}   r~   r   r   c       	            | j         d         }| j         d         }|j         d         }|#t          j        |||| j        | j                  }t
          sG|                    | j        d          }t                      j        } ||| |||||||||	|
|           nSt          j	        |dddf| j        t          j
                  }|j         d         }t          | |||||||||	|
||	           |S )
a)  Performs batched decode attention with paged KV cache using cuDNN.

    Args:
        q: Query tensor of shape (batch_size, num_heads_qo, head_dim), seq_len_q is the maximum sequence length of queries in the batch
        k_cache: Key cache tensor of shape   (total_num_pages, num_heads_kv, page_size, head_dim)
        v_cache: Value cache tensor of shape (total_num_pages, num_heads_kv, page_size, head_dim)
        scale: Scaling factor for attention scores, typically 1/sqrt(head_dim)
        workspace_buffer: Workspace buffer for cuDNN operations. Scales with batch size. 128 MB should be sufficient for most cases
        max_sequence_kv: Maximum number of tokens per key/value sequence (s_kv_max)
        actual_seq_lens_kv: Actual sequence lengths for key/values per batch, shape (batch_size,) on CPU
        block_tables: Page table mapping for KV cache, shape (batch_size, num_pages_per_seq) on GPU
        is_cuda_graph_compatible: Whether the decode operation is compatible with CUDA graph
        batch_offsets: Optional batch offsets tensor of shape (batch_size,) on GPU
        out: Optional pre-allocated output tensor
        batch_offsets_q: Optional batch offsets for query tensor of shape (batch_size,) on GPU
        batch_offsets_o: Optional batch offsets for output tensor of shape (batch_size,) on GPU
        batch_offsets_k: Optional batch offsets for key tensor of shape (batch_size,) on GPU
        batch_offsets_v: Optional batch offsets for value tensor of shape (batch_size,) on GPU

    Returns:
        Output tensor of shape (batch_size, num_heads_qo, head_dim)

    Note:
        Currently only supports causal attention (causal must be True)
        All tensors must be contiguous and on the same CUDA device
        Query and KV heads can have different sizes (num_heads_qo >= num_heads_kv)
    r   r   r   N)devicedtypeT)non_blockingr   )r6   r7   r8   r9   r   r:   r1   r2   r3   r4   r5   r0   r   )r>   rT   emptyr   r   CUDNN_AVAILABLEtor   r<   onesint32r   )r6   r7   r8   r9   r   r:   r2   r3   r   r4   r5   r}   r~   r   bsrl   rn   actual_seq_lens_kv_gpurun_funcr1   r0   s                        r    cudnn_batch_decode_with_kv_cacher      s:   \ 
B71:D=D
{k"dDIII '
!3!6!6qxd!6!S!S,..5"$	
 	
 	
 	
  "JAqM!(%+
 
 
 ]1%
#-+/1%++!	
 	
 	
 	
  Jr   )enumr   typingr   rT   api_loggingr   utilsr   r   r   ImportErrorr   rU   Streamr   r   Tensorfloatintr?   jit	heur_modeAgraph_cacher|   r   boolr   r/   r   r   <module>r      s                ( ( ( ( ( ( , , , , , ,LLLOO   EOOO
 !2        4   : !"0415+/.2.2  |\ \ 	    - !. 5<( el+ el+   ,  x$UY5?,-...U1222 %&4859/32626t$ t$ t$<t$t$ t$ 	t$ t$ SMt$ $EL1t$ %U\2t$ u|,t$ "%,/t$ "%,/t$ t$ t$ 32 /.t$~ 1515+/ !.2.2.2.28 8 8|8\8 \8 	8
 l8 8  -8 !.8 5<(8 8 el+8 el+8 el+8 el+8  
!8" \#8 8 8 8v  26+/%*.2.2.2.2"&] ] ]|]\] \] 	]
 l] ] !.] 5<(] #] el+] el+] el+] el+] 
%,	]  \!] ] ] ] ] ]s   % 	11