
    )`i                     6   d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlZddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z# ej$        d             Z%ej$        d             Z& G d d          Z' G d d          Z(dS )a3  
Copyright (c) 2023 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)SimpleNamespace)AnyListOptionalTupleUnion   )flashinfer_api)gen_pod_modulegen_batch_pod_module)get_seq_lens)get_batch_prefill_module)packbits)MaskModePosEncodingModeTensorLayout_check_cached_qkv_data_type_check_kv_layout_check_pos_encoding_mode_get_cache_alibi_slopes_buf_get_cache_buf_get_range_buf_unpack_paged_kv_cachecanonicalize_torch_dtypedevice_support_pdlc                  b    t          |                                  }t          |j                  S N)
run_tensor)r   build_and_loadr   pod_with_kv_cache_tensorargsmodules     b/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/pod.pyget_pod_moduler%   -   s,    T"1133Ff&EFFFF    c                  b    t          |                                  }t          |j                  S r   )r   r   r   batch_pod_with_kv_cache_tensorr!   s     r$   get_batch_pod_moduler)   3   s,    !4(7799Ff&KLLLLr&   c            D       l   e Zd ZdZe	 	 	 	 	 	 dFdej        dedede	ej                 d	e	ej                 d
e	ej                 de	e
e                  ddfd            Zedefd            Zdej        dej        ddfdZe	 	 	 	 	 	 	 	 	 dGdej        dej        dej        dedededededede	eeej        f                  de	eeej        f                  d e	eeej        f                  d!e	e         d"e	e         d#e	e         d$eddf"d%            ZeZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dHd&ej        d'ej        d(ej        d)ej        d*eej        eej        ej        f         f         d+e	ej                 d,e	ej                 d-ed.ed/ed0e	e         d1ed2e	e         d3e	e         d4ed5e	ej                 d6e	ej                 d7ed8ed9ed:e	e         d;ed<e	e         d=e	e         d>e	e         d?e	e         d@e	e         dAedBedCe	e         deej        eej        ej        f         f         f>dD            ZdIdEZdS )JPODWithPagedKVCacheWrapperaM	  Wrapper class for POD-Attention with paged kv-cache (first proposed in
    `<https://arxiv.org/abs/2410.18038>`_) for batch of requests.

    Check :ref:`our tutorial<kv-layout>` for page table layout.

    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> num_layers = 32
    >>> num_qo_heads = 64
    >>> num_kv_heads = 8
    >>> head_dim = 128
    >>> max_num_pages = 128
    >>> page_size = 16
    >>> # allocate 128MB workspace buffer
    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
    >>> decode_wrapper = flashinfer.PODWithPagedKVCacheWrapper(
    ...     workspace_buffer, "NHD"
    ... )
    >>> batch_size = 7
    >>> kv_page_indices = torch.arange(max_num_pages).int().to("cuda:0")
    >>> kv_page_indptr = torch.tensor(
    ...     [0, 17, 29, 44, 48, 66, 100, 128], dtype=torch.int32, device="cuda:0"
    ... )
    >>> # 1 <= kv_last_page_len <= page_size
    >>> kv_last_page_len = torch.tensor(
    ...     [1, 7, 14, 4, 3, 1, 16], dtype=torch.int32, device="cuda:0"
    ... )
    >>> kv_cache_at_layer = [
    ...     torch.randn(
    ...         max_num_pages, 2, page_size, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0"
    ...     ) for _ in range(num_layers)
    ... ]
    >>> # create auxiliary data structures for batch decode attention
    >>> decode_wrapper.plan(
    ...     kv_page_indptr,
    ...     kv_page_indices,
    ...     kv_last_page_len,
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ...     page_size,
    ...     pos_encoding_mode="NONE",
    ...     data_type=torch.float16
    ... )
    >>> outputs = []
    >>> for i in range(num_layers):
    ...     q = torch.randn(batch_size, num_qo_heads, head_dim).half().to("cuda:0")
    ...     kv_cache = kv_cache_at_layer[i]
    ...     # compute batch decode attention, reuse auxiliary data structures for all layers
    ...     # TODO_AK: DEMONSTRATE USAGE OF POD
    ...     outputs.append(o)
    ...
    >>> outputs[0].shape
    torch.Size([7, 64, 128])

    Note
    ----
    To accelerate computation, FlashInfer's POD-Attention creates some
    auxiliary data structures, these data structures can be reused across multiple
    batch decode attention calls (e.g. different Transformer layers). This wrapper class
    manages the lifecycle of these data structures.
    NHDFNfloat_workspace_buffer	kv_layoutuse_cuda_graphpaged_kv_indptr_bufferpaged_kv_indices_bufferpaged_kv_last_page_len_bufferjit_argsreturnc                 &   t          |           	 d}d| _        || _        || _        |j        | _        t          j        dt
          j        | j                  | _        t          j        dt
          j        dd          | _	        |rt          j
        |          st          d          t          j
        |          st          d          t          j
        |          st          d	          t          |          | _        t          |          | j        d
z   k    rt          d          nd| _        || _        || _        || _        || _        || _        |r5t          j        | j        d
z   t
          j        |j                  | _        dS dS )a  Constructor of :class:`PODWithPagedKVCacheWrapper`.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The user reserved float workspace buffer used to store intermediate attention results
            in the split-k algorithm. The recommended size is 128MB, the device of the workspace
            buffer should be the same as the device of the input tensors.

        kv_layout : str
            The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.

        use_cuda_graph : bool
            Whether to enable CUDAGraph for batch decode attention, if enabled, the
            auxiliary data structures will be stored as the provided buffers. The ``batch_size``
            cannot change during the lifecycle of this wrapper when CUDAGraph is enabled.

        indptr_buffer : Optional[torch.Tensor]
            The user reserved buffer on GPU to store the indptr of the paged kv cache, the size
            of the buffer should be ``[batch_size + 1]``.
            Only needed when ``use_cuda_graph`` is ``True``.

        indices_buffer : Optional[torch.Tensor]
            The user reserved buffer on GPU to store the page indices of the paged kv cache,
            should be large enough to store the maximum number of page indices
            (``max_num_pages``) during the lifecycle of this wrapper.
            Only needed when ``use_cuda_graph`` is ``True``.

        last_page_len_buffer : Optional[torch.Tensor]
            The user reserved buffer on GPU to store the number of entries in the last page, the
            size of the buffer should be ``[batch_size]``.
            Only needed when ``use_cuda_graph`` is ``True``.

        jit_args : Optional[List[Any]]
            If provided, the wrapper will use the provided arguments to create the JIT module,
            otherwise, the wrapper will use default attention implementation.
        TNi   dtypedevicecpur8   
pin_memoryr9   zApaged_kv_indptr_buffer should be a torch.Tensor in cudagraph modezBpaged_kv_indices_buffer should be a torch.Tensor in cudagraph modezHpaged_kv_last_page_len_buffer should be a torch.Tensor in cudagraph moder	   z;The size of paged_kv_indptr_buffer should be batch_size + 1r   )r   _jit_module
_kv_layout_float_workspace_bufferr9   torchemptyuint8_int_workspace_buffer _pin_memory_int_workspace_buffer	is_tensor
ValueErrorlen_fixed_batch_size_paged_kv_indptr_buf_paged_kv_indices_buf_paged_kv_last_page_len_buf_use_tensor_cores_use_cuda_grapharangeint32_qo_indptr_buf)	selfr-   r.   r/   r0   r1   r2   r3   use_tensor_coress	            r$   __init__z#PODWithPagedKVCacheWrapper.__init__{   s   ` 	###	  ,0#'=$,3%*[ek$+&
 &
 &
" 16+	1
 1
 1
-  	'?#9::  W   ?#:;;  X   ?#@AA  ^   &))F%G%GD")**d.Dq.HHH Q   I
 &'D"$:!%<"+H(!1- 	"',&*k-4# # #D	 	r&   c                     | j         S NrM   rQ   s    r$   is_cuda_graph_enabledz0PODWithPagedKVCacheWrapper.is_cuda_graph_enabled       ##r&   int_workspace_bufferc                     || _         || _        t          j        | j        j        | j        j        dd          | _        dS )a  Reset the workspace buffer.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The new float workspace buffer, the device of the new float workspace buffer should
            be the same as the device of the input tensors.

        int_workspace_buffer : torch.Tensor
            The new int workspace buffer, the device of the new int workspace buffer should
            be the same as the device of the input tensors.
        r:   T)r8   r9   r<   N)r?   rC   r@   rA   shaper8   rD   )rQ   r-   rZ   s      r$   reset_workspace_bufferz1PODWithPagedKVCacheWrapper.reset_workspace_buffer   sK     (>$%9"05&,,2	1
 1
 1
---r&   NONEfloat16Tindptrindiceslast_page_lennum_qo_headsnum_kv_headshead_dim	page_sizepos_encoding_modewindow_leftq_data_typekv_data_type	data_typesm_scale
rope_scale
rope_thetanon_blockingc                    t          |          }d}t          |dz   d          }| j        r|| j        k    r(t	          d                    || j                            t          |          t          | j                  k    rt	          d          | j                            ||           | j	                            ||           | j        dt          |                                       ||j
        | j
        k    o|           n|                    | j
        |          | _        |                    | j
        |          | _        |                    | j
        |          | _	        |                    | j
        |          | _        |                    d          }|                    d          }||
|}
||}t          |
          }
||
}t          |          }|
| _        || _        t!          |||          }| j        | j        | _        n;t'          d|
||
|j        ||t*          |         j        |	d	k    |d
k    d          | _        | j                            | j        | j        | j        ||||||||| j        ||d|	d	dd
          | _        |j        | _        || _        |	| _        || _        || _         || _!        || _"        dS )ao	  Plan POD's batch decode for given problem specification.

        Parameters
        ----------
        indptr : torch.Tensor
            The indptr of the paged kv cache, shape: ``[batch_size + 1]``
        indices : torch.Tensor
            The page indices of the paged kv cache, shape: ``[qo_indptr[-1]]``
        last_page_len : torch.Tensor
            The number of entries in the last page of each request in the paged kv
            cache, shape: ``[batch_size]``
        num_qo_heads : int
            The number of query/output heads
        num_kv_heads : int
            The number of key/value heads
        head_dim : int
            The dimension of the heads
        page_size : int
            The page size of the paged kv cache
        pos_encoding_mode : str
            The position encoding applied inside attention kernels, could be
            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
            Defaults to ``NONE``.
        window_left : int
            The left (inclusive) window size for the attention window, when set to ``-1``, the window
            size will be set to the full length of the sequence. Defaults to ``-1``.
        q_data_type : Optional[Union[str, torch.dtype]]
            The data type of the query tensor, defaults torch.float16.
        kv_data_type : Optional[Union[str, torch.dtype]]
            The data type of the key/value tensor. If None, will be set to
            ``q_data_type``. Defaults to ``None``.
        data_type: Optional[Union[str, torch.dtype]]
            The data type of both the query and key/value tensors. Defaults to torch.float16.
            data_type is deprecated, please use q_data_type and kv_data_type instead.
        non_blocking : bool
            Whether to copy the input tensors to the device asynchronously, defaults to ``True``.


        Note
        ----
        The :meth:`plan` method should be called before any :meth:`run` or
        :meth:`run_return_lse` calls, auxiliary data structures will be created
        during this call and cached for multiple run calls.

        The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads``
        is not equal to ``num_kv_heads``, the function will use
        `grouped query attention <https://arxiv.org/abs/2305.13245>`_.

        The :meth:`plan` method cannot be used in Cuda Graph or in ``torch.compile``.
                r	   r:   zThe batch size should be fixed in cudagraph mode, the runtime batch size {}  mismatches the batch size set during initialization {}zHThe size of indices should be less than or equal to the allocated bufferrp   Nfa2r_   r   F)#rG   r   rX   rH   rF   formatrJ   rI   copy_rK   r9   torP   r   _cached_q_data_type_cached_kv_data_typer   r=   _cached_moduler   r8   r   valueplanr?   rC   rD   
_plan_info_indptr_type_pos_encoding_mode_window_left_logits_soft_cap	_sm_scale_rope_scale_rope_theta)rQ   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   
batch_sizelogits_soft_capqo_indptr_hostindptr_hostlast_page_len_hostkv_lens_arr_hosts                          r$   r|   zPODWithPagedKVCacheWrapper.plan  sT   N ''
'
Q>>% 	T333 NNTf"D$:O O   7||c$"<==== ^   %++F+NNN,22L 3    &~W~6<<w~'D&V, =     )/		, )2 ) )D% *1, *4 * *D& 0=/?/?, 0@ 0 0D, #1"3"3, #4 # #D ii&&*--e44 "'#(.{;;&L/==#. $0!'5GSS'"&"2D": 128r!!## #D -22(&1&'
 
, #L"3' /!%%r&   q_pk_pv_pq_dpaged_kv_cache_dcustom_mask_ppacked_custom_mask_pcausal_pkv_layout_ppos_encoding_mode_p
sm_scale_pwindow_left_prope_scale_prope_theta_preturn_lse_pcustom_mask_dpacked_custom_mask_dcausal_dkv_layout_dpos_encoding_mode_d
sm_scale_dwindow_left_drope_scale_drope_theta_dq_scalek_scalev_scalereturn_lse_duse_fp16_qk_reduction
enable_pdlc                    |t          |j                  }d} d}!t          |
           t          |	           t	          dd|j                  }"| d} |*dt          j        |                    d                    z  }|d}|d}|8|6t          |	                                
                    d          d	          }|t          j        j        }#n%|rt          j        j        }#nt          j        j        }#d}$|rNt!          j        |                    d
          |                    d          ft           j        |j                  }$t!          j        |          }%t)          || j                  \  }&}'t-          ||&| j        | j                   | j        }| j        }| j        }!| j        }| j        }| j        }t          |           |!d}!|$|j        d         }(dt          j        |(          z  }|||z  }|||z  }|d}|d}d})|rNt!          j        |                    d
          |                    d          ft           j        |j                  })t!          j        |          }*tA          |j!        |j!        |j!        |j        d         tD          |
         j        |d
k    | d
k    || j#        tD          |         j        |dk    |!d
k              }+ |+j$        g ||||"|%|$|#tJ          |	         j        ||tM          |j        d         |j                  | |d|z  d|z  | j'        | j(        | j)        ||&|'| j*        | j+        | j,        | j-        |*|)t          j        j        tJ          | j                 j        |ddtM          |j        d         |j                  |!|d|z  d|z  |R   ||*|z  }*|%|*fS ).Compute POD-attention for a batch of requests.Npod_with_kv_cache_tmpi   rr         ?r_        @littlebitorderr   r	   r7   ).r   r9   r   r   r   mathsqrtsizer   
contiguousviewr   CUSTOMr{   CAUSAL
NON_CAUSALr@   rA   float32
empty_liker   r>   r   rx   ry   r   r   r   r   r   r   r\   r%   r8   r   r~   r   r   r   r?   rC   r}   rP   rI   rJ   rK   ),rQ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   logits_soft_cap_plogits_soft_cap_dtmp_pmask_mode_plse_pout_p	k_cache_d	v_cache_drf   lse_dout_dmodule_getters,                                               r$   runzPODWithPagedKVCacheWrapper.run  s?   N +CJ77J !  !4555%%%68H#*UU$ #ty"666JLL$)=)E#+((**//33h$ $ $   +"//KK 8&o3&17 	K!chhqkk*%-
  E  %%  66FXX	9#D4d6O	
 	
 	
 #5) 1^
'' !4555$ #y}Hty222J'!J'!JLL 	K!chhqkk*%-
  E  %%&IIIIbM/06Q!!
  /06R!'
 
* 	!  )	
)	
 )	
 	)	

 )	
 )	
 )	
 )	
 %+)	
 )	
 !)	
 (	!cjAA)	
 )	
 )	
 ,)	
  ,!)	
$ (%)	
& &')	
( O))	
* +)	
, -)	
. /)	
0 1)	
2 %3)	
4 &5)	
6 ,7)	
8 9)	
: ;)	
< %=)	
> )/?)	
@ A)	
B C)	
D E)	
F (	!cjAAG)	
H I)	
J K)	
L ,M)	
N ,O)	
P Q)	
 )	
 )	
 )	
V WEu~r&   c                     dS z7Warning: this function is deprecated and has no effect.N rW   s    r$   end_forwardz&PODWithPagedKVCacheWrapper.end_forwardm      r&   )r,   FNNNN	r^   r_   r`   NNNNNT)NNFr,   r^   Nr_   NNFNNFr,   r^   Nr_   NNNNNFFNr4   N)__name__
__module____qualname____doc__r
   r@   Tensorstrboolr   r   r   rS   propertyrX   r]   intr   r8   floatr|   begin_forwardr   r   r   r   r&   r$   r+   r+   9   s       ? ?B  $9=:>@D(,n n %n n 	n
 !) 6n "*%,!7n (0'=n 49%n 
n n n ^n` $t $ $ $ X$
&+l
JO,
	
 
 
 
0  "(9B:>7;$(&*&*!#i& i&i& i& |	i&
 i& i& i& i& i& i& eC$456i& uS%+%567i& E#u{"234i& 5/i& UOi&  UO!i&" #i&$ 
%i& i& i& ^i&V M 157; #)&*(,(,"047; #)&*(,(,#'#'#'"&+%)Et t \t \	t
 \t \t  eEL%,4N.O OPt  -t 'u|4t t t !t UOt t  uo!t" uo#t$ %t(  -)t* 'u|4+t, -t. /t0 !1t2 UO3t4 5t6 uo7t8 uo9t: %;t< %=t> %?t@ AtB  $CtD TNEtH 
u|U5<#=>>	?It t t ^tl     r&   r+   c            .          e Zd ZdZe	 d3dej        deddfd            Ze	de
fd            Ze	 	 	 	 	 	 	 	 	 d4dej        dej        dej        dej        dej        dej        dej        dej        dededededededeeeej        f                  deeeej        f                  deeeej        f                  dee         dee         d ee         d!e
ddf,d"            ZeZe	 	 	 	 	 	 	 	 	 d5d$ej        d%eej        eej        ej        f         f         d&ej        d'eej        eej        ej        f         f         d(eej                 d)eej                 d*e
d+ee         d,ee         d-ee         d.e
d/e
d0ee
         deeej        ej        f         eeej        ej        f         eej        ej        f         f         f         fd1            Zd6d2ZdS )7BatchPODWithPagedKVCacheWrappera  Wrapper class for POD-Attention with paged kv-cache (first proposed in
    `<https://arxiv.org/abs/2410.18038>`_) for batch of requests.

    Check :ref:`our tutorial<kv-layout>` for page table layout.

    Examples
    --------
    >>> import torch
    >>> import flashinfer
    >>> num_layers = 8
    >>> num_qo_heads = 64
    >>> num_kv_heads = 8
    >>> head_dim = 128
    >>> max_num_pages = 128
    >>> device = 0
    >>> page_block_size = 1
    >>> causal = True
    >>> # allocate 128MB workspace buffer
    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
    >>> wrapper = flashinfer.BatchPODWithPagedKVCacheWrapper(
    ...     workspace_buffer, "NHD"
    ... )
    >>> # Prefill and decode parameters
    >>> p_qo_lens = [2048] * 2
    >>> d_qo_lens = [1] * 128
    >>> p_kv_lens = [2048] * 2
    >>> d_kv_lens = [2048] * 128
    >>> # Prefill plan inputs
    >>> p_seq_lens_blocks = torch.ceil(
    ...     torch.tensor(p_kv_lens, dtype=torch.int32) / page_block_size
    ... ).int()
    >>> p_q_indptr = torch.cat(
    ...     [torch.tensor([0]), torch.cumsum(torch.tensor(p_qo_lens), 0)], dim=0
    ... ).int()
    >>> p_kv_indptr = torch.cat(
    ...     [torch.tensor([0]), torch.cumsum(p_seq_lens_blocks, 0)], dim=0
    ... ).int()
    >>> kv_indices_p = torch.arange(0, p_kv_indptr[-1], device=device, dtype=torch.int32)
    >>> last_page_len_p = (p_seq_lens_blocks - 1) % page_block_size + 1
    >>> # Decode plan inputs
    >>> d_seq_lens_blocks = torch.ceil(
    ...     torch.tensor(d_kv_lens, dtype=torch.int32) / page_block_size
    ... ).int()
    >>> d_q_indptr = torch.cat(
    ...     [torch.tensor([0]), torch.cumsum(torch.tensor(d_qo_lens), 0)], dim=0
    ... ).int()
    >>> d_kv_indptr = torch.cat(
    ...     [torch.tensor([0]), torch.cumsum(d_seq_lens_blocks, 0)], dim=0
    ... ).int()
    >>> kv_indices_d = torch.arange(0, d_kv_indptr[-1], device=device, dtype=torch.int32)
    >>> last_page_len_d = (d_seq_lens_blocks - 1) % page_block_size + 1
    >>> # create auxiliary data structures for batch decode attention
    >>> wrapper.plan(
    ...     # Prefill params
    ...     p_q_indptr.to(device),
    ...     p_kv_indptr.to(device),
    ...     kv_indices_p.to(device),
    ...     last_page_len_p,
    ...     # Decode params
    ...     d_q_indptr.to(device),
    ...     d_kv_indptr.to(device),
    ...     kv_indices_d.to(device),
    ...     last_page_len_d,
    ...     # Common params
    ...     num_qo_heads=num_qo_heads,
    ...     num_kv_heads=num_kv_heads,
    ...     head_dim=head_dim,
    ...     page_size=page_block_size,
    ...     q_data_type=torch.bfloat16,
    ...     kv_data_type=torch.bfloat16,
    ... )
    >>> # Prefill input tensors
    >>> q_p = torch.rand(p_q_indptr[-1].item(), num_qo_heads, head_dim).to(
    ...     device, dtype=torch.bfloat16
    ... )
    >>> kv_p = torch.randn(p_kv_indptr[-1], 2, page_block_size, num_kv_heads, head_dim).to(
    ...     device, dtype=torch.bfloat16
    ... ).unbind(1)
    >>> # Decode input tensors
    >>> q_d = torch.rand(d_q_indptr[-1].item(), num_qo_heads, head_dim).to(
    ...     device, dtype=torch.bfloat16
    ... )
    >>> kv_d = torch.randn(d_kv_indptr[-1], 2, page_block_size, num_kv_heads, head_dim).to(
    ...     device, dtype=torch.bfloat16
    ... ).unbind(1)
    >>> for i in range(num_layers):
    ...     o_p_batch, o_d_batch = wrapper.run(
    ...         q_p,
    ...         kv_p,
    ...         q_d,
    ...         kv_d,
    ...         causal_p=causal,
    ...     )
    >>> print(o_p_batch.shape, o_d_batch.shape)
    torch.Size([4096, 64, 128]) torch.Size([128, 64, 128])

    Note
    ----
    To accelerate computation, FlashInfer's POD-Attention creates some
    auxiliary data structures, these data structures can be reused across multiple
    batch decode attention calls (e.g. different Transformer layers). This wrapper class
    manages the lifecycle of these data structures.
    r,   r-   r.   r4   Nc                    t          |           d}d| _        || _        t          j        |dd          \  }}|| _        || _        |j        | _        t          j        dt          j	        | j                  | _
        t          j        dt          j	        | j                  | _        t          j        dt          j	        dd	          | _        t          j        dt          j	        dd	          | _        t          j                            | j                  }t          j        |j        dz   t          j        | j                  | _        d| _        d| _        d| _        d| _        || _        d
| _        dS )a  Constructor of :class:`BatchPODWithPagedKVCacheWrapper`.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The user reserved float workspace buffer used to store intermediate attention results
            in the split-k algorithm. The recommended size is 128MB, the device of the workspace
            buffer should be the same as the device of the input tensors.

        kv_layout : str
            The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.

        TN   r   )dimr6   r7   r:   r;   F)r   r=   r>   r@   chunk_float_workspace_buffer_p_float_workspace_buffer_dr9   rA   rB   _int_workspace_buffer_p_int_workspace_buffer_d"_pin_memory_int_workspace_buffer_p"_pin_memory_int_workspace_buffer_dcudaget_device_propertiesmulti_processor_countr   _sm_aware_schedrH   rI   rJ   rK   rL   rM   )rQ   r-   r.   rR   float_workspace_buffer_pfloat_workspace_buffer_ddev_props          r$   rS   z(BatchPODWithPagedKVCacheWrapper.__init__  s   & 	###,0#=B["A1>
 >
 >
: ": *B&)A&.5',{ek$+(
 (
 (
$ (-{ek$+(
 (
 (
$ 38++	3
 3
 3
/ 38++	3
 3
 3
/ :33DK@@${+a/	$+ 
  
  
 "#$(!%)"+/(!1$r&   c                     | j         S rU   rV   rW   s    r$   rX   z5BatchPODWithPagedKVCacheWrapper.is_cuda_graph_enabled  rY   r&   r^   r_   r`   Tqo_indptr_pkv_indptr_pkv_indices_plast_page_len_pqo_indptr_dkv_indptr_dkv_indices_dlast_page_len_drd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   c                    d}t          |          }|                    d          }t          |d                   }|                    | j        |          | _        |                    | j        |          | _        |                    | j        |          | _        |                    | j        |          | _        |                    d          }|                    d          }t          |||          }|||}||}t          |          }||}t          |          }|| _
        || _        | j        | j        | _        n;t          d||||j        ||t           |         j        |dk    |dk    d          | _        t          |          }|                    d          }t          |d                   }|                    | j        |          | _        |                    | j        |          | _        |                    | j        |          | _        |                    | j        |          | _        |                    d          } |                    d          }!t          | |!|          }"| j                            | j        | j        | j        || |"|||	|
|| j        ||d|ddd          | _        | j        d         }#|d	k    rd}#| j                            | j        | j        | j        ||||||	|
|| j        ||d|dd|#          | _        |j        | _         || _!        || _"        || _#        || _$        || _%        || _&        dS )
a  Plan POD's batch prefill and decode for given problem specification.

        Parameters
        ----------
        qo_indptr_p : torch.Tensor
            The prefill indptr of the query/output tensor, shape: ``[batch_size + 1]``.
        kv_indptr_p : torch.Tensor
            The prefill indptr of the paged kv-cache, shape: ``[batch_size + 1]``.
        kv_indices_p : torch.Tensor
            The prefill page indices of the paged kv-cache, shape: ``[kv_indptr[-1]]``.
        last_page_len_p : torch.Tensor
            The number of entries in the last page of each prefill request in the paged
            kv-cache, shape: ``[batch_size]``.
        qo_indptr_d : torch.Tensor
            The decode indptr of the query/output tensor, shape: ``[batch_size + 1]``.
        kv_indptr_d : torch.Tensor
            The decode indptr of the paged kv-cache, shape: ``[batch_size + 1]``.
        kv_indices_d : torch.Tensor
            The decode page indices of the paged kv-cache, shape: ``[kv_indptr[-1]]``.
        last_page_len_d : torch.Tensor
            The number of entries in the last page of each decode request in the paged
            kv-cache, shape: ``[batch_size]``.
        num_qo_heads : int
            The number of query/output heads
        num_kv_heads : int
            The number of key/value heads
        head_dim : int
            The dimension of the heads
        page_size : int
            The page size of the paged kv cache
        pos_encoding_mode : str
            The position encoding applied inside attention kernels, could be
            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
            Defaults to ``NONE``.
        window_left : int
            The left (inclusive) window size for the attention window, when set to ``-1``, the window
            size will be set to the full length of the sequence. Defaults to ``-1``.
        q_data_type : Optional[Union[str, torch.dtype]]
            The data type of the query tensor, defaults torch.float16.
        kv_data_type : Optional[Union[str, torch.dtype]]
            The data type of the key/value tensor. If None, will be set to
            ``q_data_type``. Defaults to ``None``.
        data_type: Optional[Union[str, torch.dtype]]
            The data type of both the query and key/value tensors. Defaults to torch.float16.
            data_type is deprecated, please use q_data_type and kv_data_type instead.
        sm_scale : Optional[float]
            The scale used in softmax, if not provided, will be set to
            ``1.0 / sqrt(head_dim_qk)``.
        rope_scale : Optional[float]
            The scale used in RoPE interpolation, if not provided, will be set to
            ``1.0``.
        rope_theta : Optional[float]
            The theta used in RoPE, if not provided, will be set to ``1e4``.
        non_blocking : bool
            Whether to copy the input tensors to the device asynchronously, defaults to ``True``.

        Note
        ----
        The :meth:`plan` method should be called before any :meth:`run` or
        :meth:`run_return_lse` calls, auxiliary data structures will be created
        during this call and cached for multiple run calls.

        The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads``
        is not equal to ``num_kv_heads``, the function will use
        `grouped query attention <https://arxiv.org/abs/2305.13245>`_.

        The :meth:`plan` method cannot be used in Cuda Graph or in ``torch.compile``.
        rr   r:   r_   rs   Nrt   r   Fi   )'rG   rw   r   r9   _kv_indptr_buf_p_kv_indices_buf_p_kv_last_page_len_buf_p_qo_indptr_buf_pr   r   rx   ry   r=   rz   r   r8   r   r{   _kv_indptr_buf_d_kv_indices_buf_d_kv_last_page_len_buf_d_qo_indptr_buf_dr|   r   r   r   rX   _plan_info_dr   r   r   _plan_info_pr~   r   r   r   r   r   r   )$rQ   r   r   r   r   r   r   r   r   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   r   batch_size_pqo_indptr_host_ptotal_num_rows_pkv_indptr_host_plast_page_len_host_pkv_lens_arr_host_pbatch_size_dqo_indptr_host_dtotal_num_rows_dkv_indptr_host_dlast_page_len_host_dkv_lens_arr_host_dnum_colocated_ctass$                                       r$   r|   z$BatchPODWithPagedKVCacheWrapper.plan  s   |  ?++&>>%00/344 +t{ V V!-<!X!X'6'9'9Kl (: (
 (
$ !1 3 3Kl !4 !
 !
 '>>%00.11%88)2I
 
  "'#(.{;;&L/==#. $0!'"&"2D":! 128r!!## #D ?++&>>%00/344 +t{ V V!-<!X!X'6'9'9Kl (: (
 (
$ !1 3 3Kl !4 !
 !
 '>>%00.11%88)2I
 
 !/44*(3&'
 
, ".q1d""!" /44*(3&'
 
* (-"3' /!%%r&   Fr   paged_kv_cache_pr   r   r   r   r   r   r   r   
return_lser   r   c                    |t          |j                  }d}d}t          || j                  \  }}t	          ||| j        | j                   | j        }| j        }| j	        }| j
        }| j        }| j        }t          |           |d}|$|j        d         }dt          j        |          z  }|d}|d}|8|6t#          |                                                    d          d          }|t(          j        j        }n%|rt(          j        j        }nt(          j        j        }d}|rNt3          j        |                    d          |                    d	          ft2          j        |j        
          }t3          j        |          }t          || j                  \  }}t	          ||| j        | j                   | j        }| j        }| j	        }| j
        }| j        } | j        }!t          |           |d}|$|j        d         }dt          j        |          z  }|||z  }|	||	z  }| d} |!d}!d}"|rNt3          j        |                    d          |                    d	          ft2          j        |j        
          }"t3          j        |          }#t=          |j        |j        |j        |j        d         t@          |         j        |dk    |dk    || j!        t@          |         j        |dk    |dk              }$ |$j"        g | j#        | j$        | j%        |||| j&        | j'        | j(        | j)        |||tT          | j                 j        ||dtW          |j        d	         |j                  ||d|z  d|z  | j,        | j-        | j.        |||| j/        | j0        | j1        | j2        |#|"t(          j        j        tT          | j                 j        |ddtW          |j        d	         |j                  ||d| z  d|!z  || j3        R   |
|#|
z  }#|r||#f||"ffn||#fS )r   Nrr   r_   r   r   r   r   r   r	   r7   )4r   r9   r   r>   r   rx   ry   r   r   r   r   r   r   r   r\   r   r   r   r   r   r   r   r{   r   r   r@   rA   r   r   r   r)   r8   r   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )%rQ   r   r  r   r   r   r   r   r   r   r   r  r   r   r   r   	k_cache_p	v_cache_pr   r   r   r   r   rf   r   r   r   r   r   r   r   r   r   r   r   r   r   s%                                        r$   r   z#BatchPODWithPagedKVCacheWrapper.run  s   2 +CJ77J ! 56FXX	9#D4d6O	
 	
 	
 #5) 1^
'' !4555$ #y}Hty222JLL$)=)E#+((**//33h$ $ $   +"//KK 8&o3&17 	K!chhqkk*%-
  E  %%  66FXX	9#D4d6O	
 	
 	
 #5) 1^
'' !4555$ #y}Hty222J'!J'!JLL 	K!chhqkk*%-
  E  %%,IOIIbM/06Q!!/06R!
 
  	!  1	
*1	
 (1	
 	1	

 1	
 1	
 1	
 !1	
 !1	
 "1	
 (1	
 1	
 1	
 1	
 )/1	
  !1	
" !#1	
$ %1	
& (	!cjAA'1	
( )1	
* +1	
, ,-1	
. ,/1	
2 *31	
4 (51	
6 71	
8 91	
: ;1	
< =1	
> !?1	
@ !A1	
B "C1	
D (E1	
F G1	
H I1	
J %K1	
L )/M1	
N O1	
P Q1	
R S1	
T (	!cjAAU1	
V W1	
X Y1	
Z ,[1	
\ ,]1	
^ _1	
`  a1	
 1	
 1	
 1	
f WE3=Q//E5>Qr&   c                     dS r   r   rW   s    r$   r   z+BatchPODWithPagedKVCacheWrapper.end_forward  r   r&   )r,   r   )	NNFNNNFFNr   )r   r   r   r   r
   r@   r   r   rS   r   r   rX   r   r   r   r8   r   r|   r   r   r   r   r   r&   r$   r   r   r  s       f fP  =% =% %=% =% 
	=% =% =% ^=%~ $t $ $ $ X$  "(9B:>7;$(&*&*!-V& V&\V& \V& l	V&
 V& \V& \V& lV& V& V& V& V& V& V& V&  eC$456!V&" uS%+%567#V&$ E#u{"234%V&& 5/'V&( UO)V&* UO+V&, -V&. 
/V& V& V& ^V&p M 157;#'#'#' &+%)%rR rR \rR  eEL%,4N.O OP	rR
 \rR  eEL%,4N.O OPrR  -rR 'u|4rR rR %rR %rR %rR  !rR"  $#rR$ TN%rR& 
elEL()eEL%,./u|U\7Q1RRS	U
'rR rR rR ^rRh     r&   r   ))r   	functoolsr   typesr   typingr   r   r   r   r   r@   api_loggingr
   jitr   r   pager   prefillr   quantizationr   utilsr   r   r   r   r   r   r   r   r   r   r   r   cacher%   r)   r+   r   r   r&   r$   <module>r     s          ! ! ! ! ! ! 4 4 4 4 4 4 4 4 4 4 4 4 4 4  ' ' ' ' ' ' 5 5 5 5 5 5 5 5       - - - - - - " " " " " "                             G G G
 M M M
v v v v v v v vr         r&   