
    )`i                        d Z ddlZddlmZmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZ dd	lmZmZ ej        d
             Ze	 edd          dej        dej        dej        dej        deej        ej        f         f
d                        Z ed          dej        dej        dej        dej        deej        ej        f         f
d            Ze	 edd          	 d'dej        dej        dej        dej        deej                 ddfd                        Z ed          	 d'dej        dej        dej        dej        deej                 ddfd            Ze	 edd          dej        dej        deej        ej        f         fd                        Z ed          dej        dej        deej        ej        f         fd             Z G d! d"          Z G d# d$          Z G d% d&          ZdS )(a3  
Copyright (c) 2023 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)ListOptionalTupleUnion   )flashinfer_api)"BatchDecodeWithPagedKVCacheWrapper)gen_cascade_module)#BatchPrefillWithPagedKVCacheWrappersingle_prefill_with_kv_cache)register_custom_opregister_fake_opc                  B    t                                                      S N)r
   build_and_load     f/home/jaya/work/projects/VOICE-AGENT/VIET/agent-env/lib/python3.11/site-packages/flashinfer/cascade.pyget_cascade_moduler      s    ..000r   zflashinfer::merge_stater   )mutates_argsv_as_av_bs_breturnc                 "   |                     t          j                  }|                     t          j                  }t          j        |           }t          j        |          }t	                                          | |||||           ||fS )a  Merge the attention output ``V`` and the logsumexp value ``S`` from the two
    KV-segments.
    Check :ref:`our tutorial <recursive-attention>` on the mathematical details.

    Parameters
    ----------
    v_a : torch.Tensor
        The attention output from the KV segment ``A``, shape:
        ``[seq_len, num_heads, head_dim]``.
    s_a : torch.Tensor
        The logsumexp value from the KV segment ``A``. expected to be a float32 tensor,
        shape: ``[seq_len, num_heads]``.
    v_b : torch.Tensor
        The attention output from the KV segment ``B``,
        shape: ``[seq_len, num_heads, head_dim]``.
    s_b : torch.Tensor
        The logsumexp value from the KV segment ``B``, expected to be a float32 tensor,
        shape: ``[seq_len, num_heads]``

    Returns
    -------
    V : torch.Tensor
        The merged attention output (equivalent to attention with merged KV-segment
        ``[A: B]``), shape: ``[seq_len, num_heads, head_dim]``.
    S : torch.Tensor
        The logsumexp value from the merged KV-segment ``[A: B]``, shape:
        ``[seq_len, num_heads]``.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> seq_len = 2048
    >>> num_heads = 32
    >>> head_dim = 128
    >>> va = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
    >>> sa = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
    >>> vb = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
    >>> sb = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
    >>> v_merged, s_merged = flashinfer.merge_state(va, sa, vb, sb)
    >>> v_merged.shape
    torch.Size([2048, 32, 128])
    >>> s_merged.shape
    torch.Size([2048, 32])
    )totorchfloat32
empty_liker   merge_state)r   r   r   r   v_mergeds_mergeds         r   r!   r!   "   sz    d &&

C
&&

C$$H$$H$$S#sC8LLLXr   c                 Z    t          j        |           }t          j        |          }||fS r   )r   r    )r   r   r   r   vss         r   _fake_merge_stater'   \   s-     	AAa4Kr   z flashinfer::merge_state_in_place)r%   r&   r%   r&   v_others_othermaskc                     |                     t          j                  }|                     t          j                  }t                                          | ||||           dS )a  Merge the self-attention state ``(v, s)`` with another state
    ``(v_other, s_other)`` in-place.

    Parameters
    ----------
    v : torch.Tensor
        The partial attention output to be updated in-place, shape:
        ``(seq_len, num_heads, head_dim)``.
    s : torch.Tensor
        The partial logsumexpr value to be updated in-place, expected to be a float32
        tensor, shape: ``(seq_len, num_heads)``.
    v_other : torch.Tensor
        The other attention output to be merged, shape:
        ``(seq_len, num_heads, head_dim)``.
    s_other : torch.Tensor
        The other logsumexp value to be merged, expected to be a float32 tensor,
        shape: ``(seq_len, num_heads)``.
    mask : Optional[torch.Tensor]
        The boolean mask tensor for whether to merge the state for a corresponding sequence
        or not. Useful for CUDA graphs. If not specified (default), will merge states for
        all sequences.
        shape: ``[seq_len]``

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> seq_len = 2048
    >>> num_heads = 32
    >>> head_dim = 128
    >>> v = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
    >>> s = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
    >>> v_other = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
    >>> s_other = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
    >>> flashinfer.merge_state_in_place(v, s, v_other, s_other)
    N)r   r   r   r   merge_state_in_placer%   r&   r(   r)   r*   s        r   r,   r,   e   sS    Z 	
U]Ajj''G--aGWdKKKKKr   c                     d S r   r   r-   s        r   _fake_merge_state_in_placer/      s	     	Dr   zflashinfer::merge_statesc                 P   | j         }|                    t          j                  }|                                 \  }}}}t          j        |||| j        |          }t          j        ||t          j        |          }t                                          | |||           ||fS )af  Merge multiple attention states (v, s).

    Parameters
    ----------
    v : torch.Tensor
        The attention output from the KV segments, shape:
        ``[seq_len, num_states, num_heads, head_dim]``.
    s : torch.Tensor
        The logsumexp value from the KV segments, shape:
        ``[seq_len, num_states, num_heads]``, expected
        to be a float32 tensor.

    Returns
    -------
    V : torch.Tensor
        The merged attention output, shape: ``[seq_len, num_heads, head_dim]``.
    S : torch.Tensor
        The logsumexp value from the merged KV-segments, shape:
        ``[seq_len, num_heads]``.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> seq_len = 2048
    >>> num_heads = 32
    >>> head_dim = 128
    >>> num_states = 100
    >>> v = torch.randn(seq_len, num_states, num_heads, head_dim).half().to("cuda:0")
    >>> s = torch.randn(seq_len, num_states, num_heads, dtype=torch.float32).to("cuda:0")
    >>> v_merged, s_merged = flashinfer.merge_states(v, s)
    >>> v_merged.shape
    torch.Size([2048, 32, 128])
    >>> s_merged.shape
    torch.Size([2048, 32])
    )dtypedevice)	r2   r   r   r   sizeemptyr1   r   merge_states)	r%   r&   r2   seq_len_	num_headshead_dimr"   r#   s	            r   r5   r5      s    N XF	U]A&'ffhh#GQ	8{7IxqwvVVVH{7IU]6RRRH%%aHh???Xr   c                     |                                  \  }}}}t          j        |||| j                  }t          j        ||t          j                  }||fS )N)r1   )r3   r   r4   r1   r   )r%   r&   r6   r7   r8   r9   r"   r#   s           r   _fake_merge_statesr;      sW     '(ffhh#GQ	8{7IxqwGGGH{7IU]CCCHXr   c            )          e Zd ZdZe	 	 	 	 	 	 d*dej        dedede	e
ej                          d	e	e
ej                          d
e	e
ej                          de	e
ej                          ddfd            Zedefd            Zdej        de
ej                 ddfdZe	 	 	 	 	 	 	 	 	 	 d+de
ej                 de
ej                 de
ej                 de
ej                 dededededededede	e         d ed!e	e         d"e	e         d#e	e         d$ed%e	eeej        f                  f$d&            ZeZed'ej        d(ej        fd)            ZeZdS ),!MultiLevelCascadeAttentionWrapperaJ  Attention wrapper for memory efficient multi-level cascade inference, this API assumes all
    levels KV-Cache are stored in a unified paged table.

    Please check :ref:`cascade-inference-data-layout` for data layout in cascade inference.
    Note that it's not always beneficial to increase the number of levels because of the overhead
    of merging attention results.

    The idea of cascade inference is introduced in our `blog post <https://flashinfer.ai/2024/02/02/cascade-inference.html>`_.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> num_layers = 32
    >>> num_qo_heads = 64
    >>> num_kv_heads = 8
    >>> head_dim = 128
    >>> page_size = 16
    >>> # allocate 128MB workspace buffer
    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
    >>> wrapper = flashinfer.MultiLevelCascadeAttentionWrapper(
    ...     2, workspace_buffer, "NHD"
    ... )
    >>> batch_size = 7
    >>> shared_kv_num_pages = 512
    >>> unique_kv_num_pages = 128
    >>> total_num_pages = shared_kv_num_pages + unique_kv_num_pages
    >>> shared_kv_page_indices = torch.arange(shared_kv_num_pages).int().to("cuda:0")
    >>> shared_kv_page_indptr = torch.tensor([0, shared_kv_num_pages], dtype=torch.int32, device="cuda:0")
    >>> unique_kv_page_indices = torch.arange(shared_kv_num_pages, total_num_pages).int().to("cuda:0")
    >>> unique_kv_page_indptr = torch.tensor(
    ...     [0, 17, 29, 44, 48, 66, 100, 128], dtype=torch.int32, device="cuda:0"
    ... )
    >>> shared_kv_last_page_len = torch.tensor([page_size], dtype=torch.int32, device="cuda:0")
    >>> # 1 <= kv_last_page_len <= page_size
    >>> unique_kv_last_page_len = torch.tensor(
    ...     [1, 7, 14, 4, 3, 1, 16], dtype=torch.int32, device="cuda:0"
    ... )
    >>> kv_cache_at_layer = [
    ...     torch.randn(
    ...         total_num_pages, 2, page_size, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0"
    ...     ) for _ in range(num_layers)
    ... ]
    >>> qo_indptr_arr = [
    ...     torch.tensor([0, batch_size], dtype=torch.int32, device="cuda:0"),  # top-level for shared KV-Cache
    ...     torch.arange(batch_size + 1, dtype=torch.int32, device="cuda:0")    # bottom-level for unique KV-Cache
    ... ]
    >>> # create auxiliary data structures for batch decode attention
    >>> wrapper.plan(
    ...     qo_indptr_arr,
    ...     [shared_kv_page_indptr, unique_kv_page_indptr],
    ...     [shared_kv_page_indices, unique_kv_page_indices],
    ...     [shared_kv_last_page_len, unique_kv_last_page_len],
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ...     page_size,
    ... )
    >>> outputs = []
    >>> for i in range(num_layers):
    ...     q = torch.randn(batch_size, num_qo_heads, head_dim).half().to("cuda:0")
    ...     # compute batch decode attention, reuse auxiliary data structures for all layers
    ...     o = wrapper.run(q, kv_cache_at_layer[i])
    ...     outputs.append(o)
    ...
    >>> outputs[0].shape
    torch.Size([7, 64, 128])

    See Also
    --------
    BatchPrefillWithPagedKVCacheWrapper
    NHDFNfloat_workspace_buffer	kv_layoutuse_cuda_graphqo_indptr_buf_arrpaged_kv_indptr_buf_arrpaged_kv_indices_buf_arrpaged_kv_last_page_len_buf_arrr   c	                     || _         |r'fdt          ||||d          D             | _        n!fdt          |          D             | _        || _        | _        dS )a  Constructor of :class:`MultiLevelCascadeAttentionWrapper`.

        Parameters
        ----------
        num_levels : int
            The number of levels in the cascade attention.
        float_workspace_buffer : torch.Tensor
            The user reserved float workspace buffer used to store intermediate attention results
            in the split-k algorithm. The recommended size is 128MB, the device of the workspace
            buffer should be the same as the device of the input tensors.
        kv_layout : str
            The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
        use_cuda_graph : bool
            Whether to use CUDA graph to capture the kernels, if enabled, the auxiliary data structures
            will be stored in provided buffers.
        qo_indptr_buf_arr : Optional[List[torch.Tensor]]
            An array of qo indptr buffers for each level, the array length should be equal to
            the number of levels.
            The last element of each tensor should be the total number of queries/outputs.
        paged_kv_indptr_buf_arr : Optional[List[torch.Tensor]]
            An array of paged kv-cache indptr buffers for each level, the array length should be
            equal to the number of levels.
        paged_kv_indices_buf_arr : Optional[List[torch.Tensor]]
            An array of paged kv-cache indices buffers for each level, the array length should be
            equal to the number of levels.
        paged_kv_last_page_len_buf_arr : Optional[List[torch.Tensor]]
            An array of paged kv-cache last page length buffers for each level, the array length
            should be equal to the number of levels.
        c                 F    g | ]\  }}}}t          d ||||          S )T)rA   qo_indptr_bufpaged_kv_indptr_bufpaged_kv_indices_bufpaged_kv_last_page_len_bufr   ).0rH   rI   rJ   rK   r?   r@   s        r   
<listcomp>z>MultiLevelCascadeAttentionWrapper.__init__.<locals>.<listcomp>Q  s[     , , ,!'(. 4*#'"/(;)=/I  , , ,r   Tstrictc                 0    g | ]}t                    S r   rL   )rM   r7   r?   r@   s     r   rN   z>MultiLevelCascadeAttentionWrapper.__init__.<locals>.<listcomp>i  s4     , , , 44JIVV, , ,r   N)_use_cuda_graphzip_batch_prefill_wrappersrange_num_levels
_kv_layout)	self
num_levelsr?   r@   rA   rB   rC   rD   rE   s	     ``     r   __init__z*MultiLevelCascadeAttentionWrapper.__init__&  s    R  . 	, , , , , %+,2  , , ,D((0, , , , ,z**, , ,D( &#r   c                     | j         S r   )rR   rX   s    r   is_cuda_graph_enabledz7MultiLevelCascadeAttentionWrapper.is_cuda_graph_enabledp  s    ##r   int_workspace_buffersc                 l    t          | j        |d          D ]\  }}|                    ||           dS )a  Reset the workspace buffer.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The new float workspace buffer, the device of the new float workspace buffer should
            be the same as the device of the input tensors.

        int_workspace_buffers : List[torch.Tensor]
            The array of new int workspace buffer, the device of the new int workspace buffer should
            be the same as the device of the input tensors.
        TrO   N)rS   rT   reset_workspace_buffer)rX   r?   r^   wrapperint_workspace_buffers        r   r`   z8MultiLevelCascadeAttentionWrapper.reset_workspace_buffert  s^    " .1(*?.
 .
 .
 	Y 	Y)G) **+ACWXXXX	Y 	Yr   NONEfloat16qo_indptr_arrpaged_kv_indptr_arrpaged_kv_indices_arrpaged_kv_last_page_lennum_qo_headsnum_kv_headsr9   	page_sizecausalpos_encoding_modeuse_fp16_qk_reductionsm_scalewindow_leftlogits_soft_cap
rope_scale
rope_thetaq_data_typekv_data_typec                     t          t          | j        ||||d                    D ]B\  }\  }}}}}|                    |||||||||| j        dz
  k    r|	nd|
||||||||           CdS )a  Create auxiliary data structures for multi-level cascade attention for multiple
        forward calls within the same decode step. Please check
        :ref:`cascade-inference-data-layout` for data layout in cascade inference.

        Parameters
        ----------
        qo_indptr_arr : List[torch.Tensor]
            An array of qo indptr tensors for each level, the array length should be equal to
            the number of levels.
            The last element of each tensor should be the total number of queries/outputs.
        paged_kv_indptr_arr : List[torch.Tensor]
            An array of paged kv-cache indptr tensors for each level, the array length should be
            equal to the number of levels.
        paged_kv_indices_arr : List[torch.Tensor]
            An array of paged kv-cache indices tensors for each level, the array length should be
            equal to the number of levels.
        paged_kv_last_page_len : List[torch.Tensor]
            An array of paged kv-cache last page length tensors for each level, the array length
            should be equal to the number of levels.
        num_qo_heads : int
            The number of query/output heads.
        num_kv_heads : int
            The number of key/value heads.
        head_dim : int
            The dimension of the heads.
        page_size : int
            The page size of the paged kv-cache.
        causal : bool
            Whether to apply causal mask to the attention matrix.
            This is only effective when :attr:`custom_mask` is not provided in
            :meth:`plan`.
        pos_encoding_mode : str
            The position encoding applied inside attention kernels, could be
            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding) /``ALIBI``.
            Default is ``NONE``.
        use_fp16_qk_reduction : bool
            Whether to use f16 for qk reduction (faster at the cost of slight precision
            loss).
        window_left : int
            The left (inclusive) window size for the attention window, when set to ``-1``, the window
            size will be set to the full length of the sequence. Defaults to ``-1``.
        logits_soft_cap : Optional[float]
            The attention logits soft capping value (used in Gemini, Grok and Gemma-2, etc.), if not
            provided, will be set to ``0``. If greater than 0, the logits will be capped according to
            formula:
            :math:`\texttt{logits_soft_cap} \times \mathrm{tanh}(x / \texttt{logits_soft_cap})`,
            where :math:`x` is the input logits.
        sm_scale : Optional[float]
            The scale used in softmax, if not provided, will be set to
            ``1.0 / sqrt(head_dim)``.
        rope_scale : Optional[float]
            The scale used in RoPE interpolation, if not provided, will be set to
            ``1.0``.
        rope_theta : Optional[float]
            The theta used in RoPE, if not provided, will be set to ``1e4``.
        q_data_type : Optional[Union[str, torch.dtype]]
            The data type of the query tensor. If None, will be set to torch.float16.
        kv_data_type : Optional[Union[str, torch.dtype]]
            The data type of the key/value tensor. If None, will be set to :attr:`q_data_type`.
        TrO   r   F)
rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   N)	enumeraterS   rT   planrV   )rX   rf   rg   rh   ri   rj   rk   r9   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   ira   	qo_indptrpaged_kv_indptrpaged_kv_indicess                           r   ry   z&MultiLevelCascadeAttentionWrapper.plan  s    p ,#$&  	
 	
#	 #	 
A 
" LL &!"d&6&:!:!:vv"3&;!' /%%')%     !#	 #	r   qpaged_kv_cachec                     | j         d                             ||d          \  }}| j         dd         D ]/}|                    ||d          \  }}t          ||||           0|S )a  Compute multi-level cascade attention.

        Parameters
        ----------
        q : torch.Tensor
            The query tensor, shape: ``[batch_size, num_qo_heads, head_dim]``.
        paged_kv_cache : Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            The paged KV-Cache stored as a tuple of tensors or a single tensor:

            * a tuple ``(k_cache, v_cache)`` of 4-D tensors, each with shape:
              ``[max_num_pages, page_size, num_kv_heads, head_dim]`` if :attr:`kv_layout` is ``NHD``,
              and ``[max_num_pages, num_kv_heads, page_size, head_dim]`` if :attr:`kv_layout` is ``HND``.

            * a single 5-D tensor with shape:
              ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
              :attr:`kv_layout` is ``NHD``, and
              ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
              :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
              ``paged_kv_cache[:, 1]`` is the value-cache.
        rd   T)
return_lseN)rT   runr,   )rX   r~   r   outlsera   out_ilse_is           r   r   z%MultiLevelCascadeAttentionWrapper.run  s    4 /377 8 
 
S
 3CRC8 	9 	9G";;q.T;JJLE5 c5%8888
r   )r>   FNNNN)
Frc   FNrd   NNNre   N)__name__
__module____qualname____doc__r   r   Tensorstrboolr   r   rZ   propertyr]   r`   intfloatr   r1   ry   begin_forwardr   forwardr   r   r   r=   r=      s       G GR 
 $:>@DAEGKG$ G$ !&G$ 	G$
 G$ $D$67G$ "*$u|*<!=G$ #+4+=">G$ )1el1C(DG$ 
G$ G$ G$ ^G$R $t $ $ $ X$Y %Y  $EL1Y 
	Y Y Y Y,  !'&+$(+/&*&*$:>'t tEL)t "%,/t #5<0	t
 !%U\ 2t t t t t t t  $t 5/t t "%t  UO!t" UO#t$ %t& uS%+%567't t t ^tl M"<" " " " ^"H GGGr   r=   c                   2   e Zd ZdZe	 ddej        deddfd            Zdej        ddfdZ	e	 dd
ej        dej        dej        de
de
de
de
deddfd            Zedej        dej        dej        dej        dej        f
d            Zedd            ZdS ).BatchDecodeWithSharedPrefixPagedKVCacheWrappera  Wrapper class for decode attention with shared-prefix paged kv-cache for batch
    of requests. The shared-prefix KV-Cache was stored in a standalone tensors, and the
    unique KV-Cache of each request was stored in a paged KV-Cache data structure.

    Check :ref:`our tutorial<kv-layout>` for page table layout.

    Warning
    -------
    This API will be deprecated in the future, please use
    :class:`MultiLevelCascadeAttentionWrapper` instead.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> num_layers = 32
    >>> num_qo_heads = 64
    >>> num_kv_heads = 8
    >>> head_dim = 128
    >>> max_num_pages = 128
    >>> page_size = 16
    >>> # allocate 128MB workspace buffer
    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
    >>> wrapper = flashinfer.BatchDecodeWithSharedPrefixPagedKVCacheWrapper(
    ...     workspace_buffer, "NHD"
    ... )
    >>> batch_size = 7
    >>> shared_prefix_len = 8192
    >>> unique_kv_page_indices = torch.arange(max_num_pages).int().to("cuda:0")
    >>> unique_kv_page_indptr = torch.tensor(
    ...     [0, 17, 29, 44, 48, 66, 100, 128], dtype=torch.int32, device="cuda:0"
    ... )
    >>> # 1 <= kv_last_page_len <= page_size
    >>> unique_kv_last_page_len = torch.tensor(
    ...     [1, 7, 14, 4, 3, 1, 16], dtype=torch.int32, device="cuda:0"
    ... )
    >>> unique_kv_cache_at_layer = [
    ...     torch.randn(
    ...         max_num_pages, 2, page_size, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0"
    ...     ) for _ in range(num_layers)
    ... ]
    >>> shared_k_data_at_layer = [
    ...     torch.randn(
    ...         shared_prefix_len, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0"
    ...     ) for _ in range(num_layers)
    ... ]
    >>> shared_v_data_at_layer = [
    ...     torch.randn(
    ...         shared_prefix_len, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0"
    ...     ) for _ in range(num_layers)
    ... ]
    >>> # create auxiliary data structures for batch decode attention
    >>> wrapper.begin_forward(
    ...     unique_kv_page_indptr,
    ...     unique_kv_page_indices,
    ...     unique_kv_last_page_len,
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ...     page_size,
    ...     data_type=torch.float16
    ... )
    >>> outputs = []
    >>> for i in range(num_layers):
    ...     q = torch.randn(batch_size, num_qo_heads, head_dim).half().to("cuda:0")
    ...     k_shared = shared_k_data_at_layer[i]
    ...     v_shared = shared_v_data_at_layer[i]
    ...     unique_kv_cache = unique_kv_cache_at_layer[i]
    ...     # compute batch decode attention, reuse auxiliary data structures for all layers
    ...     o = wrapper.forward(q, k_shared, v_shared, unique_kv_cache)
    ...     outputs.append(o)
    ...
    >>> outputs[0].shape
    torch.Size([7, 64, 128])

    Note
    ----
    To accelerate computation, FlashInfer's shared prefix batch decode attention creates
    some auxiliary data structures, these data structures can be reused across multiple
    batch decode attention calls (e.g. different Transformer layers). This wrapper class
    manages the lifecycle of these data structures.
    r>   r?   r@   r   Nc                 >    t          ||          | _        || _        d S r   )r	   _batch_decode_wrapperrW   rX   r?   r@   s      r   rZ   z7BatchDecodeWithSharedPrefixPagedKVCacheWrapper.__init__  s)     &H"I&
 &
" $r   c                 <    | j                             ||           dS a  Reset the workspace buffer.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The new float workspace buffer, the device of the new float workspace buffer should
            be the same as the device of the input tensors.

        int_workspace_buffer : torch.Tensor
            The new int workspace buffer, the device of the new int workspace buffer should
            be the same as the device of the input tensors.
        N)r   r`   rX   r?   rb   s      r   r`   zEBatchDecodeWithSharedPrefixPagedKVCacheWrapper.reset_workspace_buffer  s/     	"99"$8	
 	
 	
 	
 	
r   re   unique_kv_indptrunique_kv_indicesunique_kv_last_page_lenrj   rk   r9   rl   	data_typec	                 L    | j                             |||||||d|	  	         dS )a  Plan shared-prefix batch decode attention for given problem specification.

        Parameters
        ----------
        indptr : torch.Tensor
            The indptr of the paged kv cache, shape: ``[batch_size + 1]``
        indices : torch.Tensor
            The page indices of the paged kv cache, shape: ``[qo_indptr[-1]]``
        last_page_len : torch.Tensor
            The number of entries in the last page of each request in the paged kv
            cache, shape: ``[batch_size]``
        num_qo_heads : int
            The number of query/output heads
        num_kv_heads : int
            The number of key/value heads
        head_dim : int
            The dimension of the heads
        page_size : int
            The page size of the paged kv cache
        data_type : Union[str, torch.dtype]
            The data type of the paged kv cache

        Note
        ----
        The :meth:`begin_forward` method should be called before any :meth:`forward` or
        :meth:`forward_return_lse` calls,
        auxiliary data structures will be created during this call and cached for
        multiple forward calls.

        The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads``
        is not equal to ``num_kv_heads``, the function will use
        `grouped query attention <https://arxiv.org/abs/2305.13245>`_.


        See Also
        --------
        MultiLevelCascadeAttentionWrapper
        rc   )rn   r   N)r   r   )	rX   r   r   r   rj   rk   r9   rl   r   s	            r   r   z<BatchDecodeWithSharedPrefixPagedKVCacheWrapper.begin_forward  sJ    d 	"00#$ 	1 
	
 
	
 
	
 
	
 
	
r   r~   k_sharedv_sharedunique_kv_cachec                     t          |||dd| j        | j        j        | j        j        | j        j        d
  
        \  }}| j                            ||d          \  }}t          ||||           |S )a  Compute batch decode attention between queries and shared-prefix paged
        kv-cache.

        Parameters
        ----------
        q : torch.Tensor
            The query tensor, shape: ``[batch_size, num_qo_heads, head_dim]``.
        k_shared : torch.Tensor
            The shared prefix key tensor, shape:
            ``[shared_prefix_len, num_kv_heads, head_dim]`` if :attr:`kv_layout` is
            ``NHD``, or ``[num_kv_heads, shared_prefix_len, head_dim]`` if
            :attr:`kv_layout` is ``HND``.
        v_shared : torch.Tensor
            The shared prefix value tensor, shape:
            ``[shared_prefix_len, num_kv_heads, head_dim]`` if :attr:`kv_layout` is
            ``NHD``, or ``[num_kv_heads, shared_prefix_len, head_dim]`` if
            :attr:`kv_layout` is ``HND``.
        unique_kv_cache : Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            The request-independent suffix paged KV-Cache stored as a tuple of tensors or a single tensor:

            * a tuple ``(k_cache, v_cache)`` of 4-D tensors, each with shape:
              ``[max_num_pages, page_size, num_kv_heads, head_dim]`` if :attr:`kv_layout` is ``NHD``,
              and ``[max_num_pages, num_kv_heads, page_size, head_dim]`` if :attr:`kv_layout` is ``HND``.

            * a single 5-D tensor with shape:
              ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
              :attr:`kv_layout` is ``NHD``, and
              ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
              :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
              ``paged_kv_cache[:, 1]`` is the value-cache.

        Returns
        -------
        V : torch.Tensor
            The attention output, shape: ``[batch_size, num_heads, head_dim]``
        Frc   T)rm   rn   r@   rp   rs   rt   r   )rn   )r   rW   r   	_sm_scale_rope_scale_rope_thetaforward_return_lser,   )	rX   r~   r   r   r   V_sharedS_sharedV_uniqueS_uniques	            r   r   z6BatchDecodeWithSharedPrefixPagedKVCacheWrapper.forward  s    X :$o/91=1=
 
 
( "7JJ$ K 
 
(
 	Xx8DDDr   c                     dS z6Warning: this function is deprecated and has no effectNr   r\   s    r   end_forwardz:BatchDecodeWithSharedPrefixPagedKVCacheWrapper.end_forward  	     	r   r>   )re   r   N)r   r   r   r   r   r   r   r   rZ   r`   r   r   r   r   r   r   r   r   r   +  s       Q Qf EJ$ $&+l$?B$	$ $ $ ^$
&+l
	
 
 
 
&  #;
 ;
,;
 !<;
 "'	;

 ;
 ;
 ;
 ;
 ;
 
;
 ;
 ;
 ^;
z =<= ,= ,	=
 = 
= = = ^=~    ^  r   r   c                      e Zd ZdZe	 ddej        deddfd            Zdej        dej        ddfd	Z	ed
ej        dej        dej        dej        de
de
de
de
ddfd            Ze	 	 	 	 	 d dej        dej        dej        dej        dededee         dee         dee         dej        fd            Zed!d            ZdS )"/BatchPrefillWithSharedPrefixPagedKVCacheWrappera,  Wrapper class for prefill/append attention with shared-prefix paged kv-cache for
    batch of requests.

    Check :ref:`our tutorial<kv-layout>` for paged kv-cache layout.

    Warning
    -------
    This API will be deprecated in the future, please use
    :class:`MultiLevelCascadeAttentionWrapper` instead.

    Example
    -------
    >>> import torch
    >>> import flashinfer
    >>> num_layers = 32
    >>> num_qo_heads = 64
    >>> num_kv_heads = 16
    >>> head_dim = 128
    >>> max_num_pages = 128
    >>> page_size = 16
    >>> # allocate 128MB workspace buffer
    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
    >>> prefill_wrapper = flashinfer.BatchPrefillWithSharedPrefixPagedKVCacheWrapper(
    ...     workspace_buffer, "NHD"
    ... )
    >>> batch_size = 7
    >>> shared_prefix_len = 8192
    >>> nnz_qo = 100
    >>> qo_indptr = torch.tensor(
    ...     [0, 33, 44, 55, 66, 77, 88, nnz_qo], dtype=torch.int32, device="cuda:0"
    ... )
    >>> paged_kv_indices = torch.arange(max_num_pages).int().to("cuda:0")
    >>> paged_kv_indptr = torch.tensor(
    ...     [0, 17, 29, 44, 48, 66, 100, 128], dtype=torch.int32, device="cuda:0"
    ... )
    >>> # 1 <= paged_kv_last_page_len <= page_size
    >>> paged_kv_last_page_len= torch.tensor(
    ...     [1, 7, 14, 4, 3, 1, 16], dtype=torch.int32, device="cuda:0"
    ... )
    >>> kv_cache_at_layer = [
    ...     torch.randn(
    ...         max_num_pages, 2, page_size, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0"
    ...     ) for _ in range(num_layers)
    ... ]
    >>> shared_k_data_at_layer = [
    ...     torch.randn(
    ...         shared_prefix_len, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0"
    ...     ) for _ in range(num_layers)
    ... ]
    >>> shared_v_data_at_layer = [
    ...     torch.randn(
    ...         shared_prefix_len, num_kv_heads, head_dim, dtype=torch.float16, device="cuda:0"
    ...     ) for _ in range(num_layers)
    ... ]
    >>> # create auxiliary data structures for batch prefill attention
    >>> prefill_wrapper.begin_forward(
    ...     qo_indptr,
    ...     paged_kv_indptr,
    ...     paged_kv_indices,
    ...     paged_kv_last_page_len,
    ...     num_qo_heads,
    ...     num_kv_heads,
    ...     head_dim,
    ...     page_size,
    ... )
    >>> outputs = []
    >>> for i in range(num_layers):
    ...     q = torch.randn(nnz_qo, num_qo_heads, head_dim).half().to("cuda:0")
    ...     kv_cache = kv_cache_at_layer[i]
    ...     k_shared = shared_k_data_at_layer[i]
    ...     v_shared = shared_v_data_at_layer[i]
    ...     # compute batch prefill attention, reuse auxiliary data structures
    ...     o = prefill_wrapper.forward(
    ...         q, k_shared, v_shared, kv_cache, causal=True
    ...     )
    ...     outputs.append(o)
    ...
    s[0].shape>>> # clear auxiliary data structures
    >>> prefill_wrapper.end_forward()
    >>> outputs[0].shape
    torch.Size([100, 64, 128])

    Note
    ----
    To accelerate computation, FlashInfer's shared-prefix batch prefill/append attention
    operators creates some auxiliary data structures, these data structures can be
    reused across multiple prefill/append attention calls (e.g. different Transformer
    layers). This wrapper class manages the lifecycle of these data structures.
    r>   r?   r@   r   Nc                 >    t          ||          | _        || _        dS )a  Constructor of :class:`BatchDecodeWithSharedPrefixPagedKVCacheWrapper`.

        Parameters
        ----------
        float_workspace_buffer : torch.Tensor
            The user reserved float workspace buffer used to store intermediate attention results
            in the split-k algorithm. The recommended size is 128MB, the device of the workspace
            buffer should be the same as the device of the input tensors.
        kv_layout : str
            The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
        N)r   _batch_prefill_wrapperrW   r   s      r   rZ   z8BatchPrefillWithSharedPrefixPagedKVCacheWrapper.__init__z  s)     'J"I'
 '
# $r   rb   c                 <    | j                             ||           dS r   )r   r`   r   s      r   r`   zFBatchPrefillWithSharedPrefixPagedKVCacheWrapper.reset_workspace_buffer  s/     	#::"$8	
 	
 	
 	
 	
r   r{   r|   r}   ri   rj   rk   r9   rl   c	           
      H    | j                             ||||||||           dS )a  Create auxiliary data structures for shared-prefix batch prefill/append
        attention for multiple forward calls within the same prefill/append step.

        Parameters
        ----------
        qo_indptr : torch.Tensor
            The indptr of the query/output tensor, shape: ``[batch_size + 1]``.
        paged_kv_indptr : torch.Tensor
            The indptr of the paged kv-cache, shape: ``[batch_size + 1]``.
        paged_kv_indices : torch.Tensor
            The page indices of the paged kv-cache, shape: ``[qo_indptr[-1]]``.
        paged_kv_last_page_len : torch.Tensor
            The number of entries in the last page of each request in the paged
            kv-cache, shape: ``[batch_size]``.
        num_qo_heads : int
            The number of query/output heads.
        num_kv_heads : int
            The number of key/value heads.
        head_dim : int
            The dimension of the heads.
        page_size : int
            The page size of the paged kv-cache.

        Note
        ----
        The :meth:`begin_forward` method should be called before any :meth:`forward`
        or :meth:`forward_return_lse` calls, auxiliary data structures will be created
        during this call and cached for multiple forward calls.

        The ``num_qo_heads`` must be a multiple of ``num_kv_heads``. If ``num_qo_heads``
        is not equal to ``num_kv_heads``, the function will use
        `grouped query attention <https://arxiv.org/abs/2305.13245>`_.
        N)r   r   )	rX   r{   r|   r}   ri   rj   rk   r9   rl   s	            r   r   z=BatchPrefillWithSharedPrefixPagedKVCacheWrapper.begin_forward  sB    Z 	#11"		
 		
 		
 		
 		
r   Fr~   r   r   r   rm   ro   rp   rs   rt   c
                     t          |||dd| j        ||||	d          \  }
}| j                            |||d||||	          \  }}t	          |
|||           |
S )a	  Compute batch prefill/append attention between query and shared-prefix paged
        kv-cache.

        Parameters
        ----------
        q : torch.Tensor
            The query tensor, shape: ``[qo_indptr[-1], num_qo_heads, head_dim]``.
        k_shared : torch.Tensor
            The shared prefix key tensor, shape:
            ``[shared_prefix_len, num_kv_heads, head_dim]`` if :attr:`kv_layout` is
            ``NHD``, or ``[num_kv_heads, shared_prefix_len, head_dim]`` if
            :attr:`kv_layout` is ``HND``.
        v_shared ; torch.Tensor
            The shared prefix value tensor, shape:
            ``[shared_prefix_len, num_kv_heads, head_dim]`` if :attr:`kv_layout` is
            ``NHD``, or ``[num_kv_heads, shared_prefix_len, head_dim]`` if
            :attr:`kv_layout` is ``HND``.
        unique_kv_cache : Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
            The request-independent suffix paged KV-Cache stored as a tuple of tensors or a single tensor:

            * a tuple ``(k_cache, v_cache)`` of 4-D tensors, each with shape:
              ``[max_num_pages, page_size, num_kv_heads, head_dim]`` if :attr:`kv_layout` is ``NHD``,
              and ``[max_num_pages, num_kv_heads, page_size, head_dim]`` if :attr:`kv_layout` is ``HND``.

            * a single 5-D tensor with shape:
              ``[max_num_pages, 2, page_size, num_kv_heads, head_dim]`` if
              :attr:`kv_layout` is ``NHD``, and
              ``[max_num_pages, 2, num_kv_heads, page_size, head_dim]`` if
              :attr:`kv_layout` is ``HND``. Where ``paged_kv_cache[:, 0]`` is the key-cache and
              ``paged_kv_cache[:, 1]`` is the value-cache.

        causal : bool
            Whether to apply causal mask on the attention matrix.
        use_fp16_qk_reduction : bool
            Whether to use f16 for qk reduction (faster at the cost of slight precision
            loss).
        sm_scale : Optional[float]
            The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
        rope_scale : Optional[float]
            The scale used in RoPE interpolation, if not provided, will be set to
            ``1.0``.
        rope_theta : Optional[float]
            The theta used in RoPE, if not provided, will be set to ``1e4``.

        Returns
        -------
        V : torch.Tensor
            The attention output, shape: ``[qo_indptr[-1], num_heads, head_dim]``.

        See Also
        --------
        MultiLevelCascadeAttentionWrapper
        Frc   T)rm   rn   r@   ro   rp   rs   rt   r   )rm   rn   ro   rp   rs   rt   )r   rW   r   r   r,   )rX   r~   r   r   r   rm   ro   rp   rs   rt   r   r   r   r   s                 r   r   z7BatchPrefillWithSharedPrefixPagedKVCacheWrapper.forward  s    D :$o"7!!
 
 
( "8KK$"7!! L 	
 	
( 	Xx8DDDr   c                     dS r   r   r\   s    r   r   z;BatchPrefillWithSharedPrefixPagedKVCacheWrapper.end_forward5  r   r   r   )FFNNNr   )r   r   r   r   r   r   r   r   rZ   r`   r   r   r   r   r   r   r   r   r   r   r   r     s       X Xt EJ$ $&+l$?B$	$ $ $ ^$&
&+l
JO,
	
 
 
 
& 5
<5
 5
  ,	5

 !&5
 5
 5
 5
 5
 
5
 5
 5
 ^5
n  &+$(&*&*Y Y<Y ,Y ,	Y
 Y Y  $Y 5/Y UOY UOY 
Y Y Y ^Yv    ^  r   r   r   ) r   	functoolstypingr   r   r   r   r   api_loggingr   decoder	   jit.cascader
   prefillr   r   utilsr   r   cacher   r   r!   r'   r,   r/   r5   r;   r=   r   r   r   r   r   <module>r      s&         / / / / / / / / / / / /  ' ' ' ' ' ' 6 6 6 6 6 6 + + + + + + V V V V V V V V 7 7 7 7 7 7 7 7 1 1 1 -B???5	5!L5/4|5BG,5
5<%&5 5 5 @? 5p +,,	!L/4|BG,
5<%&   -, 6ZPPP $(-L -L|-L|-L \-L \	-L
 5<
 -L 
-L -L -L QP -L` 455 $(	 	|	|	 \	 \		
 5<
 	 
	 	 	 65	 .R@@@+EL +U\ +eEL%,<V6W + + + A@ +\ ,--|
5<%&   .-L L L L L L L L^
q q q q q q q qhY Y Y Y Y Y Y Y Y Yr   